Fix buffered/direct/mmap I/O race

When a page is faulted in for memory mapped I/O, the page lock
may be dropped before it has been read and marked up to date.
If a buffered read encounters such a page in mappedread(), it
must wait until the page is up to date. Failing to do so results
in a panic on debug builds and in incorrect data on production
builds.
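
In outline, the required handling looks like the sketch below. This
is a simplified illustration built on the generic Linux page cache
helpers, not the patch itself; fill_page() is a hypothetical stand-in
for zfs_fillpage().

    #include <linux/errno.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* Hypothetical: copy one mapped page, filling it first if needed. */
    static int read_mapped_page(struct address_space *mp, pgoff_t index)
    {
            /* Returns the page locked and referenced, or NULL. */
            struct page *pp = find_lock_page(mp, index);
            if (pp == NULL)
                    return (-ENOENT);  /* no mapped page, read via the DMU */

            /*
             * A retried fault may have unlocked the page before it was
             * marked up to date, so it cannot be assumed valid here.
             */
            if (unlikely(!PageUptodate(pp))) {
                    int error = fill_page(pp);  /* hypothetical helper */
                    if (error) {
                            unlock_page(pp);
                            put_page(pp);
                            return (error);
                    }
            }

            /* ... safe to copy data out of the now up-to-date page ... */
            unlock_page(pp);
            put_page(pp);
            return (0);
    }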

The critical part of this change is in mappedread(), where pages
that are not up to date are now handled. Additionally, the change
includes the following simplifications.

- zfs_getpage() and zfs_fillpage() could be passed an array of
  pages. This could have been more efficient had it been used, but
  in practice only a single page was ever provided. Both interfaces
  were therefore simplified to take a single page.

- update_pages() was modified to correctly set the PG_error bit
  on a page when it cannot be read by dmu_read() (see the sketch
  after this list).

- Setting PG_error and PG_uptodate was moved from
  zpl_readpage_common() to zfs_fillpage(). This is consistent with
  the handling in update_pages() and mappedread().

- Minor additional refactoring of comments and variable
  declarations to improve readability.

- Add a test case to exercise concurrent buffered, direct,
  and mmap IO to the same file.

- Reduce the mmap_sync test case default run time.
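
As referenced above, the PG_error and PG_uptodate updates now follow
a single convention wherever a page is filled from the DMU. A minimal
sketch of that convention, assuming error holds the dmu_read() return
value (the helper name is hypothetical, not from the patch):

    #include <linux/page-flags.h>

    /* Hypothetical: record the outcome of filling a page from the DMU. */
    static void page_read_done(struct page *pp, int error)
    {
            if (error) {
                    SetPageError(pp);        /* set PG_error */
                    ClearPageUptodate(pp);   /* contents are not valid */
            } else {
                    ClearPageError(pp);
                    SetPageUptodate(pp);     /* contents may be copied */
            }
    }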

Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #13608 
Closes #14498
Commit 89cd2197b9 (parent 7cb67d627c)
Brian Behlendorf, 2023-02-23 10:57:24 -08:00, committed by GitHub
7 changed files with 174 additions and 104 deletions

File 1 of 7:

@@ -72,7 +72,7 @@ extern void zfs_inactive(struct inode *ip);
 extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr);
 extern int zfs_fid(struct inode *ip, fid_t *fidp);
-extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
+extern int zfs_getpage(struct inode *ip, struct page *pp);
 extern int zfs_putpage(struct inode *ip, struct page *pp,
     struct writeback_control *wbc, boolean_t for_sync);
 extern int zfs_dirty_inode(struct inode *ip, int flags);

File 2 of 7:

@@ -220,43 +220,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
 }
 
 #if defined(_KERNEL)
+
+static int zfs_fillpage(struct inode *ip, struct page *pp);
+
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages.  What this means:
- *
- * On Write:	If we find a memory mapped page, we write to *both*
- *		the page and the dmu buffer.
+ * between the DMU cache and the memory mapped pages.  Update all mapped
+ * pages with the contents of the corresponding dmu buffer.
  */
 void
 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 {
-	struct inode *ip = ZTOI(zp);
-	struct address_space *mp = ip->i_mapping;
-	struct page *pp;
-	uint64_t nbytes;
-	int64_t	off;
-	void *pb;
+	struct address_space *mp = ZTOI(zp)->i_mapping;
+	int64_t off = start & (PAGE_SIZE - 1);
 
-	off = start & (PAGE_SIZE-1);
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
-		nbytes = MIN(PAGE_SIZE - off, len);
+		uint64_t nbytes = MIN(PAGE_SIZE - off, len);
 
-		pp = find_lock_page(mp, start >> PAGE_SHIFT);
+		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
-			pb = kmap(pp);
-			(void) dmu_read(os, zp->z_id, start + off, nbytes,
-			    pb + off, DMU_READ_PREFETCH);
+			void *pb = kmap(pp);
+			int error = dmu_read(os, zp->z_id, start + off,
+			    nbytes, pb + off, DMU_READ_PREFETCH);
 			kunmap(pp);
 
-			if (mapping_writably_mapped(mp))
-				flush_dcache_page(pp);
+			if (error) {
+				SetPageError(pp);
+				ClearPageUptodate(pp);
+			} else {
+				ClearPageError(pp);
+				SetPageUptodate(pp);
 
-			mark_page_accessed(pp);
-			SetPageUptodate(pp);
-			ClearPageError(pp);
+				if (mapping_writably_mapped(mp))
+					flush_dcache_page(pp);
+
+				mark_page_accessed(pp);
+			}
+
 			unlock_page(pp);
 			put_page(pp);
 		}
@@ -267,38 +270,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 }
 
 /*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages.  What this means:
- *
- * On Read:	We "read" preferentially from memory mapped pages,
- *		else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- *	 the file is memory mapped.
+ * When a file is memory mapped, we must keep the I/O data synchronized
+ * between the DMU cache and the memory mapped pages.  Preferentially read
+ * from memory mapped pages, otherwise fall back to reading through the dmu.
  */
 int
 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 {
 	struct inode *ip = ZTOI(zp);
 	struct address_space *mp = ip->i_mapping;
-	struct page *pp;
-	int64_t	start, off;
-	uint64_t bytes;
+	int64_t start = uio->uio_loffset;
+	int64_t off = start & (PAGE_SIZE - 1);
 	int len = nbytes;
 	int error = 0;
-	void *pb;
 
-	start = uio->uio_loffset;
-	off = start & (PAGE_SIZE-1);
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
-		bytes = MIN(PAGE_SIZE - off, len);
+		uint64_t bytes = MIN(PAGE_SIZE - off, len);
 
-		pp = find_lock_page(mp, start >> PAGE_SHIFT);
+		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
-			ASSERT(PageUptodate(pp));
+			/*
+			 * If filemap_fault() retries there exists a window
+			 * where the page will be unlocked and not up to date.
+			 * In this case we must try and fill the page.
+			 */
+			if (unlikely(!PageUptodate(pp))) {
+				error = zfs_fillpage(ip, pp);
+				if (error) {
+					unlock_page(pp);
+					put_page(pp);
+					return (error);
+				}
+			}
+
+			ASSERT(PageUptodate(pp) || PageDirty(pp));
+
 			unlock_page(pp);
 
-			pb = kmap(pp);
+			void *pb = kmap(pp);
 			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 			kunmap(pp);
@@ -314,9 +323,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 		len -= bytes;
 		off = 0;
+
 		if (error)
 			break;
 	}
+
 	return (error);
 }
 
 #endif /* _KERNEL */
@@ -3959,55 +3970,41 @@ zfs_inactive(struct inode *ip)
  * Fill pages with data from the disk.
  */
 static int
-zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+zfs_fillpage(struct inode *ip, struct page *pp)
 {
-	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
-	objset_t *os;
-	struct page *cur_pp;
-	u_offset_t io_off, total;
-	size_t io_len;
-	loff_t i_size;
-	unsigned page_idx;
-	int err;
-
-	os = zfsvfs->z_os;
-	io_len = nr_pages << PAGE_SHIFT;
-	i_size = i_size_read(ip);
-	io_off = page_offset(pl[0]);
+	loff_t i_size = i_size_read(ip);
+	u_offset_t io_off = page_offset(pp);
+	size_t io_len = PAGE_SIZE;
 
 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;
 
-	/*
-	 * Iterate over list of pages and read each page individually.
-	 */
-	page_idx = 0;
-	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
-		caddr_t va;
-
-		cur_pp = pl[page_idx++];
-		va = kmap(cur_pp);
-		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
-		    DMU_READ_PREFETCH);
-		kunmap(cur_pp);
-		if (err) {
-			/* convert checksum errors into IO errors */
-			if (err == ECKSUM)
-				err = SET_ERROR(EIO);
-			return (err);
-		}
+	void *va = kmap(pp);
+	int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+	    PAGE_SIZE, va, DMU_READ_PREFETCH);
+	kunmap(pp);
+
+	if (error) {
+		/* convert checksum errors into IO errors */
+		if (error == ECKSUM)
+			error = SET_ERROR(EIO);
+
+		SetPageError(pp);
+		ClearPageUptodate(pp);
+	} else {
+		ClearPageError(pp);
+		SetPageUptodate(pp);
 	}
 
-	return (0);
+	return (error);
 }
 /*
- * Uses zfs_fillpage to read data from the file and fill the pages.
+ * Uses zfs_fillpage to read data from the file and fill the page.
  *
  *	IN:	ip	 - inode of file to get data from.
- *		pl	 - list of pages to read
- *		nr_pages - number of pages to read
+ *		pp	 - page to read
  *
  *	RETURN:	0 on success, error code on failure.
  *
@@ -4015,24 +4012,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
  *	vp - atime updated
  */
 int
-zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+zfs_getpage(struct inode *ip, struct page *pp)
 {
-	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
-	int err;
+	znode_t *zp = ITOZ(ip);
+	int error;
 
-	if (pl == NULL)
-		return (0);
+	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+		return (error);
 
-	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
-		return (err);
-
-	err = zfs_fillpage(ip, pl, nr_pages);
-	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
+	error = zfs_fillpage(ip, pp);
+	if (error == 0)
+		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
 
 	zfs_exit(zfsvfs, FTAG);
 
-	return (err);
+	return (error);
 }
 
 /*

File 3 of 7:

@@ -657,29 +657,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
 static inline int
 zpl_readpage_common(struct page *pp)
 {
-	struct inode *ip;
-	struct page *pl[1];
-	int error = 0;
 	fstrans_cookie_t cookie;
 
 	ASSERT(PageLocked(pp));
-	ip = pp->mapping->host;
-	pl[0] = pp;
 
 	cookie = spl_fstrans_mark();
-	error = -zfs_getpage(ip, pl, 1);
+	int error = -zfs_getpage(pp->mapping->host, pp);
 	spl_fstrans_unmark(cookie);
 
-	if (error) {
-		SetPageError(pp);
-		ClearPageUptodate(pp);
-	} else {
-		ClearPageError(pp);
-		SetPageUptodate(pp);
-		flush_dcache_page(pp);
-	}
-
 	unlock_page(pp);
+
 	return (error);
 }

File 4 of 7:

@@ -687,7 +687,8 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
 tags = ['functional', 'migration']
 
 [tests/functional/mmap]
-tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos']
+tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
+    'mmap_sync_001_pos', 'mmap_write_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]

File 5 of 7:

@@ -59,7 +59,7 @@ main(int argc, char *argv[])
 		return (1);
 	}
 
-	int run_time_mins = 5;
+	int run_time_mins = 1;
 	if (argc >= 2) {
 		run_time_mins = atoi(argv[1]);
 	}

File 6 of 7:

@@ -1491,6 +1491,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/migration/setup.ksh \
 	functional/mmap/cleanup.ksh \
 	functional/mmap/mmap_libaio_001_pos.ksh \
+	functional/mmap/mmap_mixed.ksh \
 	functional/mmap/mmap_read_001_pos.ksh \
 	functional/mmap/mmap_seek_001_pos.ksh \
 	functional/mmap/mmap_sync_001_pos.ksh \

File 7 of 7 (new file):

@@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/mmap/mmap.cfg
#
# DESCRIPTION:
# Verify mixed buffered and mmap IO.
#
# STRATEGY:
# 1. Create an empty file.
# 2. Start a background buffered read/write fio to the file.
# 3. Start a background mmap read/write fio to the file.
#
verify_runnable "global"
function cleanup
{
	log_must rm -f "$tmp_file"
}
log_assert "Verify mixed buffered and mmap IO"
log_onexit cleanup
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
tmp_file=$mntpnt/file
bs=$((128 * 1024))
blocks=64
size=$((bs * blocks))
runtime=60
log_must dd if=/dev/zero of=$tmp_file bs=$bs count=$blocks
# Buffered IO writes
log_must eval "fio --filename=$tmp_file --name=buffer-write \
    --rw=randwrite --size=$size --bs=$bs --direct=0 --numjobs=1 \
    --ioengine=sync --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &"

# Buffered IO reads
log_must eval "fio --filename=$tmp_file --name=buffer-read \
    --rw=randread --size=$size --bs=$bs --direct=0 --numjobs=1 \
    --ioengine=sync --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &"

# mmap IO writes
log_must eval "fio --filename=$tmp_file --name=mmap-write \
    --rw=randwrite --size=$size --bs=$bs --numjobs=1 \
    --ioengine=mmap --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &"

# mmap IO reads
log_must eval "fio --filename=$tmp_file --name=mmap-read \
    --rw=randread --size=$size --bs=$bs --numjobs=1 \
    --ioengine=mmap --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &"
log_must wait
log_pass "Verified mixed buffered and mmap IO"