read-behind / read-ahead support for zfs_getpages()

ZFS caches the blocks it reads in its ARC, so in general the optional
pages are not as useful as they are for filesystems that read data
directly into the target pages.  Still, filling the optional pages
reduces the number of page faults and the associated VM / VFS / ZFS
calls.
Another case that gets optimized, as a side effect, is paging in from a
hole.  ZFS DMU does not currently provide a convenient API to check for
a hole; instead, it creates a temporary zero-filled block and allows
accessing it as if it were a normal data block.  Getting multiple pages
from a hole one by one therefore results in repeated creation and
destruction of that temporary block (and of an associated ARC header).
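
As an illustration of how the read-behind / read-ahead window is sized,
here is a small userspace sketch (not ZFS code; the macro definitions,
sample block size, file size, and offsets are assumptions made up for
the example).  It mirrors the pgsin_b / pgsin_a clamping done in
zfs_getpages() below: the requested page run is extended to the
enclosing block boundaries and then clamped by the file size and by the
budget offered by the pager.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel helpers used by zfs_getpages(). */
#define PAGE_SIZE       4096ULL
#define OFF_TO_IDX(off) ((off) / PAGE_SIZE)
#define rounddown(x, y) (((x) / (y)) * (y))
#define roundup(x, y)   ((((x) + (y) - 1) / (y)) * (y))
#define round_page(x)   roundup((x), PAGE_SIZE)
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        uint64_t blksz = 128 * 1024;            /* dataset block size */
        uint64_t obj_size = 240 * 1024;         /* file size, not block-aligned */
        uint64_t start = 200 * 1024;            /* offset of the first requested page */
        uint64_t end = start + 4 * PAGE_SIZE;   /* one past the last requested page */
        uint64_t rbehind = 16, rahead = 16;     /* budget offered by the pager */
        uint64_t pgsin_b, pgsin_a;

        /* Read-behind: pages between the block start and the first requested page. */
        pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
        pgsin_b = MIN(rbehind, pgsin_b);

        /* Read-ahead: pages up to the block end, clamped by the end of the file. */
        pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
        if (end + pgsin_a * PAGE_SIZE >= obj_size)
                pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
        pgsin_a = MIN(rahead, pgsin_a);

        /* Prints: read-behind 16 pages, read-ahead 6 pages. */
        printf("read-behind %ju pages, read-ahead %ju pages\n",
            (uintmax_t)pgsin_b, (uintmax_t)pgsin_a);
        return (0);
}

With a 128 KB block and a fault near the end of a 240 KB file, the
window is cut short by the pager's budget on the read-behind side and by
the file size on the read-ahead side.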

Tested with fsx using various supported block sizes from 512 bytes
to 128 KB, and additionally 1 MB.

Please note that illumos and ZoL do not take the range lock in the
page-in path.  That is because ZFS has a double-caching problem between
the ARC and the page cache, which requires zfs_read() and zfs_write()
to consult pages in the page cache.  In those functions they first lock
a range and then lock the pages corresponding to that range, while in
the page-in (and maybe page-out) path they would first lock the pages
and then the range.  So, they would deadlock.
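
To make that lock-order inversion concrete, here is a minimal userspace
sketch (an analogy only, not ZFS or FreeBSD kernel code) in which two
plain pthread mutexes stand in for the ZFS range lock and the page busy
lock; each thread acquires the two locks in the opposite order, which is
exactly the ABBA pattern that would deadlock with blocking locks.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the ZFS range lock and the VM page busy lock. */
static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_busy = PTHREAD_MUTEX_INITIALIZER;
static pthread_barrier_t barrier;

/* Models zfs_read()/zfs_write(): range lock first, then busy the pages. */
static void *
reader(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&range_lock);
        pthread_barrier_wait(&barrier); /* both threads now hold their first lock */
        if (pthread_mutex_trylock(&page_busy) != 0)
                printf("reader: page busied by the pager; blocking here would deadlock\n");
        else
                pthread_mutex_unlock(&page_busy);
        pthread_mutex_unlock(&range_lock);
        return (NULL);
}

/* Models a page-in path that busies pages first, then takes the range lock. */
static void *
pager(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&page_busy);
        pthread_barrier_wait(&barrier);
        if (pthread_mutex_trylock(&range_lock) != 0)
                printf("pager: range locked by the reader; blocking here would deadlock\n");
        else
                pthread_mutex_unlock(&range_lock);
        pthread_mutex_unlock(&page_busy);
        return (NULL);
}

int
main(void)
{
        pthread_t t1, t2;

        pthread_barrier_init(&barrier, NULL, 2);
        pthread_create(&t1, NULL, reader, NULL);
        pthread_create(&t2, NULL, pager, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        /* With blocking lock calls instead of trylock, both threads would wait forever. */
        return (0);
}

Because each thread uses trylock on its second lock, the sketch
terminates and merely reports the conflict instead of hanging.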

I believe that FreeBSD does not have that problem, because its page-in
path deals only with invalid pages while zfs_read() and zfs_write() need
to access only valid pages.  They do not wait on a busy page unless it
is already valid.

Reviewed by:	kib
MFC after:	3 weeks
Differential Revision: https://reviews.freebsd.org/D14263
Author: Andriy Gapon
Date:   2018-02-16 06:59:35 +00:00
Commit: c945107d23 (parent b761400b51)
3 changed files with 241 additions and 53 deletions

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c

@@ -1518,6 +1518,188 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (err);
}

int
dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, int last_size)
{
        struct sf_buf *sf;
        vm_object_t vmobj;
        vm_page_t m;
        dmu_buf_t **dbp;
        dmu_buf_t *db;
        caddr_t va;
        int numbufs, i;
        int bufoff, pgoff, tocpy;
        int mi, di;
        int err;

        ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
        ASSERT(last_size <= PAGE_SIZE);

        err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
            IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
        if (err != 0)
                return (err);

#ifdef DEBUG
        IMPLY(last_size < PAGE_SIZE, *rahead == 0);
        if (dbp[0]->db_offset != 0 || numbufs > 1) {
                for (i = 0; i < numbufs; i++) {
                        ASSERT(ISP2(dbp[i]->db_size));
                        ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
                        ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
                }
        }
#endif

        vmobj = ma[0]->object;
        zfs_vmobject_wlock(vmobj);

        db = dbp[0];
        for (i = 0; i < *rbehind; i++) {
                m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
                    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
                if (m == NULL)
                        break;
                if (m->valid != 0) {
                        ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
                        break;
                }
                ASSERT(m->dirty == 0);
                ASSERT(!pmap_page_is_mapped(m));

                ASSERT(db->db_size > PAGE_SIZE);
                bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
                va = zfs_map_page(m, &sf);
                bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
                zfs_unmap_page(sf);
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_lock(m);
                if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
                        vm_page_activate(m);
                else
                        vm_page_deactivate(m);
                vm_page_unlock(m);
        }
        *rbehind = i;

        bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
        pgoff = 0;
        for (mi = 0, di = 0; mi < count && di < numbufs; ) {
                if (pgoff == 0) {
                        m = ma[mi];
                        vm_page_assert_xbusied(m);
                        ASSERT(m->valid == 0);
                        ASSERT(m->dirty == 0);
                        ASSERT(!pmap_page_is_mapped(m));
                        va = zfs_map_page(m, &sf);
                }
                if (bufoff == 0)
                        db = dbp[di];

                ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
                    db->db_offset + bufoff);

                /*
                 * We do not need to clamp the copy size by the file
                 * size as the last block is zero-filled beyond the
                 * end of file anyway.
                 */
                tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
                bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);

                pgoff += tocpy;
                ASSERT(pgoff <= PAGESIZE);
                if (pgoff == PAGESIZE) {
                        zfs_unmap_page(sf);
                        m->valid = VM_PAGE_BITS_ALL;
                        ASSERT(mi < count);
                        mi++;
                        pgoff = 0;
                }

                bufoff += tocpy;
                ASSERT(bufoff <= db->db_size);
                if (bufoff == db->db_size) {
                        ASSERT(di < numbufs);
                        di++;
                        bufoff = 0;
                }
        }
#ifdef DEBUG
        /*
         * Three possibilities:
         * - last requested page ends at a buffer boundary and, thus,
         *   all pages and buffers have been iterated;
         * - all requested pages are filled, but the last buffer
         *   has not been exhausted;
         *   the read-ahead is possible only in this case;
         * - all buffers have been read, but the last page has not been
         *   fully filled;
         *   this is only possible if the file has only a single buffer
         *   with a size that is not a multiple of the page size.
         */
        if (mi == count) {
                ASSERT(di >= numbufs - 1);
                IMPLY(*rahead != 0, di == numbufs - 1);
                IMPLY(*rahead != 0, bufoff != 0);
                ASSERT(pgoff == 0);
        }
        if (di == numbufs) {
                ASSERT(mi >= count - 1);
                ASSERT(*rahead == 0);
                IMPLY(pgoff == 0, mi == count);
                if (pgoff != 0) {
                        ASSERT(mi == count - 1);
                        ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
                }
        }
#endif
        if (pgoff != 0) {
                bzero(va + pgoff, PAGESIZE - pgoff);
                zfs_unmap_page(sf);
                m->valid = VM_PAGE_BITS_ALL;
        }

        for (i = 0; i < *rahead; i++) {
                m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
                    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
                if (m == NULL)
                        break;
                if (m->valid != 0) {
                        ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
                        break;
                }
                ASSERT(m->dirty == 0);
                ASSERT(!pmap_page_is_mapped(m));

                ASSERT(db->db_size > PAGE_SIZE);
                bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
                tocpy = MIN(db->db_size - bufoff, PAGESIZE);
                va = zfs_map_page(m, &sf);
                bcopy((char *)db->db_data + bufoff, va, tocpy);
                if (tocpy < PAGESIZE) {
                        ASSERT(i == *rahead - 1);
                        ASSERT((db->db_size & PAGE_MASK) != 0);
                        bzero(va + tocpy, PAGESIZE - tocpy);
                }
                zfs_unmap_page(sf);
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_lock(m);
                if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
                        vm_page_activate(m);
                else
                        vm_page_deactivate(m);
                vm_page_unlock(m);
        }
        *rahead = i;
        zfs_vmobject_wunlock(vmobj);

        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (0);
}

#endif /* illumos */
#endif /* _KERNEL */

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h

@@ -750,6 +750,8 @@ int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
#else
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, int last_size);
#endif
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c

@@ -4514,81 +4514,85 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
}

static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        objset_t *os = zp->z_zfsvfs->z_os;
        vm_page_t mlast;
        rl_t *rl;
        vm_object_t object;
        caddr_t va;
        struct sf_buf *sf;
        off_t startoff, endoff;
        int i, error;
        vm_pindex_t reqstart, reqend;
        int lsize, size;
        object = m[0]->object;
        error = 0;
        off_t start, end, obj_size;
        uint_t blksz;
        int pgsin_b, pgsin_a;
        int error;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);

        zfs_vmobject_wlock(object);
        if (m[count - 1]->valid != 0 && --count == 0) {
                zfs_vmobject_wunlock(object);
                goto out;
        start = IDX_TO_OFF(ma[0]->pindex);
        end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

        /*
         * Lock a range covering all required and optional pages.
         * Note that we need to handle the case of the block size growing.
         */
        for (;;) {
                blksz = zp->z_blksz;
                rl = zfs_range_lock(zp, rounddown(start, blksz),
                    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
                if (blksz == zp->z_blksz)
                        break;
                zfs_range_unlock(rl);
        }

        mlast = m[count - 1];
        if (IDX_TO_OFF(mlast->pindex) >=
            object->un_pager.vnp.vnp_size) {
                zfs_vmobject_wunlock(object);
        object = ma[0]->object;
        zfs_vmobject_wlock(object);
        obj_size = object->un_pager.vnp.vnp_size;
        zfs_vmobject_wunlock(object);
        if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
                zfs_range_unlock(rl);
                ZFS_EXIT(zfsvfs);
                return (zfs_vm_pagerret_bad);
        }

        VM_CNT_INC(v_vnodein);
        VM_CNT_ADD(v_vnodepgsin, count);
        lsize = PAGE_SIZE;
        if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
                lsize = object->un_pager.vnp.vnp_size -
                    IDX_TO_OFF(mlast->pindex);
        zfs_vmobject_wunlock(object);
        for (i = 0; i < count; i++) {
                size = PAGE_SIZE;
                if (i == count - 1)
                        size = lsize;
                va = zfs_map_page(m[i], &sf);
                error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
                    size, va, DMU_READ_PREFETCH);
                if (size != PAGE_SIZE)
                        bzero(va + size, PAGE_SIZE - size);
                zfs_unmap_page(sf);
                if (error != 0)
                        goto out;
        pgsin_b = 0;
        if (rbehind != NULL) {
                pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
                pgsin_b = MIN(*rbehind, pgsin_b);
        }

        zfs_vmobject_wlock(object);
        for (i = 0; i < count; i++)
                m[i]->valid = VM_PAGE_BITS_ALL;
        zfs_vmobject_wunlock(object);
        pgsin_a = 0;
        if (rahead != NULL) {
                pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
                if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
                        pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
                pgsin_a = MIN(*rahead, pgsin_a);
        }

out:
        /*
         * NB: we need to pass the exact byte size of the data that we expect
         * to read after accounting for the file size. This is required because
         * ZFS will panic if we request DMU to read beyond the end of the last
         * allocated block.
         */
        error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
            MIN(end, obj_size) - (end - PAGE_SIZE));

        zfs_range_unlock(rl);
        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
        ZFS_EXIT(zfsvfs);

        if (error == 0) {
                if (rbehind)
                        *rbehind = 0;
                if (rahead)
                        *rahead = 0;
                return (zfs_vm_pagerret_ok);
        } else
        if (error != 0)
                return (zfs_vm_pagerret_error);

        VM_CNT_INC(v_vnodein);
        VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);

        if (rbehind != NULL)
                *rbehind = pgsin_b;
        if (rahead != NULL)
                *rahead = pgsin_a;

        return (zfs_vm_pagerret_ok);
}
static int