read-behind / read-ahead support for zfs_getpages()

ZFS caches the blocks it reads in its ARC, so in general the optional
pages are not as useful as they are for filesystems that read data
directly into the target pages.  Still, filling the optional pages
reduces the number of page faults and the associated VM / VFS / ZFS
calls.
Another case that gets optimized, as a side effect, is paging in from a
hole.  ZFS DMU does not currently provide a convenient API to check for
a hole; instead, it creates a temporary zero-filled block and allows
accessing it as if it were a normal data block.  Getting multiple pages
from a hole one by one therefore results in repeated creation and
destruction of that temporary block (and of an associated ARC header).
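
As an illustration of how the read-behind / read-ahead window is sized,
here is a small userspace sketch (not ZFS code; the macro definitions,
sample block size, file size, and offsets are assumptions made up for
the example).  It mirrors the pgsin_b / pgsin_a clamping done in
zfs_getpages() below: the requested page run is extended to the
enclosing block boundaries and then clamped by the file size and by the
budget offered by the pager.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel helpers used by zfs_getpages(). */
#define PAGE_SIZE       4096ULL
#define OFF_TO_IDX(off) ((off) / PAGE_SIZE)
#define rounddown(x, y) (((x) / (y)) * (y))
#define roundup(x, y)   ((((x) + (y) - 1) / (y)) * (y))
#define round_page(x)   roundup((x), PAGE_SIZE)
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        uint64_t blksz = 128 * 1024;            /* dataset block size */
        uint64_t obj_size = 240 * 1024;         /* file size, not block-aligned */
        uint64_t start = 200 * 1024;            /* offset of the first requested page */
        uint64_t end = start + 4 * PAGE_SIZE;   /* one past the last requested page */
        uint64_t rbehind = 16, rahead = 16;     /* budget offered by the pager */
        uint64_t pgsin_b, pgsin_a;

        /* Read-behind: pages between the block start and the first requested page. */
        pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
        pgsin_b = MIN(rbehind, pgsin_b);

        /* Read-ahead: pages up to the block end, clamped by the end of the file. */
        pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
        if (end + pgsin_a * PAGE_SIZE >= obj_size)
                pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
        pgsin_a = MIN(rahead, pgsin_a);

        /* Prints: read-behind 16 pages, read-ahead 6 pages. */
        printf("read-behind %ju pages, read-ahead %ju pages\n",
            (uintmax_t)pgsin_b, (uintmax_t)pgsin_a);
        return (0);
}

With a 128 KB block and a fault near the end of a 240 KB file, the
window is cut short by the pager's budget on the read-behind side and by
the file size on the read-ahead side.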

Tested with fsx using various supported block sizes from 512 bytes
to 128 KB, and additionally 1 MB.

Please note that illumos and ZoL do not take the range lock in the
page-in path.  That is because ZFS has a double-caching problem between
the ARC and the page cache, which requires zfs_read() and zfs_write()
to consult pages in the page cache.  In those functions they first lock
a range and then lock the pages corresponding to that range, while in
the page-in (and maybe page-out) path they would first lock the pages
and then the range.  So, they would deadlock.
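
To make that lock-order inversion concrete, here is a minimal userspace
sketch (an analogy only, not ZFS or FreeBSD kernel code) in which two
plain pthread mutexes stand in for the ZFS range lock and the page busy
lock; each thread acquires the two locks in the opposite order, which is
exactly the ABBA pattern that would deadlock with blocking locks.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the ZFS range lock and the VM page busy lock. */
static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_busy = PTHREAD_MUTEX_INITIALIZER;
static pthread_barrier_t barrier;

/* Models zfs_read()/zfs_write(): range lock first, then busy the pages. */
static void *
reader(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&range_lock);
        pthread_barrier_wait(&barrier); /* both threads now hold their first lock */
        if (pthread_mutex_trylock(&page_busy) != 0)
                printf("reader: page busied by the pager; blocking here would deadlock\n");
        else
                pthread_mutex_unlock(&page_busy);
        pthread_mutex_unlock(&range_lock);
        return (NULL);
}

/* Models a page-in path that busies pages first, then takes the range lock. */
static void *
pager(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&page_busy);
        pthread_barrier_wait(&barrier);
        if (pthread_mutex_trylock(&range_lock) != 0)
                printf("pager: range locked by the reader; blocking here would deadlock\n");
        else
                pthread_mutex_unlock(&range_lock);
        pthread_mutex_unlock(&page_busy);
        return (NULL);
}

int
main(void)
{
        pthread_t t1, t2;

        pthread_barrier_init(&barrier, NULL, 2);
        pthread_create(&t1, NULL, reader, NULL);
        pthread_create(&t2, NULL, pager, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        /* With blocking lock calls instead of trylock, both threads would wait forever. */
        return (0);
}

Because each thread uses trylock on its second lock, the sketch
terminates and merely reports the conflict instead of hanging.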

I believe that FreeBSD does not have that problem, because its page-in
path deals only with invalid pages while zfs_read() and zfs_write() need
to access only valid pages.  They do not wait on a busy page unless it
is already valid.

Reviewed by:	kib
MFC after:	3 weeks
Differential Revision: https://reviews.freebsd.org/D14263
Author: Andriy Gapon
Date:   2018-02-16 06:59:35 +00:00
Commit: c945107d23 (parent b761400b51)
3 changed files with 241 additions and 53 deletions

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c

@@ -1518,6 +1518,188 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (err);
}

int
dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, int last_size)
{
        struct sf_buf *sf;
        vm_object_t vmobj;
        vm_page_t m;
        dmu_buf_t **dbp;
        dmu_buf_t *db;
        caddr_t va;
        int numbufs, i;
        int bufoff, pgoff, tocpy;
        int mi, di;
        int err;

        ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
        ASSERT(last_size <= PAGE_SIZE);

        err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
            IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
        if (err != 0)
                return (err);

#ifdef DEBUG
        IMPLY(last_size < PAGE_SIZE, *rahead == 0);
        if (dbp[0]->db_offset != 0 || numbufs > 1) {
                for (i = 0; i < numbufs; i++) {
                        ASSERT(ISP2(dbp[i]->db_size));
                        ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
                        ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
                }
        }
#endif

        vmobj = ma[0]->object;
        zfs_vmobject_wlock(vmobj);

        db = dbp[0];
        for (i = 0; i < *rbehind; i++) {
                m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
                    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
                if (m == NULL)
                        break;
                if (m->valid != 0) {
                        ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
                        break;
                }
                ASSERT(m->dirty == 0);
                ASSERT(!pmap_page_is_mapped(m));

                ASSERT(db->db_size > PAGE_SIZE);
                bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
                va = zfs_map_page(m, &sf);
                bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
                zfs_unmap_page(sf);
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_lock(m);
                if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
                        vm_page_activate(m);
                else
                        vm_page_deactivate(m);
                vm_page_unlock(m);
        }
        *rbehind = i;

        bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
        pgoff = 0;
        for (mi = 0, di = 0; mi < count && di < numbufs; ) {
                if (pgoff == 0) {
                        m = ma[mi];
                        vm_page_assert_xbusied(m);
                        ASSERT(m->valid == 0);
                        ASSERT(m->dirty == 0);
                        ASSERT(!pmap_page_is_mapped(m));
                        va = zfs_map_page(m, &sf);
                }
                if (bufoff == 0)
                        db = dbp[di];

                ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
                    db->db_offset + bufoff);

                /*
                 * We do not need to clamp the copy size by the file
                 * size as the last block is zero-filled beyond the
                 * end of file anyway.
                 */
                tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
                bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);

                pgoff += tocpy;
                ASSERT(pgoff <= PAGESIZE);
                if (pgoff == PAGESIZE) {
                        zfs_unmap_page(sf);
                        m->valid = VM_PAGE_BITS_ALL;
                        ASSERT(mi < count);
                        mi++;
                        pgoff = 0;
                }

                bufoff += tocpy;
                ASSERT(bufoff <= db->db_size);
                if (bufoff == db->db_size) {
                        ASSERT(di < numbufs);
                        di++;
                        bufoff = 0;
                }
        }
#ifdef DEBUG
        /*
         * Three possibilities:
         * - last requested page ends at a buffer boundary and, thus,
         *   all pages and buffers have been iterated;
         * - all requested pages are filled, but the last buffer
         *   has not been exhausted;
         *   the read-ahead is possible only in this case;
         * - all buffers have been read, but the last page has not been
         *   fully filled;
         *   this is only possible if the file has only a single buffer
         *   with a size that is not a multiple of the page size.
         */
        if (mi == count) {
                ASSERT(di >= numbufs - 1);
                IMPLY(*rahead != 0, di == numbufs - 1);
                IMPLY(*rahead != 0, bufoff != 0);
                ASSERT(pgoff == 0);
        }
        if (di == numbufs) {
                ASSERT(mi >= count - 1);
                ASSERT(*rahead == 0);
                IMPLY(pgoff == 0, mi == count);
                if (pgoff != 0) {
                        ASSERT(mi == count - 1);
                        ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
                }
        }
#endif
        if (pgoff != 0) {
                bzero(va + pgoff, PAGESIZE - pgoff);
                zfs_unmap_page(sf);
                m->valid = VM_PAGE_BITS_ALL;
        }

        for (i = 0; i < *rahead; i++) {
                m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
                    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
                if (m == NULL)
                        break;
                if (m->valid != 0) {
                        ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
                        break;
                }
                ASSERT(m->dirty == 0);
                ASSERT(!pmap_page_is_mapped(m));

                ASSERT(db->db_size > PAGE_SIZE);
                bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
                tocpy = MIN(db->db_size - bufoff, PAGESIZE);
                va = zfs_map_page(m, &sf);
                bcopy((char *)db->db_data + bufoff, va, tocpy);
                if (tocpy < PAGESIZE) {
                        ASSERT(i == *rahead - 1);
                        ASSERT((db->db_size & PAGE_MASK) != 0);
                        bzero(va + tocpy, PAGESIZE - tocpy);
                }
                zfs_unmap_page(sf);
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_lock(m);
                if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
                        vm_page_activate(m);
                else
                        vm_page_deactivate(m);
                vm_page_unlock(m);
        }
        *rahead = i;
        zfs_vmobject_wunlock(vmobj);

        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (0);
}

#endif /* illumos */
#endif /* _KERNEL */

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h

@@ -750,6 +750,8 @@ int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
#else
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, int last_size);
#endif
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c

@@ -4514,81 +4514,85 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
}

static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        objset_t *os = zp->z_zfsvfs->z_os;
        vm_page_t mlast;
        rl_t *rl;
        vm_object_t object;
        caddr_t va;
        struct sf_buf *sf;
        off_t startoff, endoff;
        int i, error;
        vm_pindex_t reqstart, reqend;
        int lsize, size;
        object = m[0]->object;
        error = 0;
        off_t start, end, obj_size;
        uint_t blksz;
        int pgsin_b, pgsin_a;
        int error;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);

        zfs_vmobject_wlock(object);
        if (m[count - 1]->valid != 0 && --count == 0) {
                zfs_vmobject_wunlock(object);
                goto out;
        start = IDX_TO_OFF(ma[0]->pindex);
        end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

        /*
         * Lock a range covering all required and optional pages.
         * Note that we need to handle the case of the block size growing.
         */
        for (;;) {
                blksz = zp->z_blksz;
                rl = zfs_range_lock(zp, rounddown(start, blksz),
                    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
                if (blksz == zp->z_blksz)
                        break;
                zfs_range_unlock(rl);
        }

        mlast = m[count - 1];
        if (IDX_TO_OFF(mlast->pindex) >=
            object->un_pager.vnp.vnp_size) {
                zfs_vmobject_wunlock(object);
        object = ma[0]->object;
        zfs_vmobject_wlock(object);
        obj_size = object->un_pager.vnp.vnp_size;
        zfs_vmobject_wunlock(object);
        if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
                zfs_range_unlock(rl);
                ZFS_EXIT(zfsvfs);
                return (zfs_vm_pagerret_bad);
        }

        VM_CNT_INC(v_vnodein);
        VM_CNT_ADD(v_vnodepgsin, count);
        lsize = PAGE_SIZE;
        if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
                lsize = object->un_pager.vnp.vnp_size -
                    IDX_TO_OFF(mlast->pindex);
        zfs_vmobject_wunlock(object);
        for (i = 0; i < count; i++) {
                size = PAGE_SIZE;
                if (i == count - 1)
                        size = lsize;
                va = zfs_map_page(m[i], &sf);
                error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
                    size, va, DMU_READ_PREFETCH);
                if (size != PAGE_SIZE)
                        bzero(va + size, PAGE_SIZE - size);
                zfs_unmap_page(sf);
                if (error != 0)
                        goto out;
        pgsin_b = 0;
        if (rbehind != NULL) {
                pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
                pgsin_b = MIN(*rbehind, pgsin_b);
        }

        zfs_vmobject_wlock(object);
        for (i = 0; i < count; i++)
                m[i]->valid = VM_PAGE_BITS_ALL;
        zfs_vmobject_wunlock(object);
        pgsin_a = 0;
        if (rahead != NULL) {
                pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
                if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
                        pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
                pgsin_a = MIN(*rahead, pgsin_a);
        }

out:
        /*
         * NB: we need to pass the exact byte size of the data that we expect
         * to read after accounting for the file size. This is required because
         * ZFS will panic if we request DMU to read beyond the end of the last
         * allocated block.
         */
        error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
            MIN(end, obj_size) - (end - PAGE_SIZE));

        zfs_range_unlock(rl);
        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
        ZFS_EXIT(zfsvfs);

        if (error == 0) {
                if (rbehind)
                        *rbehind = 0;
                if (rahead)
                        *rahead = 0;
                return (zfs_vm_pagerret_ok);
        } else
        if (error != 0)
                return (zfs_vm_pagerret_error);

        VM_CNT_INC(v_vnodein);
        VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);

        if (rbehind != NULL)
                *rbehind = pgsin_b;
        if (rahead != NULL)
                *rahead = pgsin_a;

        return (zfs_vm_pagerret_ok);
}
static int