The page hold mechanism is fast, but it has a couple of drawbacks:

- It does not let pages respect the LRU policy
- It bloats the active/inactive queues with a few pages

Avoid it as much as possible, with the long-term goal of removing it
completely.
Use the soft-busy mechanism to protect page content accesses during
short-term operations (like uiomove_fromphys()).
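
For illustration only (this helper is not part of the change), here is a
minimal sketch of the soft-busy pattern, mirroring the tmpfs_nocacheread()
hunk below; the function name and arguments are hypothetical, and the API
names are the ones used throughout this change:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

/* Hypothetical helper; not part of the diff. */
static int
softbusy_uiomove(vm_object_t obj, vm_page_t m, vm_offset_t offset,
    int tlen, struct uio *uio)
{
	int error;

	VM_OBJECT_WLOCK(obj);
	/* Soft-busy instead of vm_page_hold(); no page lock needed. */
	vm_page_io_start(m);
	VM_OBJECT_WUNLOCK(obj);

	/* The short-term page content access runs without the object lock. */
	error = uiomove_fromphys(&m, offset, tlen, uio);

	VM_OBJECT_WLOCK(obj);
	vm_page_io_finish(m);
	VM_OBJECT_WUNLOCK(obj);

	return (error);
}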

After this change, only vm_fault_quick_hold_pages() still uses the
hold mechanism for page content access.
There is additional complexity there: the quick path cannot
immediately access the page object in order to busy the page, while
the slow path cannot busy more than one page at a time (to avoid
deadlocks).

Fixing this primitive would allow the page hold mechanism to be
removed completely.
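
Where the page must first be faulted in, the new VM_FAULT_IOBUSY flag
asks vm_fault_handle() to return the page soft-busied instead of held;
the caller then clears the soft-busy state with vm_page_io_finish()
under the object lock.  The hypothetical helper below is only a
condensed sketch of the proc_rwmem() hunk (the real code maps the page
through an sf_buf before copying):

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>	/* vm_fault_handle() */
#include <vm/vm_map.h>		/* VM_FAULT_* flags */
#include <vm/vm_object.h>
#include <vm/vm_page.h>

/* Hypothetical helper; not part of the diff. */
static int
fault_iobusy_copy(vm_map_t map, vm_offset_t va, int len, struct uio *uio)
{
	vm_page_t m;
	int error;

	error = vm_fault_handle(map, trunc_page(va), VM_PROT_READ,
	    VM_FAULT_NORMAL | VM_FAULT_IOBUSY, &m);
	if (error != KERN_SUCCESS)
		return (EFAULT);

	/* Copy out of the soft-busied page. */
	error = uiomove_fromphys(&m, va & PAGE_MASK, len, uio);

	/* Release the page: clear the soft-busy state object-locked. */
	VM_OBJECT_WLOCK(m->object);
	vm_page_io_finish(m);
	VM_OBJECT_WUNLOCK(m->object);

	return (error);
}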

Sponsored by:	EMC / Isilon storage division
Discussed with:	alc
Reviewed by:	jeff
Tested by:	pho
Attilio Rao 2013-08-04 21:07:24 +00:00
parent b9fdaa9b19
commit 3b6714cacb
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=253939
9 changed files with 50 additions and 86 deletions

View File

@ -324,7 +324,8 @@ zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
}
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes,
boolean_t alloc)
{
vm_object_t obj;
vm_page_t pp;
@ -346,6 +347,8 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
continue;
}
} else if (pp == NULL) {
if (!alloc)
break;
pp = vm_page_alloc(obj, OFF_TO_IDX(start),
VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
VM_ALLOC_NOBUSY);
@ -356,8 +359,10 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
if (pp != NULL) {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_object_pip_add(obj, 1);
vm_page_io_start(pp);
if (!alloc)
break;
vm_object_pip_add(obj, 1);
pmap_remove_write(pp);
vm_page_clear_dirty(pp, off, nbytes);
}
@ -367,55 +372,12 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
}
static void
page_unbusy(vm_page_t pp)
page_unbusy(vm_page_t pp, boolean_t unalloc)
{
vm_page_io_finish(pp);
vm_object_pip_subtract(pp->object, 1);
}
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
vm_object_t obj;
vm_page_t pp;
obj = vp->v_object;
zfs_vmobject_assert_wlocked(obj);
for (;;) {
if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
pp->valid) {
if ((pp->oflags & VPO_BUSY) != 0) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
* likely to reclaim it.
*/
vm_page_reference(pp);
vm_page_sleep(pp, "zfsmwb");
continue;
}
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_page_lock(pp);
vm_page_hold(pp);
vm_page_unlock(pp);
} else
pp = NULL;
break;
}
return (pp);
}
static void
page_unhold(vm_page_t pp)
{
vm_page_lock(pp);
vm_page_unhold(pp);
vm_page_unlock(pp);
if (unalloc)
vm_object_pip_subtract(pp->object, 1);
}
static caddr_t
@ -479,7 +441,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
zfs_vmobject_wlock(obj);
vm_page_undirty(pp);
} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
} else if ((pp = page_busy(vp, start, off, nbytes,
TRUE)) != NULL) {
zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
@ -488,7 +451,7 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
page_unbusy(pp);
page_unbusy(pp, TRUE);
}
len -= nbytes;
off = 0;
@ -598,7 +561,7 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
vm_page_t pp;
uint64_t bytes = MIN(PAGESIZE - off, len);
if (pp = page_hold(vp, start)) {
if (pp = page_busy(vp, start, 0, 0, FALSE)) {
struct sf_buf *sf;
caddr_t va;
@ -607,7 +570,7 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
error = uiomove(va + off, bytes, UIO_READ, uio);
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
page_unhold(pp);
page_unbusy(pp, FALSE);
} else {
zfs_vmobject_wunlock(obj);
error = dmu_read_uio(os, zp->z_id, uio, bytes);

View File

@ -485,13 +485,13 @@ tmpfs_nocacheread(vm_object_t tobj, vm_pindex_t idx,
vm_page_zero_invalid(m, TRUE);
vm_page_wakeup(m);
}
vm_page_lock(m);
vm_page_hold(m);
vm_page_unlock(m);
vm_page_io_start(m);
VM_OBJECT_WUNLOCK(tobj);
error = uiomove_fromphys(&m, offset, tlen, uio);
VM_OBJECT_WLOCK(tobj);
vm_page_io_finish(m);
VM_OBJECT_WUNLOCK(tobj);
vm_page_lock(m);
vm_page_unhold(m);
if (m->queue == PQ_NONE) {
vm_page_deactivate(m);
} else {
@ -602,16 +602,14 @@ tmpfs_mappedwrite(vm_object_t tobj, size_t len, struct uio *uio)
vm_page_zero_invalid(tpg, TRUE);
vm_page_wakeup(tpg);
}
vm_page_lock(tpg);
vm_page_hold(tpg);
vm_page_unlock(tpg);
vm_page_io_start(tpg);
VM_OBJECT_WUNLOCK(tobj);
error = uiomove_fromphys(&tpg, offset, tlen, uio);
VM_OBJECT_WLOCK(tobj);
vm_page_io_finish(tpg);
if (error == 0)
vm_page_dirty(tpg);
vm_page_lock(tpg);
vm_page_unhold(tpg);
if (tpg->queue == PQ_NONE) {
vm_page_deactivate(tpg);
} else {

View File

@ -378,7 +378,7 @@ __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
off = offset - trunc_page(offset);
error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
end - start);
vm_imgact_unmap_page(sf);
vm_imgact_unmap_page(object, sf);
if (error) {
return (KERN_FAILURE);
}
@ -433,7 +433,7 @@ __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
sz = PAGE_SIZE - off;
error = copyout((caddr_t)sf_buf_kva(sf) + off,
(caddr_t)start, sz);
vm_imgact_unmap_page(sf);
vm_imgact_unmap_page(object, sf);
if (error) {
return (KERN_FAILURE);
}
@ -553,7 +553,7 @@ __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
trunc_page(offset + filsz);
error = copyout((caddr_t)sf_buf_kva(sf) + off,
(caddr_t)map_addr, copy_len);
vm_imgact_unmap_page(sf);
vm_imgact_unmap_page(object, sf);
if (error) {
return (error);
}

View File

@ -973,7 +973,7 @@ exec_map_first_page(imgp)
vm_page_wakeup(ma[0]);
}
vm_page_lock(ma[0]);
vm_page_hold(ma[0]);
vm_page_wire(ma[0]);
vm_page_unlock(ma[0]);
VM_OBJECT_WUNLOCK(object);
@ -994,7 +994,7 @@ exec_unmap_first_page(imgp)
sf_buf_free(imgp->firstpage);
imgp->firstpage = NULL;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unwire(m, 0);
vm_page_unlock(m);
}
}

View File

@ -263,6 +263,7 @@ proc_rwmem(struct proc *p, struct uio *uio)
writing = uio->uio_rw == UIO_WRITE;
reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
fault_flags |= VM_FAULT_IOBUSY;
/*
* Only map in one page at a time. We don't have to, but it
@ -287,9 +288,9 @@ proc_rwmem(struct proc *p, struct uio *uio)
len = min(PAGE_SIZE - page_offset, uio->uio_resid);
/*
* Fault and hold the page on behalf of the process.
* Fault and busy the page on behalf of the process.
*/
error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
error = vm_fault_handle(map, pageno, reqprot, fault_flags, &m);
if (error != KERN_SUCCESS) {
if (error == KERN_RESOURCE_SHORTAGE)
error = ENOMEM;
@ -315,9 +316,9 @@ proc_rwmem(struct proc *p, struct uio *uio)
/*
* Release the page.
*/
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
VM_OBJECT_WLOCK(m->object);
vm_page_io_finish(m);
VM_OBJECT_WUNLOCK(m->object);
} while (error == 0 && uio->uio_resid > 0);

View File

@ -63,7 +63,7 @@ void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
vm_ooffset_t *);
int vm_fault_disable_pagefaults(void);
void vm_fault_enable_pagefaults(int save);
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int vm_fault_handle(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold);
int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count);
@ -87,7 +87,7 @@ void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
int vslock(void *, size_t);
void vsunlock(void *, size_t);
struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
void vm_imgact_unmap_page(struct sf_buf *sf);
void vm_imgact_unmap_page(vm_object_t, struct sf_buf *sf);
void vm_thread_dispose(struct thread *td);
int vm_thread_new(struct thread *td, int pages);
int vm_mlock(struct proc *, struct ucred *, const void *, size_t);

View File

@ -221,8 +221,8 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
ktrfault(vaddr, fault_type);
#endif
result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
NULL);
result = vm_fault_handle(map, trunc_page(vaddr), fault_type,
fault_flags, NULL);
#ifdef KTRACE
if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
ktrfaultend(result);
@ -231,7 +231,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
}
int
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
vm_fault_handle(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
{
vm_prot_t prot;
@ -943,7 +943,10 @@ RetryFault:;
vm_page_activate(fs.m);
if (m_hold != NULL) {
*m_hold = fs.m;
vm_page_hold(fs.m);
if (fault_flags & VM_FAULT_IOBUSY)
vm_page_io_start(fs.m);
else
vm_page_hold(fs.m);
}
vm_page_unlock(fs.m);
vm_page_wakeup(fs.m);
@ -1145,7 +1148,7 @@ vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
* and hold these pages.
*/
for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
if (*mp == NULL && vm_fault_hold(map, va, prot,
if (*mp == NULL && vm_fault_handle(map, va, prot,
VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
goto error;
}

View File

@ -223,7 +223,7 @@ vsunlock(void *addr, size_t len)
* Return the pinned page if successful; otherwise, return NULL.
*/
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
vm_imgact_page_iostart(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m, ma[1];
vm_pindex_t pindex;
@ -249,9 +249,7 @@ vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
}
vm_page_wakeup(m);
}
vm_page_lock(m);
vm_page_hold(m);
vm_page_unlock(m);
vm_page_io_start(m);
out:
VM_OBJECT_WUNLOCK(object);
return (m);
@ -266,7 +264,7 @@ vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m;
m = vm_imgact_hold_page(object, offset);
m = vm_imgact_page_iostart(object, offset);
if (m == NULL)
return (NULL);
sched_pin();
@ -277,16 +275,16 @@ vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
* Destroy the given CPU private mapping and unpin the page that it mapped.
*/
void
vm_imgact_unmap_page(struct sf_buf *sf)
vm_imgact_unmap_page(vm_object_t object, struct sf_buf *sf)
{
vm_page_t m;
m = sf_buf_page(sf);
sf_buf_free(sf);
sched_unpin();
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
VM_OBJECT_WLOCK(object);
vm_page_io_finish(m);
VM_OBJECT_WUNLOCK(object);
}
void

View File

@ -329,6 +329,7 @@ long vmspace_resident_count(struct vmspace *vmspace);
#define VM_FAULT_NORMAL 0 /* Nothing special */
#define VM_FAULT_CHANGE_WIRING 1 /* Change the wiring as appropriate */
#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */
#define VM_FAULT_IOBUSY 4 /* Busy the faulted page */
/*
* Initially, mappings are slightly sequential. The maximum window size must