Introduce vm_fault_hold() and use it to (1) eliminate a long-standing race

condition in proc_rwmem() and to (2) simplify the implementation of the cxgb driver's vm_fault_hold_user_pages(). Specifically, in proc_rwmem() the requested read or write could fail because the targeted page could be reclaimed between the calls to vm_fault() and vm_page_hold(). In collaboration with: kib@ MFC after: 6 weeks
2010-12-20 22:49:31 +00:00 · 2010-12-20 22:49:31 +00:00 · acd11c7499
commit acd11c7499
parent 091c4c86d1
5 changed files with 42 additions and 81 deletions
--- a/sys/dev/cxgb/ulp/tom/cxgb_vm.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.c
@ -66,7 +66,7 @@ vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
    int count, vm_prot_t prot)
 {
 	vm_offset_t end, va;
-	int faults, rv;
+	int faults;
 	pmap_t pmap;
 	vm_page_t m, *pages;
 	
@ -122,20 +122,11 @@ vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
 	/*
 	 * Pages either have insufficient permissions or are not present
 	 * trigger a fault where neccessary
-	 * 
 	 */
-	rv = 0;
 	for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) {
-		/*
-		 * Account for a very narrow race where the page may be
-		 * taken away from us before it is held
-		 */
-		while (*pages == NULL) {
-			rv = vm_fault(map, va, prot, VM_FAULT_NORMAL);
-			if (rv) 
-				goto error;
-			*pages = pmap_extract_and_hold(pmap, va, prot);
-		}
+		if (*pages == NULL && vm_fault_hold(map, va, prot,
+		    VM_FAULT_NORMAL, pages) != KERN_SUCCESS)
+			goto error;
 	}
 	return (0);
 error:	
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@ -237,10 +237,9 @@ int
 proc_rwmem(struct proc *p, struct uio *uio)
 {
 	vm_map_t map;
-	vm_object_t backing_object, object;
 	vm_offset_t pageno;		/* page number */
 	vm_prot_t reqprot;
-	int error, writing;
+	int error, fault_flags, page_offset, writing;

 	/*
 	 * Assert that someone has locked this vmspace.  (Should be
@ -255,26 +254,24 @@ proc_rwmem(struct proc *p, struct uio *uio)
 	 */
 	map = &p->p_vmspace->vm_map;

+	/*
+	 * If we are writing, then we request vm_fault() to create a private
+	 * copy of each page.  Since these copies will not be writeable by the
+	 * process, we must explicity request that they be dirtied.
+	 */
 	writing = uio->uio_rw == UIO_WRITE;
 	reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
+	fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;

 	/*
 	 * Only map in one page at a time.  We don't have to, but it
 	 * makes things easier.  This way is trivial - right?
 	 */
 	do {
-		vm_map_t tmap;
 		vm_offset_t uva;
-		int page_offset;		/* offset into page */
-		vm_map_entry_t out_entry;
-		vm_prot_t out_prot;
-		boolean_t wired;
-		vm_pindex_t pindex;
 		u_int len;
 		vm_page_t m;

-		object = NULL;
-
 		uva = (vm_offset_t)uio->uio_offset;

 		/*
@ -289,10 +286,10 @@ proc_rwmem(struct proc *p, struct uio *uio)
 		len = min(PAGE_SIZE - page_offset, uio->uio_resid);

 		/*
-		 * Fault the page on behalf of the process
+		 * Fault and hold the page on behalf of the process.
 		 */
-		error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
-		if (error) {
+		error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
+		if (error != KERN_SUCCESS) {
 			if (error == KERN_RESOURCE_SHORTAGE)
 				error = ENOMEM;
 			else
@ -300,62 +297,19 @@ proc_rwmem(struct proc *p, struct uio *uio)
 			break;
 		}

-		/*
-		 * Now we need to get the page.  out_entry and wired
-		 * aren't used.  One would think the vm code
-		 * would be a *bit* nicer...  We use tmap because
-		 * vm_map_lookup() can change the map argument.
-		 */
-		tmap = map;
-		error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
-		    &object, &pindex, &out_prot, &wired);
-		if (error) {
-			error = EFAULT;
-			break;
-		}
-		VM_OBJECT_LOCK(object);
-		while ((m = vm_page_lookup(object, pindex)) == NULL &&
-		    !writing &&
-		    (backing_object = object->backing_object) != NULL) {
-			/*
-			 * Allow fallback to backing objects if we are reading.
-			 */
-			VM_OBJECT_LOCK(backing_object);
-			pindex += OFF_TO_IDX(object->backing_object_offset);
-			VM_OBJECT_UNLOCK(object);
-			object = backing_object;
-		}
-		if (writing && m != NULL) {
-			vm_page_dirty(m);
-			vm_pager_page_unswapped(m);
-		}
-		VM_OBJECT_UNLOCK(object);
-		if (m == NULL) {
-			vm_map_lookup_done(tmap, out_entry);
-			error = EFAULT;
-			break;
-		}
-
-		/*
-		 * Hold the page in memory.
-		 */
-		vm_page_lock(m);
-		vm_page_hold(m);
-		vm_page_unlock(m);
-
-		/*
-		 * We're done with tmap now.
-		 */
-		vm_map_lookup_done(tmap, out_entry);
-
 		/*
 		 * Now do the i/o move.
 		 */
 		error = uiomove_fromphys(&m, page_offset, len, uio);

 		/* Make the I-cache coherent for breakpoints. */
-		if (!error && writing && (out_prot & VM_PROT_EXECUTE))
-			vm_sync_icache(map, uva, len);
+		if (writing && error == 0) {
+			vm_map_lock_read(map);
+			if (vm_map_check_protection(map, pageno, pageno +
+			    PAGE_SIZE, VM_PROT_EXECUTE))
+				vm_sync_icache(map, uva, len);
+			vm_map_unlock_read(map);
+		}

 		/*
 		 * Release the page.
--- a/sys/vm/vm_extern.h
+++ b/sys/vm/vm_extern.h
@ -61,6 +61,8 @@ int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
    vm_ooffset_t *);
+int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+    int fault_flags, vm_page_t *m_hold);
 void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@ -201,13 +201,20 @@ unlock_and_deallocate(struct faultstate *fs)
 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *	a standard error specifying why the fault is fatal is returned.
 *
- *
 *	The map in question must be referenced, and remains so.
 *	Caller may hold no locks.
 */
 int
 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
-	 int fault_flags)
+    int fault_flags)
+{
+
+	return (vm_fault_hold(map, vaddr, fault_type, fault_flags, NULL));
+}
+
+int
+vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+    int fault_flags, vm_page_t *m_hold)
 {
 	vm_prot_t prot;
 	int is_first_object_locked, result;
@ -880,7 +887,8 @@ RetryFault:;
 	if (hardfault)
 		fs.entry->lastr = fs.pindex + faultcount - behind;

-	if (prot & VM_PROT_WRITE) {
+	if ((prot & VM_PROT_WRITE) != 0 ||
+	    (fault_flags & VM_FAULT_DIRTY) != 0) {
 		vm_object_set_writeable_dirty(fs.object);

 		/*
@ -906,8 +914,9 @@ RetryFault:;
 		 * Also tell the backing pager, if any, that it should remove
 		 * any swap backing since the page is now dirty.
 		 */
-		if ((fault_type & VM_PROT_WRITE) != 0 &&
-		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) {
+		if (((fault_type & VM_PROT_WRITE) != 0 &&
+		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
+		    (fault_flags & VM_FAULT_DIRTY) != 0) {
 			vm_page_dirty(fs.m);
 			vm_pager_page_unswapped(fs.m);
 		}
@ -949,6 +958,10 @@ RetryFault:;
 			vm_page_unwire(fs.m, 1);
 	} else
 		vm_page_activate(fs.m);
+	if (m_hold != NULL) {
+		*m_hold = fs.m;
+		vm_page_hold(fs.m);
+	}
 	vm_page_unlock(fs.m);
 	vm_page_wakeup(fs.m);

--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@ -325,6 +325,7 @@ long vmspace_wired_count(struct vmspace *vmspace);
 */
 #define VM_FAULT_NORMAL 0		/* Nothing special */
 #define VM_FAULT_CHANGE_WIRING 1	/* Change the wiring as appropriate */
+#define	VM_FAULT_DIRTY 2		/* Dirty the page; use w/VM_PROT_COPY */

 /*
 * The following "find_space" options are supported by vm_map_find()