Introduce vm_fault_hold() and use it to (1) eliminate a long-standing race

condition in proc_rwmem() and to (2) simplify the implementation of the
cxgb driver's vm_fault_hold_user_pages().  Specifically, in proc_rwmem()
the requested read or write could fail because the targeted page could be
reclaimed between the calls to vm_fault() and vm_page_hold().

In collaboration with:	kib@
MFC after:	6 weeks
This commit is contained in:
Alan Cox 2010-12-20 22:49:31 +00:00
parent 091c4c86d1
commit acd11c7499
5 changed files with 42 additions and 81 deletions

View File

@ -66,7 +66,7 @@ vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
int count, vm_prot_t prot)
{
vm_offset_t end, va;
int faults, rv;
int faults;
pmap_t pmap;
vm_page_t m, *pages;
@ -122,20 +122,11 @@ vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
/*
* Pages either have insufficient permissions or are not present
* trigger a fault where neccessary
*
*/
rv = 0;
for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) {
/*
* Account for a very narrow race where the page may be
* taken away from us before it is held
*/
while (*pages == NULL) {
rv = vm_fault(map, va, prot, VM_FAULT_NORMAL);
if (rv)
goto error;
*pages = pmap_extract_and_hold(pmap, va, prot);
}
if (*pages == NULL && vm_fault_hold(map, va, prot,
VM_FAULT_NORMAL, pages) != KERN_SUCCESS)
goto error;
}
return (0);
error:

View File

@ -237,10 +237,9 @@ int
proc_rwmem(struct proc *p, struct uio *uio)
{
vm_map_t map;
vm_object_t backing_object, object;
vm_offset_t pageno; /* page number */
vm_prot_t reqprot;
int error, writing;
int error, fault_flags, page_offset, writing;
/*
* Assert that someone has locked this vmspace. (Should be
@ -255,26 +254,24 @@ proc_rwmem(struct proc *p, struct uio *uio)
*/
map = &p->p_vmspace->vm_map;
/*
* If we are writing, then we request vm_fault() to create a private
* copy of each page. Since these copies will not be writeable by the
* process, we must explicity request that they be dirtied.
*/
writing = uio->uio_rw == UIO_WRITE;
reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
/*
* Only map in one page at a time. We don't have to, but it
* makes things easier. This way is trivial - right?
*/
do {
vm_map_t tmap;
vm_offset_t uva;
int page_offset; /* offset into page */
vm_map_entry_t out_entry;
vm_prot_t out_prot;
boolean_t wired;
vm_pindex_t pindex;
u_int len;
vm_page_t m;
object = NULL;
uva = (vm_offset_t)uio->uio_offset;
/*
@ -289,10 +286,10 @@ proc_rwmem(struct proc *p, struct uio *uio)
len = min(PAGE_SIZE - page_offset, uio->uio_resid);
/*
* Fault the page on behalf of the process
* Fault and hold the page on behalf of the process.
*/
error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
if (error) {
error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
if (error != KERN_SUCCESS) {
if (error == KERN_RESOURCE_SHORTAGE)
error = ENOMEM;
else
@ -300,62 +297,19 @@ proc_rwmem(struct proc *p, struct uio *uio)
break;
}
/*
* Now we need to get the page. out_entry and wired
* aren't used. One would think the vm code
* would be a *bit* nicer... We use tmap because
* vm_map_lookup() can change the map argument.
*/
tmap = map;
error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
&object, &pindex, &out_prot, &wired);
if (error) {
error = EFAULT;
break;
}
VM_OBJECT_LOCK(object);
while ((m = vm_page_lookup(object, pindex)) == NULL &&
!writing &&
(backing_object = object->backing_object) != NULL) {
/*
* Allow fallback to backing objects if we are reading.
*/
VM_OBJECT_LOCK(backing_object);
pindex += OFF_TO_IDX(object->backing_object_offset);
VM_OBJECT_UNLOCK(object);
object = backing_object;
}
if (writing && m != NULL) {
vm_page_dirty(m);
vm_pager_page_unswapped(m);
}
VM_OBJECT_UNLOCK(object);
if (m == NULL) {
vm_map_lookup_done(tmap, out_entry);
error = EFAULT;
break;
}
/*
* Hold the page in memory.
*/
vm_page_lock(m);
vm_page_hold(m);
vm_page_unlock(m);
/*
* We're done with tmap now.
*/
vm_map_lookup_done(tmap, out_entry);
/*
* Now do the i/o move.
*/
error = uiomove_fromphys(&m, page_offset, len, uio);
/* Make the I-cache coherent for breakpoints. */
if (!error && writing && (out_prot & VM_PROT_EXECUTE))
vm_sync_icache(map, uva, len);
if (writing && error == 0) {
vm_map_lock_read(map);
if (vm_map_check_protection(map, pageno, pageno +
PAGE_SIZE, VM_PROT_EXECUTE))
vm_sync_icache(map, uva, len);
vm_map_unlock_read(map);
}
/*
* Release the page.

View File

@ -61,6 +61,8 @@ int useracc(void *, int, int);
int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
vm_ooffset_t *);
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold);
void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);

View File

@ -201,13 +201,20 @@ unlock_and_deallocate(struct faultstate *fs)
* KERN_SUCCESS is returned if the page fault is handled; otherwise,
* a standard error specifying why the fault is fatal is returned.
*
*
* The map in question must be referenced, and remains so.
* Caller may hold no locks.
*/
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags)
int fault_flags)
{
return (vm_fault_hold(map, vaddr, fault_type, fault_flags, NULL));
}
int
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
{
vm_prot_t prot;
int is_first_object_locked, result;
@ -880,7 +887,8 @@ RetryFault:;
if (hardfault)
fs.entry->lastr = fs.pindex + faultcount - behind;
if (prot & VM_PROT_WRITE) {
if ((prot & VM_PROT_WRITE) != 0 ||
(fault_flags & VM_FAULT_DIRTY) != 0) {
vm_object_set_writeable_dirty(fs.object);
/*
@ -906,8 +914,9 @@ RetryFault:;
* Also tell the backing pager, if any, that it should remove
* any swap backing since the page is now dirty.
*/
if ((fault_type & VM_PROT_WRITE) != 0 &&
(fault_flags & VM_FAULT_CHANGE_WIRING) == 0) {
if (((fault_type & VM_PROT_WRITE) != 0 &&
(fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
(fault_flags & VM_FAULT_DIRTY) != 0) {
vm_page_dirty(fs.m);
vm_pager_page_unswapped(fs.m);
}
@ -949,6 +958,10 @@ RetryFault:;
vm_page_unwire(fs.m, 1);
} else
vm_page_activate(fs.m);
if (m_hold != NULL) {
*m_hold = fs.m;
vm_page_hold(fs.m);
}
vm_page_unlock(fs.m);
vm_page_wakeup(fs.m);

View File

@ -325,6 +325,7 @@ long vmspace_wired_count(struct vmspace *vmspace);
*/
#define VM_FAULT_NORMAL 0 /* Nothing special */
#define VM_FAULT_CHANGE_WIRING 1 /* Change the wiring as appropriate */
#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */
/*
* The following "find_space" options are supported by vm_map_find()