Introduce vm_fault_hold() and use it to (1) eliminate a long-standing race
condition in proc_rwmem() and to (2) simplify the implementation of the cxgb
driver's vm_fault_hold_user_pages().  Specifically, in proc_rwmem() the
requested read or write could fail because the targeted page could be
reclaimed between the calls to vm_fault() and vm_page_hold().

In collaboration with:	kib@
MFC after:	6 weeks
This commit is contained in:
parent 091c4c86d1
commit acd11c7499
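The race closed here comes from doing the fault and the hold as two separate steps: nothing stops the page daemon from reclaiming the page after vm_fault() returns and before vm_page_hold() (or pmap_extract_and_hold()) runs. The sketch below condenses the before/after pattern from the hunks that follow; it is an illustration only, not the literal committed code, and the helper function names are invented for the example.

/* Sketch only; hold_user_page_old()/hold_user_page_new() are invented names. */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

/* Old pattern: fault, then look the page up and hold it in a second step. */
static int
hold_user_page_old(vm_map_t map, pmap_t pmap, vm_offset_t va, vm_prot_t prot,
    vm_page_t *mp)
{

	if (vm_fault(map, va, prot, VM_FAULT_NORMAL) != KERN_SUCCESS)
		return (EFAULT);
	/*
	 * Window: the page may be reclaimed here, after the fault but
	 * before it is held, so the lookup below can come back NULL.
	 */
	*mp = pmap_extract_and_hold(pmap, va, prot);
	return (*mp != NULL ? 0 : EFAULT);
}

/* New pattern: the fault handler returns with the page already held. */
static int
hold_user_page_new(vm_map_t map, vm_offset_t va, vm_prot_t prot, vm_page_t *mp)
{

	if (vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
		return (EFAULT);
	return (0);
}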
@@ -66,7 +66,7 @@ vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
     int count, vm_prot_t prot)
 {
 	vm_offset_t end, va;
-	int faults, rv;
+	int faults;
 	pmap_t pmap;
 	vm_page_t m, *pages;
 
@@ -122,20 +122,11 @@ vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
 	/*
 	 * Pages either have insufficient permissions or are not present
 	 * trigger a fault where neccessary
-	 *
 	 */
-	rv = 0;
 	for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) {
-		/*
-		 * Account for a very narrow race where the page may be
-		 * taken away from us before it is held
-		 */
-		while (*pages == NULL) {
-			rv = vm_fault(map, va, prot, VM_FAULT_NORMAL);
-			if (rv)
-				goto error;
-			*pages = pmap_extract_and_hold(pmap, va, prot);
-		}
+		if (*pages == NULL && vm_fault_hold(map, va, prot,
+		    VM_FAULT_NORMAL, pages) != KERN_SUCCESS)
+			goto error;
 	}
 	return (0);
 error:
@@ -237,10 +237,9 @@ int
 proc_rwmem(struct proc *p, struct uio *uio)
 {
 	vm_map_t map;
-	vm_object_t backing_object, object;
 	vm_offset_t pageno;		/* page number */
 	vm_prot_t reqprot;
-	int error, writing;
+	int error, fault_flags, page_offset, writing;
 
 	/*
 	 * Assert that someone has locked this vmspace.  (Should be
@@ -255,26 +254,24 @@ proc_rwmem(struct proc *p, struct uio *uio)
 	 */
 	map = &p->p_vmspace->vm_map;
 
+	/*
+	 * If we are writing, then we request vm_fault() to create a private
+	 * copy of each page.  Since these copies will not be writeable by the
+	 * process, we must explicity request that they be dirtied.
+	 */
 	writing = uio->uio_rw == UIO_WRITE;
 	reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
+	fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
 
 	/*
 	 * Only map in one page at a time.  We don't have to, but it
 	 * makes things easier.  This way is trivial - right?
 	 */
 	do {
-		vm_map_t tmap;
 		vm_offset_t uva;
-		int page_offset;	/* offset into page */
-		vm_map_entry_t out_entry;
-		vm_prot_t out_prot;
-		boolean_t wired;
-		vm_pindex_t pindex;
 		u_int len;
 		vm_page_t m;
 
-		object = NULL;
-
 		uva = (vm_offset_t)uio->uio_offset;
 
 		/*
@@ -289,10 +286,10 @@ proc_rwmem(struct proc *p, struct uio *uio)
 		len = min(PAGE_SIZE - page_offset, uio->uio_resid);
 
 		/*
-		 * Fault the page on behalf of the process
+		 * Fault and hold the page on behalf of the process.
 		 */
-		error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
-		if (error) {
+		error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
+		if (error != KERN_SUCCESS) {
 			if (error == KERN_RESOURCE_SHORTAGE)
 				error = ENOMEM;
 			else
@@ -300,62 +297,19 @@ proc_rwmem(struct proc *p, struct uio *uio)
 			break;
 		}
 
-		/*
-		 * Now we need to get the page.  out_entry and wired
-		 * aren't used.  One would think the vm code
-		 * would be a *bit* nicer...  We use tmap because
-		 * vm_map_lookup() can change the map argument.
-		 */
-		tmap = map;
-		error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
-		    &object, &pindex, &out_prot, &wired);
-		if (error) {
-			error = EFAULT;
-			break;
-		}
-		VM_OBJECT_LOCK(object);
-		while ((m = vm_page_lookup(object, pindex)) == NULL &&
-		    !writing &&
-		    (backing_object = object->backing_object) != NULL) {
-			/*
-			 * Allow fallback to backing objects if we are reading.
-			 */
-			VM_OBJECT_LOCK(backing_object);
-			pindex += OFF_TO_IDX(object->backing_object_offset);
-			VM_OBJECT_UNLOCK(object);
-			object = backing_object;
-		}
-		if (writing && m != NULL) {
-			vm_page_dirty(m);
-			vm_pager_page_unswapped(m);
-		}
-		VM_OBJECT_UNLOCK(object);
-		if (m == NULL) {
-			vm_map_lookup_done(tmap, out_entry);
-			error = EFAULT;
-			break;
-		}
-
-		/*
-		 * Hold the page in memory.
-		 */
-		vm_page_lock(m);
-		vm_page_hold(m);
-		vm_page_unlock(m);
-
-		/*
-		 * We're done with tmap now.
-		 */
-		vm_map_lookup_done(tmap, out_entry);
-
 		/*
 		 * Now do the i/o move.
 		 */
 		error = uiomove_fromphys(&m, page_offset, len, uio);
 
 		/* Make the I-cache coherent for breakpoints. */
-		if (!error && writing && (out_prot & VM_PROT_EXECUTE))
-			vm_sync_icache(map, uva, len);
+		if (writing && error == 0) {
+			vm_map_lock_read(map);
+			if (vm_map_check_protection(map, pageno, pageno +
+			    PAGE_SIZE, VM_PROT_EXECUTE))
+				vm_sync_icache(map, uva, len);
+			vm_map_unlock_read(map);
+		}
 
 		/*
 		 * Release the page.
@@ -61,6 +61,8 @@ int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
     vm_ooffset_t *);
+int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+    int fault_flags, vm_page_t *m_hold);
 void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
@@ -201,13 +201,20 @@ unlock_and_deallocate(struct faultstate *fs)
  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
  *	a standard error specifying why the fault is fatal is returned.
  *
- *
  *	The map in question must be referenced, and remains so.
  *	Caller may hold no locks.
  */
 int
 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
-    int fault_flags)
+    int fault_flags)
+{
+
+	return (vm_fault_hold(map, vaddr, fault_type, fault_flags, NULL));
+}
+
+int
+vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+    int fault_flags, vm_page_t *m_hold)
 {
 	vm_prot_t prot;
 	int is_first_object_locked, result;
@@ -880,7 +887,8 @@ RetryFault:;
 	if (hardfault)
 		fs.entry->lastr = fs.pindex + faultcount - behind;
 
-	if (prot & VM_PROT_WRITE) {
+	if ((prot & VM_PROT_WRITE) != 0 ||
+	    (fault_flags & VM_FAULT_DIRTY) != 0) {
 		vm_object_set_writeable_dirty(fs.object);
 
 		/*
@@ -906,8 +914,9 @@ RetryFault:;
 	 * Also tell the backing pager, if any, that it should remove
 	 * any swap backing since the page is now dirty.
 	 */
-	if ((fault_type & VM_PROT_WRITE) != 0 &&
-	    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) {
+	if (((fault_type & VM_PROT_WRITE) != 0 &&
+	    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
+	    (fault_flags & VM_FAULT_DIRTY) != 0) {
 		vm_page_dirty(fs.m);
 		vm_pager_page_unswapped(fs.m);
 	}
@@ -949,6 +958,10 @@ RetryFault:;
 		vm_page_unwire(fs.m, 1);
 	} else
 		vm_page_activate(fs.m);
+	if (m_hold != NULL) {
+		*m_hold = fs.m;
+		vm_page_hold(fs.m);
+	}
 	vm_page_unlock(fs.m);
 	vm_page_wakeup(fs.m);
 
@@ -325,6 +325,7 @@ long vmspace_wired_count(struct vmspace *vmspace);
  */
 #define	VM_FAULT_NORMAL 0		/* Nothing special */
 #define	VM_FAULT_CHANGE_WIRING 1	/* Change the wiring as appropriate */
+#define	VM_FAULT_DIRTY 2		/* Dirty the page; use w/VM_PROT_COPY */
 
 /*
  * The following "find_space" options are supported by vm_map_find()
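For reference, the new VM_FAULT_DIRTY flag is intended to be paired with VM_PROT_COPY, which is exactly what the reworked proc_rwmem() does: a debugger-style write requests a private, already-dirtied copy of the page, and the held page is then handed to uiomove_fromphys(). The fragment below condenses that flow from the hunks above into one invented helper, rw_one_page(); it assumes the same kernel includes as the sketch near the top plus <sys/systm.h> and <sys/uio.h>, and the unhold sequence at the end is an assumption about the unchanged "Release the page" code that follows the last proc_rwmem() hunk.

/* Sketch only; rw_one_page() is an invented name, not committed code. */
static int
rw_one_page(vm_map_t map, struct uio *uio)
{
	vm_offset_t pageno, uva;
	vm_prot_t reqprot;
	vm_page_t m;
	u_int len;
	int error, fault_flags, page_offset, writing;

	uva = (vm_offset_t)uio->uio_offset;
	pageno = trunc_page(uva);
	page_offset = uva - pageno;
	len = min(PAGE_SIZE - page_offset, uio->uio_resid);

	writing = uio->uio_rw == UIO_WRITE;
	reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
	fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;

	/* Fault and hold the target page in a single step. */
	error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
	if (error != KERN_SUCCESS)
		return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT);

	/* Copy between the held page and the caller's buffer. */
	error = uiomove_fromphys(&m, page_offset, len, uio);

	/* Drop the hold once the I/O is done (assumed release sequence). */
	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
	return (error);
}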