Revert r253939:

We cannot busy a page before doing pagefaults.
In fact, doing so can deadlock against the vnode lock, as the fault
path tries to vget() the vnode. Other functions currently use the
opposite lock ordering; vm_object_sync(), for example, acquires the
vnode lock first and then sleeps on the busy mechanism.

Before this patch is reapplied, this ordering needs to be broken.

Sponsored by:	EMC / Isilon storage division
Reported by:	kib
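
The reverted ordering is a classic ABBA deadlock: one thread takes the
page-busy state and then wants the vnode lock, while another takes the
vnode lock and then waits on the busy page. The sketch below is a
minimal user-space model of that inversion, not kernel code: plain
pthread mutexes stand in for the busy mechanism and the vnode lock, and
fault_path()/sync_path() are hypothetical names.

/*
 * Model of the lock-order reversal: fault_path() mimics the r253939
 * fault path (busy the page, then acquire the vnode), sync_path()
 * mimics vm_object_sync() (vnode first, then wait on busy).
 * Compile with: cc -o abba abba.c -lpthread.  Deadlocks by design.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t page_busy = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t vnode_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
fault_path(void *arg)
{

	pthread_mutex_lock(&page_busy);		/* ~ busy the page */
	sleep(1);				/* widen the race window */
	pthread_mutex_lock(&vnode_lock);	/* ~ vget(); blocks forever */
	puts("fault path done");		/* never reached */
	pthread_mutex_unlock(&vnode_lock);
	pthread_mutex_unlock(&page_busy);
	return (NULL);
}

static void *
sync_path(void *arg)
{

	pthread_mutex_lock(&vnode_lock);	/* ~ vn_lock() */
	sleep(1);
	pthread_mutex_lock(&page_busy);		/* ~ sleep on the busy page */
	puts("sync path done");			/* never reached */
	pthread_mutex_unlock(&page_busy);
	pthread_mutex_unlock(&vnode_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, fault_path, NULL);
	pthread_create(&b, NULL, sync_path, NULL);
	pthread_join(a, NULL);	/* a holds busy, waits for vnode_lock; */
	pthread_join(b, NULL);	/* b holds vnode_lock, waits for busy. */
	return (0);
}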
Author:	attilio
Date:	2013-08-05 08:55:35 +00:00
Commit:	899ab64514
Parent:	05101f7501

9 changed files with 86 additions and 50 deletions

--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -324,8 +324,7 @@ zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
 }
 
 static vm_page_t
-page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes,
-    boolean_t alloc)
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 {
 	vm_object_t obj;
 	vm_page_t pp;
@@ -347,8 +346,6 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes,
 				continue;
 			}
 		} else if (pp == NULL) {
-			if (!alloc)
-				break;
 			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
 			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
 			    VM_ALLOC_NOBUSY);
@@ -359,10 +356,8 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes,
 
 		if (pp != NULL) {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
-			vm_page_io_start(pp);
-			if (!alloc)
-				break;
 			vm_object_pip_add(obj, 1);
+			vm_page_io_start(pp);
 			pmap_remove_write(pp);
 			vm_page_clear_dirty(pp, off, nbytes);
 		}
@@ -372,12 +367,55 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes,
 }
 
 static void
-page_unbusy(vm_page_t pp, boolean_t unalloc)
+page_unbusy(vm_page_t pp)
 {
 
 	vm_page_io_finish(pp);
-	if (unalloc)
-		vm_object_pip_subtract(pp->object, 1);
+	vm_object_pip_subtract(pp->object, 1);
+}
+
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked(obj);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if ((pp->oflags & VPO_BUSY) != 0) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_sleep(pp, "zfsmwb");
+				continue;
+			}
+
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_lock(pp);
+			vm_page_hold(pp);
+			vm_page_unlock(pp);
+
+		} else
+			pp = NULL;
+		break;
+	}
+	return (pp);
+}
+
+static void
+page_unhold(vm_page_t pp)
+{
+
+	vm_page_lock(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock(pp);
 }
 
 static caddr_t
@@ -441,8 +479,7 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
 
 			zfs_vmobject_wlock(obj);
 			vm_page_undirty(pp);
-		} else if ((pp = page_busy(vp, start, off, nbytes,
-		    TRUE)) != NULL) {
+		} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 			zfs_vmobject_wunlock(obj);
 
 			va = zfs_map_page(pp, &sf);
@@ -451,7 +488,7 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock(obj);
-			page_unbusy(pp, TRUE);
+			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
@@ -561,7 +598,7 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 		vm_page_t pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
-		if (pp = page_busy(vp, start, 0, 0, FALSE)) {
+		if (pp = page_hold(vp, start)) {
 			struct sf_buf *sf;
 			caddr_t va;
 
@@ -570,7 +607,7 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 			error = uiomove(va + off, bytes, UIO_READ, uio);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
-			page_unbusy(pp, FALSE);
+			page_unhold(pp);
 		} else {
 			zfs_vmobject_wunlock(obj);
 			error = dmu_read_uio(os, zp->z_id, uio, bytes);

--- a/sys/fs/tmpfs/tmpfs_vnops.c
+++ b/sys/fs/tmpfs/tmpfs_vnops.c
@@ -485,13 +485,13 @@ tmpfs_nocacheread(vm_object_t tobj, vm_pindex_t idx,
 		vm_page_zero_invalid(m, TRUE);
 		vm_page_wakeup(m);
 	}
-	vm_page_io_start(m);
+	vm_page_lock(m);
+	vm_page_hold(m);
+	vm_page_unlock(m);
 	VM_OBJECT_WUNLOCK(tobj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
-	VM_OBJECT_WLOCK(tobj);
-	vm_page_io_finish(m);
-	VM_OBJECT_WUNLOCK(tobj);
 	vm_page_lock(m);
+	vm_page_unhold(m);
 	if (m->queue == PQ_NONE) {
 		vm_page_deactivate(m);
 	} else {
@@ -602,14 +602,16 @@ tmpfs_mappedwrite(vm_object_t tobj, size_t len, struct uio *uio)
 		vm_page_zero_invalid(tpg, TRUE);
 		vm_page_wakeup(tpg);
 	}
-	vm_page_io_start(tpg);
+	vm_page_lock(tpg);
+	vm_page_hold(tpg);
+	vm_page_unlock(tpg);
 	VM_OBJECT_WUNLOCK(tobj);
 	error = uiomove_fromphys(&tpg, offset, tlen, uio);
 	VM_OBJECT_WLOCK(tobj);
-	vm_page_io_finish(tpg);
 	if (error == 0)
 		vm_page_dirty(tpg);
 	vm_page_lock(tpg);
+	vm_page_unhold(tpg);
 	if (tpg->queue == PQ_NONE) {
 		vm_page_deactivate(tpg);
 	} else {

--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -378,7 +378,7 @@ __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 		off = offset - trunc_page(offset);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
 		    end - start);
-		vm_imgact_unmap_page(object, sf);
+		vm_imgact_unmap_page(sf);
 		if (error) {
 			return (KERN_FAILURE);
 		}
@@ -433,7 +433,7 @@ __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 			sz = PAGE_SIZE - off;
 			error = copyout((caddr_t)sf_buf_kva(sf) + off,
 			    (caddr_t)start, sz);
-			vm_imgact_unmap_page(object, sf);
+			vm_imgact_unmap_page(sf);
 			if (error) {
 				return (KERN_FAILURE);
 			}
@@ -553,7 +553,7 @@ __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
 		    trunc_page(offset + filsz);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off,
 		    (caddr_t)map_addr, copy_len);
-		vm_imgact_unmap_page(object, sf);
+		vm_imgact_unmap_page(sf);
 		if (error) {
 			return (error);
 		}

--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -973,7 +973,7 @@ exec_map_first_page(imgp)
 		vm_page_wakeup(ma[0]);
 	}
 	vm_page_lock(ma[0]);
-	vm_page_wire(ma[0]);
+	vm_page_hold(ma[0]);
 	vm_page_unlock(ma[0]);
 	VM_OBJECT_WUNLOCK(object);
 
@@ -994,7 +994,7 @@ exec_unmap_first_page(imgp)
 		sf_buf_free(imgp->firstpage);
 		imgp->firstpage = NULL;
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unhold(m);
 		vm_page_unlock(m);
 	}
 }

--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -263,7 +263,6 @@ proc_rwmem(struct proc *p, struct uio *uio)
 	writing = uio->uio_rw == UIO_WRITE;
 	reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
 	fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
-	fault_flags |= VM_FAULT_IOBUSY;
 
 	/*
 	 * Only map in one page at a time.  We don't have to, but it
@@ -288,9 +287,9 @@ proc_rwmem(struct proc *p, struct uio *uio)
 		len = min(PAGE_SIZE - page_offset, uio->uio_resid);
 
 		/*
-		 * Fault and busy the page on behalf of the process.
+		 * Fault and hold the page on behalf of the process.
 		 */
-		error = vm_fault_handle(map, pageno, reqprot, fault_flags, &m);
+		error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
 		if (error != KERN_SUCCESS) {
 			if (error == KERN_RESOURCE_SHORTAGE)
 				error = ENOMEM;
@@ -316,9 +315,9 @@ proc_rwmem(struct proc *p, struct uio *uio)
 		/*
 		 * Release the page.
 		 */
-		VM_OBJECT_WLOCK(m->object);
-		vm_page_io_finish(m);
-		VM_OBJECT_WUNLOCK(m->object);
+		vm_page_lock(m);
+		vm_page_unhold(m);
+		vm_page_unlock(m);
 
 	} while (error == 0 && uio->uio_resid > 0);
 

--- a/sys/vm/vm_extern.h
+++ b/sys/vm/vm_extern.h
@@ -63,7 +63,7 @@ void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
     vm_ooffset_t *);
 int vm_fault_disable_pagefaults(void);
 void vm_fault_enable_pagefaults(int save);
-int vm_fault_handle(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
@@ -87,7 +87,7 @@ void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
 struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
-void vm_imgact_unmap_page(vm_object_t, struct sf_buf *sf);
+void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
 int vm_mlock(struct proc *, struct ucred *, const void *, size_t);

--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -221,8 +221,8 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
 		ktrfault(vaddr, fault_type);
 #endif
-	result = vm_fault_handle(map, trunc_page(vaddr), fault_type,
-	    fault_flags, NULL);
+	result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
+	    NULL);
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
 		ktrfaultend(result);
@@ -231,7 +231,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 }
 
 int
-vm_fault_handle(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
 	vm_prot_t prot;
@@ -943,10 +943,7 @@ RetryFault:;
 		vm_page_activate(fs.m);
 	if (m_hold != NULL) {
 		*m_hold = fs.m;
-		if (fault_flags & VM_FAULT_IOBUSY)
-			vm_page_io_start(fs.m);
-		else
-			vm_page_hold(fs.m);
+		vm_page_hold(fs.m);
 	}
 	vm_page_unlock(fs.m);
 	vm_page_wakeup(fs.m);
@@ -1148,7 +1145,7 @@ vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
 	 * and hold these pages.
 	 */
 	for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
-		if (*mp == NULL && vm_fault_handle(map, va, prot,
+		if (*mp == NULL && vm_fault_hold(map, va, prot,
 		    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
 			goto error;
 }

--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -223,7 +223,7 @@ vsunlock(void *addr, size_t len)
  * Return the pinned page if successful; otherwise, return NULL.
  */
 static vm_page_t
-vm_imgact_page_iostart(vm_object_t object, vm_ooffset_t offset)
+vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m, ma[1];
 	vm_pindex_t pindex;
@@ -249,7 +249,9 @@ vm_imgact_page_iostart(vm_object_t object, vm_ooffset_t offset)
 		}
 		vm_page_wakeup(m);
 	}
-	vm_page_io_start(m);
+	vm_page_lock(m);
+	vm_page_hold(m);
+	vm_page_unlock(m);
 out:
 	VM_OBJECT_WUNLOCK(object);
 	return (m);
@@ -264,7 +266,7 @@ vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 
-	m = vm_imgact_page_iostart(object, offset);
+	m = vm_imgact_hold_page(object, offset);
 	if (m == NULL)
 		return (NULL);
 	sched_pin();
@@ -275,16 +277,16 @@ vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
  * Destroy the given CPU private mapping and unpin the page that it mapped.
  */
 void
-vm_imgact_unmap_page(vm_object_t object, struct sf_buf *sf)
+vm_imgact_unmap_page(struct sf_buf *sf)
 {
 	vm_page_t m;
 
 	m = sf_buf_page(sf);
 	sf_buf_free(sf);
 	sched_unpin();
-	VM_OBJECT_WLOCK(object);
-	vm_page_io_finish(m);
-	VM_OBJECT_WUNLOCK(object);
+	vm_page_lock(m);
+	vm_page_unhold(m);
+	vm_page_unlock(m);
 }
 
 void
void

--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -329,7 +329,6 @@ long vmspace_resident_count(struct vmspace *vmspace);
 #define	VM_FAULT_NORMAL	0		/* Nothing special */
 #define	VM_FAULT_CHANGE_WIRING	1	/* Change the wiring as appropriate */
 #define	VM_FAULT_DIRTY	2		/* Dirty the page; use w/VM_PROT_COPY */
-#define	VM_FAULT_IOBUSY	4		/* Busy the faulted page */
 
 /*
  * Initially, mappings are slightly sequential.  The maximum window size must