The page hold mechanism is fast but it has couple of fallouts:

- It does not let pages respect the LRU policy
- It bloats the active/inactive queues of few pages

Try to avoid it as much as possible with the long-term target to
completely remove it.
Use the soft-busy mechanism to protect page content accesses during
short-term operations (like uiomove_fromphys()).

After this change only vm_fault_quick_hold_pages() is still using the
hold mechanism for page content access.
There is an additional complexity there as the quick path cannot
immediately access the page object to busy the page and the slow path
cannot however busy more than one page a time (to avoid deadlocks).

Fixing such primitive can bring to complete removal of the page hold
mechanism.

Sponsored by:	EMC / Isilon storage division
Discussed with:	alc
Reviewed by:	jeff
Tested by:	pho
This commit is contained in:
Attilio Rao 2013-08-04 21:07:24 +00:00
parent b9fdaa9b19
commit 3b6714cacb
9 changed files with 50 additions and 86 deletions

View File

@ -324,7 +324,8 @@ zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
}
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes,
boolean_t alloc)
{
vm_object_t obj;
vm_page_t pp;
@ -346,6 +347,8 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
continue;
}
} else if (pp == NULL) {
if (!alloc)
break;
pp = vm_page_alloc(obj, OFF_TO_IDX(start),
VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
VM_ALLOC_NOBUSY);
@ -356,8 +359,10 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
if (pp != NULL) {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_object_pip_add(obj, 1);
vm_page_io_start(pp);
if (!alloc)
break;
vm_object_pip_add(obj, 1);
pmap_remove_write(pp);
vm_page_clear_dirty(pp, off, nbytes);
}
@ -367,55 +372,12 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
}
static void
page_unbusy(vm_page_t pp)
page_unbusy(vm_page_t pp, boolean_t unalloc)
{
vm_page_io_finish(pp);
vm_object_pip_subtract(pp->object, 1);
}
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
vm_object_t obj;
vm_page_t pp;
obj = vp->v_object;
zfs_vmobject_assert_wlocked(obj);
for (;;) {
if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
pp->valid) {
if ((pp->oflags & VPO_BUSY) != 0) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
* likely to reclaim it.
*/
vm_page_reference(pp);
vm_page_sleep(pp, "zfsmwb");
continue;
}
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_page_lock(pp);
vm_page_hold(pp);
vm_page_unlock(pp);
} else
pp = NULL;
break;
}
return (pp);
}
static void
page_unhold(vm_page_t pp)
{
vm_page_lock(pp);
vm_page_unhold(pp);
vm_page_unlock(pp);
if (unalloc)
vm_object_pip_subtract(pp->object, 1);
}
static caddr_t
@ -479,7 +441,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
zfs_vmobject_wlock(obj);
vm_page_undirty(pp);
} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
} else if ((pp = page_busy(vp, start, off, nbytes,
TRUE)) != NULL) {
zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
@ -488,7 +451,7 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
page_unbusy(pp);
page_unbusy(pp, TRUE);
}
len -= nbytes;
off = 0;
@ -598,7 +561,7 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
vm_page_t pp;
uint64_t bytes = MIN(PAGESIZE - off, len);
if (pp = page_hold(vp, start)) {
if (pp = page_busy(vp, start, 0, 0, FALSE)) {
struct sf_buf *sf;
caddr_t va;
@ -607,7 +570,7 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
error = uiomove(va + off, bytes, UIO_READ, uio);
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
page_unhold(pp);
page_unbusy(pp, FALSE);
} else {
zfs_vmobject_wunlock(obj);
error = dmu_read_uio(os, zp->z_id, uio, bytes);

View File

@ -485,13 +485,13 @@ tmpfs_nocacheread(vm_object_t tobj, vm_pindex_t idx,
vm_page_zero_invalid(m, TRUE);
vm_page_wakeup(m);
}
vm_page_lock(m);
vm_page_hold(m);
vm_page_unlock(m);
vm_page_io_start(m);
VM_OBJECT_WUNLOCK(tobj);
error = uiomove_fromphys(&m, offset, tlen, uio);
VM_OBJECT_WLOCK(tobj);
vm_page_io_finish(m);
VM_OBJECT_WUNLOCK(tobj);
vm_page_lock(m);
vm_page_unhold(m);
if (m->queue == PQ_NONE) {
vm_page_deactivate(m);
} else {
@ -602,16 +602,14 @@ tmpfs_mappedwrite(vm_object_t tobj, size_t len, struct uio *uio)
vm_page_zero_invalid(tpg, TRUE);
vm_page_wakeup(tpg);
}
vm_page_lock(tpg);
vm_page_hold(tpg);
vm_page_unlock(tpg);
vm_page_io_start(tpg);
VM_OBJECT_WUNLOCK(tobj);
error = uiomove_fromphys(&tpg, offset, tlen, uio);
VM_OBJECT_WLOCK(tobj);
vm_page_io_finish(tpg);
if (error == 0)
vm_page_dirty(tpg);
vm_page_lock(tpg);
vm_page_unhold(tpg);
if (tpg->queue == PQ_NONE) {
vm_page_deactivate(tpg);
} else {

View File

@ -378,7 +378,7 @@ __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
off = offset - trunc_page(offset);
error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
end - start);
vm_imgact_unmap_page(sf);
vm_imgact_unmap_page(object, sf);
if (error) {
return (KERN_FAILURE);
}
@ -433,7 +433,7 @@ __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
sz = PAGE_SIZE - off;
error = copyout((caddr_t)sf_buf_kva(sf) + off,
(caddr_t)start, sz);
vm_imgact_unmap_page(sf);
vm_imgact_unmap_page(object, sf);
if (error) {
return (KERN_FAILURE);
}
@ -553,7 +553,7 @@ __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
trunc_page(offset + filsz);
error = copyout((caddr_t)sf_buf_kva(sf) + off,
(caddr_t)map_addr, copy_len);
vm_imgact_unmap_page(sf);
vm_imgact_unmap_page(object, sf);
if (error) {
return (error);
}

View File

@ -973,7 +973,7 @@ exec_map_first_page(imgp)
vm_page_wakeup(ma[0]);
}
vm_page_lock(ma[0]);
vm_page_hold(ma[0]);
vm_page_wire(ma[0]);
vm_page_unlock(ma[0]);
VM_OBJECT_WUNLOCK(object);
@ -994,7 +994,7 @@ exec_unmap_first_page(imgp)
sf_buf_free(imgp->firstpage);
imgp->firstpage = NULL;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unwire(m, 0);
vm_page_unlock(m);
}
}

View File

@ -263,6 +263,7 @@ proc_rwmem(struct proc *p, struct uio *uio)
writing = uio->uio_rw == UIO_WRITE;
reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
fault_flags |= VM_FAULT_IOBUSY;
/*
* Only map in one page at a time. We don't have to, but it
@ -287,9 +288,9 @@ proc_rwmem(struct proc *p, struct uio *uio)
len = min(PAGE_SIZE - page_offset, uio->uio_resid);
/*
* Fault and hold the page on behalf of the process.
* Fault and busy the page on behalf of the process.
*/
error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
error = vm_fault_handle(map, pageno, reqprot, fault_flags, &m);
if (error != KERN_SUCCESS) {
if (error == KERN_RESOURCE_SHORTAGE)
error = ENOMEM;
@ -315,9 +316,9 @@ proc_rwmem(struct proc *p, struct uio *uio)
/*
* Release the page.
*/
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
VM_OBJECT_WLOCK(m->object);
vm_page_io_finish(m);
VM_OBJECT_WUNLOCK(m->object);
} while (error == 0 && uio->uio_resid > 0);

View File

@ -63,7 +63,7 @@ void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
vm_ooffset_t *);
int vm_fault_disable_pagefaults(void);
void vm_fault_enable_pagefaults(int save);
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int vm_fault_handle(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold);
int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count);
@ -87,7 +87,7 @@ void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
int vslock(void *, size_t);
void vsunlock(void *, size_t);
struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
void vm_imgact_unmap_page(struct sf_buf *sf);
void vm_imgact_unmap_page(vm_object_t, struct sf_buf *sf);
void vm_thread_dispose(struct thread *td);
int vm_thread_new(struct thread *td, int pages);
int vm_mlock(struct proc *, struct ucred *, const void *, size_t);

View File

@ -221,8 +221,8 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
ktrfault(vaddr, fault_type);
#endif
result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
NULL);
result = vm_fault_handle(map, trunc_page(vaddr), fault_type,
fault_flags, NULL);
#ifdef KTRACE
if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
ktrfaultend(result);
@ -231,7 +231,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
}
int
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
vm_fault_handle(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
{
vm_prot_t prot;
@ -943,7 +943,10 @@ vnode_locked:
vm_page_activate(fs.m);
if (m_hold != NULL) {
*m_hold = fs.m;
vm_page_hold(fs.m);
if (fault_flags & VM_FAULT_IOBUSY)
vm_page_io_start(fs.m);
else
vm_page_hold(fs.m);
}
vm_page_unlock(fs.m);
vm_page_wakeup(fs.m);
@ -1145,7 +1148,7 @@ vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
* and hold these pages.
*/
for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
if (*mp == NULL && vm_fault_hold(map, va, prot,
if (*mp == NULL && vm_fault_handle(map, va, prot,
VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
goto error;
}

View File

@ -223,7 +223,7 @@ vsunlock(void *addr, size_t len)
* Return the pinned page if successful; otherwise, return NULL.
*/
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
vm_imgact_page_iostart(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m, ma[1];
vm_pindex_t pindex;
@ -249,9 +249,7 @@ vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
}
vm_page_wakeup(m);
}
vm_page_lock(m);
vm_page_hold(m);
vm_page_unlock(m);
vm_page_io_start(m);
out:
VM_OBJECT_WUNLOCK(object);
return (m);
@ -266,7 +264,7 @@ vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m;
m = vm_imgact_hold_page(object, offset);
m = vm_imgact_page_iostart(object, offset);
if (m == NULL)
return (NULL);
sched_pin();
@ -277,16 +275,16 @@ vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
* Destroy the given CPU private mapping and unpin the page that it mapped.
*/
void
vm_imgact_unmap_page(struct sf_buf *sf)
vm_imgact_unmap_page(vm_object_t object, struct sf_buf *sf)
{
vm_page_t m;
m = sf_buf_page(sf);
sf_buf_free(sf);
sched_unpin();
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
VM_OBJECT_WLOCK(object);
vm_page_io_finish(m);
VM_OBJECT_WUNLOCK(object);
}
void

View File

@ -329,6 +329,7 @@ long vmspace_resident_count(struct vmspace *vmspace);
#define VM_FAULT_NORMAL 0 /* Nothing special */
#define VM_FAULT_CHANGE_WIRING 1 /* Change the wiring as appropriate */
#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */
#define VM_FAULT_IOBUSY 4 /* Busy the faulted page */
/*
* Initially, mappings are slightly sequential. The maximum window size must