Make collapse synchronization more explicit and allow it to complete during paging.

Shadow objects are marked with a COLLAPSING flag while they are collapsing with
their backing object.  This gives us an explicit test rather than overloading
paging-in-progress.  While a split is ongoing, the object is marked with SPLIT.
These two operations modify the swap tree, so they must be serialized;
swap_pager_getpages() can now detect these conditions directly and page in
more conservatively.
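
As a rough illustration, condensed from the swap_pager_getpages() hunk in the
diff below (a fragment, not a complete excerpt; object, reqcount, maxahead and
maxbehind are the function's existing locals), the pager clamps clustering
while an object is in transition:

	/*
	 * Sketch: while the object is being split (or is dying), do not
	 * cluster beyond the pages the caller already xbusied, so pages
	 * being moved into another object are not re-created here.
	 */
	if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
		maxahead = reqcount - 1;
		maxbehind = 0;
	}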

Callers of vm_object_collapse() now reliably wait for a collapse to finish so
that the backing chain is as short as possible before other decisions that may
inflate the object chain, such as split or coalesce, are made.  It is now safe
to run fault concurrently with collapse.  It is safe to increase or decrease
paging-in-progress with no lock, so long as another valid reference is held on
increase.
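
For illustration, the explicit wait is implemented roughly as below (condensed
from the vm_object.c hunks in the diff that follows): callers spin on the
OBJ_COLLAPSING flag and sleep on the paging-in-progress count, which serves as
the interlock.

	static void
	vm_object_collapse_wait(vm_object_t object)
	{

		VM_OBJECT_ASSERT_WLOCKED(object);
		while ((object->flags & OBJ_COLLAPSING) != 0) {
			vm_object_pip_wait(object, "vmcolwait");
			counter_u64_add(object_collapse_waits, 1);
		}
	}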

This change makes collapse more reliable as a secondary benefit.  The primary
benefit is making it safe to drop the object lock much earlier in fault, or to
never acquire it at all.

This was tested with a new shadow chain test script that uncovered
long-standing bugs; the script will be integrated into stress2.

Reviewed by:	kib, markj
Differential Revision:	https://reviews.freebsd.org/D22908
This commit is contained in:
Jeff Roberson 2020-01-19 18:30:23 +00:00
parent 811d05fcb7
commit 98087a066f
3 changed files with 332 additions and 282 deletions

sys/vm/swap_pager.c

@@ -974,15 +974,12 @@ swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject,
     * Destination has no swapblk and is not resident, transfer source.
     * swp_pager_meta_build() can sleep.
     */
-   vm_object_pip_add(srcobject, 1);
    VM_OBJECT_WUNLOCK(srcobject);
-   vm_object_pip_add(dstobject, 1);
    dstaddr = swp_pager_meta_build(dstobject, pindex, addr);
    KASSERT(dstaddr == SWAPBLK_NONE,
        ("Unexpected destination swapblk"));
-   vm_object_pip_wakeup(dstobject);
    VM_OBJECT_WLOCK(srcobject);
-   vm_object_pip_wakeup(srcobject);
    return (true);
 }
@@ -995,8 +992,7 @@ swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject,
  * we keep the destination's.
  *
  * This routine is allowed to sleep.  It may sleep allocating metadata
- * indirectly through swp_pager_meta_build() or if paging is still in
- * progress on the source.
+ * indirectly through swp_pager_meta_build().
  *
  * The source object contains no vm_page_t's (which is just as well)
  *
@@ -1019,18 +1015,14 @@ swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
     */
    if (destroysource && (srcobject->flags & OBJ_ANON) == 0 &&
        srcobject->handle != NULL) {
-       vm_object_pip_add(srcobject, 1);
        VM_OBJECT_WUNLOCK(srcobject);
-       vm_object_pip_add(dstobject, 1);
        VM_OBJECT_WUNLOCK(dstobject);
        sx_xlock(&sw_alloc_sx);
        TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
            pager_object_list);
        sx_xunlock(&sw_alloc_sx);
        VM_OBJECT_WLOCK(dstobject);
-       vm_object_pip_wakeup(dstobject);
        VM_OBJECT_WLOCK(srcobject);
-       vm_object_pip_wakeup(srcobject);
    }

    /*
@@ -1207,26 +1199,29 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
    reqcount = count;

-   /*
-    * Determine the final number of read-behind pages and
-    * allocate them BEFORE releasing the object lock.  Otherwise,
-    * there can be a problematic race with vm_object_split().
-    * Specifically, vm_object_split() might first transfer pages
-    * that precede ma[0] in the current object to a new object,
-    * and then this function incorrectly recreates those pages as
-    * read-behind pages in the current object.
-    */
    KASSERT(object->type == OBJT_SWAP,
        ("%s: object not swappable", __func__));
    if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
        return (VM_PAGER_FAIL);

+   KASSERT(reqcount - 1 <= maxahead,
+       ("page count %d extends beyond swap block", reqcount));
+
+   /*
+    * Do not transfer any pages other than those that are xbusied
+    * when running during a split or collapse operation.  This
+    * prevents clustering from re-creating pages which are being
+    * moved into another object.
+    */
+   if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
+       maxahead = reqcount - 1;
+       maxbehind = 0;
+   }
+
    /*
     * Clip the readahead and readbehind ranges to exclude resident pages.
     */
    if (rahead != NULL) {
-       KASSERT(reqcount - 1 <= maxahead,
-           ("page count %d extends beyond swap block", reqcount));
        *rahead = imin(*rahead, maxahead - (reqcount - 1));
        pindex = ma[reqcount - 1]->pindex;
        msucc = TAILQ_NEXT(ma[reqcount - 1], listq);

sys/vm/vm_object.c

@@ -116,8 +116,6 @@ static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
        boolean_t *eio);
 static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
        boolean_t *allclean);
-static void vm_object_qcollapse(vm_object_t object);
-static void vm_object_vndeallocate(vm_object_t object);
 static void vm_object_backing_remove(vm_object_t object);

 /*
@@ -164,12 +162,18 @@ SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
    &object_bypasses,
    "VM object bypasses");

+static counter_u64_t object_collapse_waits = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD,
+   &object_collapse_waits,
+   "Number of sleeps for collapse");
+
 static void
 counter_startup(void)
 {

    object_collapses = counter_u64_alloc(M_WAITOK);
    object_bypasses = counter_u64_alloc(M_WAITOK);
+   object_collapse_waits = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(object_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL);
@@ -376,6 +380,19 @@ vm_object_pip_wakeupn(vm_object_t object, short i)
        refcount_releasen(&object->paging_in_progress, i);
 }

+/*
+ * Atomically drop the interlock and wait for pip to drain.  This protects
+ * from sleep/wakeup races due to identity changes.  The lock is not
+ * re-acquired on return.
+ */
+static void
+vm_object_pip_sleep(vm_object_t object, char *waitid)
+{
+
+   refcount_sleep_interlock(&object->paging_in_progress,
+       &object->lock, waitid, PVM);
+}
+
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
@@ -383,8 +400,7 @@ vm_object_pip_wait(vm_object_t object, char *waitid)
    VM_OBJECT_ASSERT_WLOCKED(object);

    while (REFCOUNT_COUNT(object->paging_in_progress) > 0) {
-       VM_OBJECT_WUNLOCK(object);
-       refcount_wait(&object->paging_in_progress, waitid, PVM);
+       vm_object_pip_sleep(object, waitid);
        VM_OBJECT_WLOCK(object);
    }
 }
@@ -466,30 +482,17 @@ vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object,
    return (object);
 }

-/*
- * vm_object_reference:
- *
- * Gets another reference to the given object.  Note: OBJ_DEAD
- * objects can be referenced during final cleaning.
- */
-void
-vm_object_reference(vm_object_t object)
+static void
+vm_object_reference_vnode(vm_object_t object)
 {
    struct vnode *vp;
    u_int old;

-   if (object == NULL)
-       return;
-
    /*
-    * Many places assume exclusive access to objects with a single
-    * ref.  vm_object_collapse() in particular will directly mainpulate
-    * references for objects in this state.  vnode objects only need
-    * the lock for the first ref to reference the vnode.
+    * vnode objects need the lock for the first reference
+    * to serialize with vnode_object_deallocate().
     */
-   if (!refcount_acquire_if_gt(&object->ref_count,
-       object->type == OBJT_VNODE ? 0 : 1)) {
+   if (!refcount_acquire_if_gt(&object->ref_count, 0)) {
        VM_OBJECT_RLOCK(object);
        old = refcount_acquire(&object->ref_count);
        if (object->type == OBJT_VNODE && old == 0) {
@@ -500,6 +503,26 @@ vm_object_reference(vm_object_t object)
    }
 }

+/*
+ * vm_object_reference:
+ *
+ * Acquires a reference to the given object.
+ */
+void
+vm_object_reference(vm_object_t object)
+{
+
+   if (object == NULL)
+       return;
+
+   if (object->type == OBJT_VNODE)
+       vm_object_reference_vnode(object);
+   else
+       refcount_acquire(&object->ref_count);
+   KASSERT((object->flags & OBJ_DEAD) == 0,
+       ("vm_object_reference: Referenced dead object."));
+}
+
 /*
  * vm_object_reference_locked:
  *
@@ -516,23 +539,23 @@ vm_object_reference_locked(vm_object_t object)
    VM_OBJECT_ASSERT_LOCKED(object);
    old = refcount_acquire(&object->ref_count);
    if (object->type == OBJT_VNODE && old == 0) {
        vp = object->handle;
        vref(vp);
    }
+   KASSERT((object->flags & OBJ_DEAD) == 0,
+       ("vm_object_reference: Referenced dead object."));
 }

 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
-vm_object_vndeallocate(vm_object_t object)
+vm_object_deallocate_vnode(vm_object_t object)
 {
    struct vnode *vp = (struct vnode *) object->handle;
    bool last;

    KASSERT(object->type == OBJT_VNODE,
-       ("vm_object_vndeallocate: not a vnode object"));
-   KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
+       ("vm_object_deallocate_vnode: not a vnode object"));
+   KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp"));

    /* Object lock to protect handle lookup. */
    last = refcount_release(&object->ref_count);
@@ -548,6 +571,53 @@ vm_object_vndeallocate(vm_object_t object)
        vrele(vp);
 }

+/*
+ * We dropped a reference on an object and discovered that it had a
+ * single remaining shadow.  This is a sibling of the reference we
+ * dropped.  Attempt to collapse the sibling and backing object.
+ */
+static vm_object_t
+vm_object_deallocate_anon(vm_object_t backing_object)
+{
+   vm_object_t object;
+
+   /* Fetch the final shadow. */
+   object = LIST_FIRST(&backing_object->shadow_head);
+   KASSERT(object != NULL && backing_object->shadow_count == 1,
+       ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d",
+       backing_object->ref_count, backing_object->shadow_count));
+   KASSERT((object->flags & (OBJ_TMPFS_NODE | OBJ_ANON)) == OBJ_ANON,
+       ("invalid shadow object %p", object));
+
+   if (!VM_OBJECT_TRYWLOCK(object)) {
+       /*
+        * Prevent object from disappearing since we do not have a
+        * reference.
+        */
+       vm_object_pip_add(object, 1);
+       VM_OBJECT_WUNLOCK(backing_object);
+       VM_OBJECT_WLOCK(object);
+       vm_object_pip_wakeup(object);
+   } else
+       VM_OBJECT_WUNLOCK(backing_object);
+
+   /*
+    * Check for a collapse/terminate race with the last reference holder.
+    */
+   if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 ||
+       !refcount_acquire_if_not_zero(&object->ref_count)) {
+       VM_OBJECT_WUNLOCK(object);
+       return (NULL);
+   }
+   backing_object = object->backing_object;
+   if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0)
+       vm_object_collapse(object);
+   VM_OBJECT_WUNLOCK(object);
+
+   return (object);
+}
+
 /*
  * vm_object_deallocate:
  *
@@ -562,7 +632,7 @@ vm_object_vndeallocate(vm_object_t object)
 void
 vm_object_deallocate(vm_object_t object)
 {
-   vm_object_t robject, temp;
+   vm_object_t temp;
    bool released;

    while (object != NULL) {
@@ -583,7 +653,7 @@ vm_object_deallocate(vm_object_t object)
        if (object->type == OBJT_VNODE) {
            VM_OBJECT_RLOCK(object);
            if (object->type == OBJT_VNODE) {
-               vm_object_vndeallocate(object);
+               vm_object_deallocate_vnode(object);
                return;
            }
            VM_OBJECT_RUNLOCK(object);
@@ -594,92 +664,30 @@ vm_object_deallocate(vm_object_t object)
            ("vm_object_deallocate: object deallocated too many times: %d",
            object->type));

-       if (refcount_release(&object->ref_count))
-           goto doterm;
-       if (object->ref_count > 1) {
-           VM_OBJECT_WUNLOCK(object);
-           return;
-       } else if (object->ref_count == 1) {
-           if (object->shadow_count == 0 &&
-               (object->flags & OBJ_ANON) != 0) {
-               vm_object_set_flag(object, OBJ_ONEMAPPING);
-           } else if (object->shadow_count == 1) {
-               KASSERT((object->flags & OBJ_ANON) != 0,
-                   ("obj %p with shadow_count > 0 is not anon",
-                   object));
-               robject = LIST_FIRST(&object->shadow_head);
-               KASSERT(robject != NULL,
-                   ("vm_object_deallocate: ref_count: %d, "
-                   "shadow_count: %d", object->ref_count,
-                   object->shadow_count));
-               KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
-                   ("shadowed tmpfs v_object %p", object));
-
-               if (!VM_OBJECT_TRYWLOCK(robject)) {
-                   /*
-                    * Avoid a potential deadlock.
-                    */
-                   refcount_acquire(&object->ref_count);
-                   VM_OBJECT_WUNLOCK(object);
-                   /*
-                    * More likely than not the thread
-                    * holding robject's lock has lower
-                    * priority than the current thread.
-                    * Let the lower priority thread run.
-                    */
-                   pause("vmo_de", 1);
-                   continue;
-               }
-
-               /*
-                * Collapse object into its shadow unless its
-                * shadow is dead.  In that case, object will
-                * be deallocated by the thread that is
-                * deallocating its shadow.
-                */
-               if ((robject->flags &
-                   (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON) {
-                   refcount_acquire(&robject->ref_count);
-retry:
-                   if (REFCOUNT_COUNT(robject->paging_in_progress) > 0) {
-                       VM_OBJECT_WUNLOCK(object);
-                       vm_object_pip_wait(robject,
-                           "objde1");
-                       temp = robject->backing_object;
-                       if (object == temp) {
-                           VM_OBJECT_WLOCK(object);
-                           goto retry;
-                       }
-                   } else if (REFCOUNT_COUNT(object->paging_in_progress) > 0) {
-                       VM_OBJECT_WUNLOCK(robject);
-                       VM_OBJECT_WUNLOCK(object);
-                       refcount_wait(
-                           &object->paging_in_progress,
-                           "objde2", PVM);
-                       VM_OBJECT_WLOCK(robject);
-                       temp = robject->backing_object;
-                       if (object == temp) {
-                           VM_OBJECT_WLOCK(object);
-                           goto retry;
-                       }
-                   } else
-                       VM_OBJECT_WUNLOCK(object);
-
-                   if (robject->ref_count == 1) {
-                       refcount_release(&robject->ref_count);
-                       object = robject;
-                       goto doterm;
-                   }
-
-                   object = robject;
-                   vm_object_collapse(object);
-                   VM_OBJECT_WUNLOCK(object);
-                   continue;
-               }
-
-               VM_OBJECT_WUNLOCK(robject);
-           }
-           VM_OBJECT_WUNLOCK(object);
-           return;
+       /*
+        * If this is not the final reference to an anonymous
+        * object we may need to collapse the shadow chain.
+        */
+       if (!refcount_release(&object->ref_count)) {
+           if (object->ref_count > 1 ||
+               object->shadow_count == 0) {
+               if ((object->flags & OBJ_ANON) != 0 &&
+                   object->ref_count == 1)
+                   vm_object_set_flag(object,
+                       OBJ_ONEMAPPING);
+               VM_OBJECT_WUNLOCK(object);
+               return;
+           }
+
+           /* Handle collapsing last ref on anonymous objects. */
+           object = vm_object_deallocate_anon(object);
+           continue;
        }
-doterm:
+
+       /*
+        * Handle the final reference to an object.  We restart
+        * the loop with the backing object to avoid recursion.
+        */
        umtx_shm_object_terminated(object);
        temp = object->backing_object;
        if (temp != NULL) {
@@ -687,16 +695,11 @@ vm_object_deallocate(vm_object_t object)
            ("shadowed tmpfs v_object 2 %p", object));
            vm_object_backing_remove(object);
        }
-       /*
-        * Don't double-terminate, we could be in a termination
-        * recursion due to the terminate having to sync data
-        * to disk.
-        */
-       if ((object->flags & OBJ_DEAD) == 0) {
-           vm_object_set_flag(object, OBJ_DEAD);
-           vm_object_terminate(object);
-       } else
-           VM_OBJECT_WUNLOCK(object);
+
+       KASSERT((object->flags & OBJ_DEAD) == 0,
+           ("vm_object_deallocate: Terminating dead object."));
+       vm_object_set_flag(object, OBJ_DEAD);
+       vm_object_terminate(object);
        object = temp;
    }
 }
@@ -734,6 +737,9 @@ vm_object_backing_remove_locked(vm_object_t object)
    VM_OBJECT_ASSERT_WLOCKED(object);
    VM_OBJECT_ASSERT_WLOCKED(backing_object);

+   KASSERT((object->flags & OBJ_COLLAPSING) == 0,
+       ("vm_object_backing_remove: Removing collapsing object."));
+
    if ((object->flags & OBJ_SHADOWLIST) != 0) {
        LIST_REMOVE(object, shadow_list);
        backing_object->shadow_count--;
@@ -788,6 +794,98 @@ vm_object_backing_insert(vm_object_t object, vm_object_t backing_object)
        object->backing_object = backing_object;
 }

+/*
+ * Insert an object into a backing_object's shadow list with an additional
+ * reference to the backing_object added.
+ */
+static void
+vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object)
+{
+
+   VM_OBJECT_ASSERT_WLOCKED(object);
+   if ((backing_object->flags & OBJ_ANON) != 0) {
+       VM_OBJECT_WLOCK(backing_object);
+       KASSERT((backing_object->flags & OBJ_DEAD) == 0,
+           ("shadowing dead anonymous object"));
+       vm_object_reference_locked(backing_object);
+       vm_object_backing_insert_locked(object, backing_object);
+       vm_object_clear_flag(backing_object, OBJ_ONEMAPPING);
+       VM_OBJECT_WUNLOCK(backing_object);
+   } else {
+       vm_object_reference(backing_object);
+       object->backing_object = backing_object;
+   }
+}
+
+/*
+ * Transfer a backing reference from backing_object to object.
+ */
+static void
+vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object)
+{
+   vm_object_t new_backing_object;
+
+   /*
+    * Note that the reference to backing_object->backing_object
+    * moves from within backing_object to within object.
+    */
+   vm_object_backing_remove_locked(object);
+   new_backing_object = backing_object->backing_object;
+   if (new_backing_object == NULL)
+       return;
+   if ((new_backing_object->flags & OBJ_ANON) != 0) {
+       VM_OBJECT_WLOCK(new_backing_object);
+       vm_object_backing_remove_locked(backing_object);
+       vm_object_backing_insert_locked(object, new_backing_object);
+       VM_OBJECT_WUNLOCK(new_backing_object);
+   } else {
+       object->backing_object = new_backing_object;
+       backing_object->backing_object = NULL;
+   }
+}
+
+/*
+ * Wait for a concurrent collapse to settle.
+ */
+static void
+vm_object_collapse_wait(vm_object_t object)
+{
+
+   VM_OBJECT_ASSERT_WLOCKED(object);
+   while ((object->flags & OBJ_COLLAPSING) != 0) {
+       vm_object_pip_wait(object, "vmcolwait");
+       counter_u64_add(object_collapse_waits, 1);
+   }
+}
+
+/*
+ * Waits for a backing object to clear a pending collapse and returns
+ * it locked if it is an ANON object.
+ */
+static vm_object_t
+vm_object_backing_collapse_wait(vm_object_t object)
+{
+   vm_object_t backing_object;
+
+   VM_OBJECT_ASSERT_WLOCKED(object);
+   for (;;) {
+       backing_object = object->backing_object;
+       if (backing_object == NULL ||
+           (backing_object->flags & OBJ_ANON) == 0)
+           return (NULL);
+       VM_OBJECT_WLOCK(backing_object);
+       if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0)
+           break;
+       VM_OBJECT_WUNLOCK(object);
+       vm_object_pip_sleep(backing_object, "vmbckwait");
+       counter_u64_add(object_collapse_waits, 1);
+       VM_OBJECT_WLOCK(object);
+   }
+   return (backing_object);
+}
+
 /*
  * vm_object_terminate_pages removes any remaining pageable pages
@@ -843,9 +941,14 @@
 void
 vm_object_terminate(vm_object_t object)
 {
    VM_OBJECT_ASSERT_WLOCKED(object);
+
    KASSERT((object->flags & OBJ_DEAD) != 0,
        ("terminating non-dead obj %p", object));
+   KASSERT((object->flags & OBJ_COLLAPSING) == 0,
+       ("terminating collapsing obj %p", object));
+   KASSERT(object->backing_object == NULL,
+       ("terminating shadow obj %p", object));

    /*
     * wait for the pageout daemon to be done with the object
@@ -853,11 +956,11 @@ vm_object_terminate(vm_object_t object)
    vm_object_pip_wait(object, "objtrm");
    KASSERT(!REFCOUNT_COUNT(object->paging_in_progress),
        ("vm_object_terminate: pageout in progress"));

    KASSERT(object->ref_count == 0,
        ("vm_object_terminate: object with references, ref_count=%d",
        object->ref_count));

    if ((object->flags & OBJ_PG_DTOR) == 0)
        vm_object_terminate_pages(object);
@@ -1397,11 +1500,13 @@ void
 vm_object_split(vm_map_entry_t entry)
 {
    vm_page_t m, m_next;
-   vm_object_t orig_object, new_object, source;
+   vm_object_t orig_object, new_object, backing_object;
    vm_pindex_t idx, offidxstart;
    vm_size_t size;

    orig_object = entry->object.vm_object;
+   KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0,
+       ("vm_object_split: Splitting object with multiple mappings."));
    if ((orig_object->flags & OBJ_ANON) == 0)
        return;
    if (orig_object->ref_count <= 1)
@@ -1418,36 +1523,26 @@ vm_object_split(vm_map_entry_t entry)
    new_object = vm_object_allocate_anon(size, orig_object,
        orig_object->cred, ptoa(size));

+   /*
+    * We must wait for the orig_object to complete any in-progress
+    * collapse so that the swap blocks are stable below.  The
+    * additional reference on backing_object by new object will
+    * prevent further collapse operations until split completes.
+    */
+   VM_OBJECT_WLOCK(orig_object);
+   vm_object_collapse_wait(orig_object);
+
    /*
     * At this point, the new object is still private, so the order in
     * which the original and new objects are locked does not matter.
     */
    VM_OBJECT_WLOCK(new_object);
-   VM_OBJECT_WLOCK(orig_object);
    new_object->domain = orig_object->domain;
-   source = orig_object->backing_object;
-   if (source != NULL) {
-       if ((source->flags & (OBJ_ANON | OBJ_DEAD)) != 0) {
-           VM_OBJECT_WLOCK(source);
-           if ((source->flags & OBJ_DEAD) != 0) {
-               VM_OBJECT_WUNLOCK(source);
-               VM_OBJECT_WUNLOCK(orig_object);
-               VM_OBJECT_WUNLOCK(new_object);
-               new_object->cred = NULL;
-               vm_object_deallocate(new_object);
-               VM_OBJECT_WLOCK(orig_object);
-               return;
-           }
-           vm_object_backing_insert_locked(new_object, source);
-           vm_object_reference_locked(source); /* for new_object */
-           vm_object_clear_flag(source, OBJ_ONEMAPPING);
-           VM_OBJECT_WUNLOCK(source);
-       } else {
-           vm_object_backing_insert(new_object, source);
-           vm_object_reference(source);
-       }
+   backing_object = orig_object->backing_object;
+   if (backing_object != NULL) {
+       vm_object_backing_insert_ref(new_object, backing_object);
        new_object->backing_object_offset =
            orig_object->backing_object_offset + entry->offset;
    }
    if (orig_object->cred != NULL) {
        crhold(orig_object->cred);
@@ -1455,6 +1550,12 @@ vm_object_split(vm_map_entry_t entry)
        ("orig_object->charge < 0"));
        orig_object->charge -= ptoa(size);
    }
+
+   /*
+    * Mark the split operation so that swap_pager_getpages() knows
+    * that the object is in transition.
+    */
+   vm_object_set_flag(orig_object, OBJ_SPLIT);
 retry:
    m = vm_page_find_least(orig_object, offidxstart);
    for (; m != NULL && (idx = m->pindex - offidxstart) < size;
@@ -1523,6 +1624,7 @@ vm_object_split(vm_map_entry_t entry)
        TAILQ_FOREACH(m, &new_object->memq, listq)
            vm_page_xunbusy(m);
    }
+   vm_object_clear_flag(orig_object, OBJ_SPLIT);
    VM_OBJECT_WUNLOCK(orig_object);
    VM_OBJECT_WUNLOCK(new_object);
    entry->object.vm_object = new_object;
@@ -1531,12 +1633,8 @@ vm_object_split(vm_map_entry_t entry)
    VM_OBJECT_WLOCK(new_object);
 }

-#define OBSC_COLLAPSE_NOWAIT   0x0002
-#define OBSC_COLLAPSE_WAIT 0x0004
-
 static vm_page_t
-vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
-    int op)
+vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p)
 {
    vm_object_t backing_object;
@@ -1546,8 +1644,6 @@ vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
    KASSERT(p == NULL || p->object == object || p->object == backing_object,
        ("invalid ownership %p %p %p", p, object, backing_object));
-   if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
-       return (next);

    /* The page is only NULL when rename fails. */
    if (p == NULL) {
        VM_OBJECT_WUNLOCK(object);
@@ -1632,8 +1728,8 @@ vm_object_scan_all_shadowed(vm_object_t object)
    return (true);
 }

-static bool
-vm_object_collapse_scan(vm_object_t object, int op)
+static void
+vm_object_collapse_scan(vm_object_t object)
 {
    vm_object_t backing_object;
    vm_page_t next, p, pp;
@@ -1645,12 +1741,6 @@ vm_object_collapse_scan(vm_object_t object, int op)
    backing_object = object->backing_object;
    backing_offset_index = OFF_TO_IDX(object->backing_object_offset);

-   /*
-    * Initial conditions
-    */
-   if ((op & OBSC_COLLAPSE_WAIT) != 0)
-       vm_object_set_flag(backing_object, OBJ_DEAD);
-
    /*
     * Our scan
     */
@@ -1662,12 +1752,16 @@
         * Check for busy page
         */
        if (vm_page_tryxbusy(p) == 0) {
-           next = vm_object_collapse_scan_wait(object, p, next, op);
+           next = vm_object_collapse_scan_wait(object, p);
            continue;
        }

+       KASSERT(object->backing_object == backing_object,
+           ("vm_object_collapse_scan: backing object mismatch %p != %p",
+           object->backing_object, backing_object));
        KASSERT(p->object == backing_object,
-           ("vm_object_collapse_scan: object mismatch"));
+           ("vm_object_collapse_scan: object mismatch %p != %p",
+           p->object, backing_object));

        if (p->pindex < backing_offset_index ||
            new_pindex >= object->size) {
@@ -1689,16 +1783,9 @@
             * The page in the parent is busy and possibly not
             * (yet) valid.  Until its state is finalized by the
             * busy bit owner, we can't tell whether it shadows the
-            * original page.  Therefore, we must either skip it
-            * and the original (backing_object) page or wait for
-            * its state to be finalized.
-            *
-            * This is due to a race with vm_fault() where we must
-            * unbusy the original (backing_obj) page before we can
-            * (re)lock the parent.  Hence we can get here.
+            * original page.
             */
-           next = vm_object_collapse_scan_wait(object, pp, next,
-               op);
+           next = vm_object_collapse_scan_wait(object, pp);
            continue;
        }
@@ -1742,10 +1829,7 @@
         */
        if (vm_page_rename(p, object, new_pindex)) {
            vm_page_xunbusy(p);
-           if (pp != NULL)
-               vm_page_xunbusy(pp);
-           next = vm_object_collapse_scan_wait(object, NULL, next,
-               op);
+           next = vm_object_collapse_scan_wait(object, NULL);
            continue;
        }
@@ -1763,27 +1847,7 @@
 #endif
        vm_page_xunbusy(p);
    }
-   return (true);
-}
-
-/*
- * this version of collapse allows the operation to occur earlier and
- * when paging_in_progress is true for an object...  This is not a complete
- * operation, but should plug 99.9% of the rest of the leaks.
- */
-static void
-vm_object_qcollapse(vm_object_t object)
-{
-   vm_object_t backing_object = object->backing_object;
-
-   VM_OBJECT_ASSERT_WLOCKED(object);
-   VM_OBJECT_ASSERT_WLOCKED(backing_object);
-
-   if (backing_object->ref_count != 1)
-       return;
-
-   vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
+   return;
 }

 /*
@@ -1801,53 +1865,48 @@ vm_object_collapse(vm_object_t object)
    VM_OBJECT_ASSERT_WLOCKED(object);

    while (TRUE) {
+       KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON,
+           ("collapsing invalid object"));
+
        /*
-        * Verify that the conditions are right for collapse:
-        *
-        * The object exists and the backing object exists.
+        * Wait for the backing_object to finish any pending
+        * collapse so that the caller sees the shortest possible
+        * shadow chain.
         */
-       if ((backing_object = object->backing_object) == NULL)
-           break;
+       backing_object = vm_object_backing_collapse_wait(object);
+       if (backing_object == NULL)
+           return;
+
+       KASSERT(object->ref_count > 0 &&
+           object->ref_count > object->shadow_count,
+           ("collapse with invalid ref %d or shadow %d count.",
+           object->ref_count, object->shadow_count));
+       KASSERT((backing_object->flags &
+           (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
+           ("vm_object_collapse: Backing object already collapsing."));
+       KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
+           ("vm_object_collapse: object is already collapsing."));

        /*
-        * we check the backing object first, because it is most likely
-        * not collapsable.
-        */
-       if ((backing_object->flags & OBJ_ANON) == 0)
-           break;
-
-       VM_OBJECT_WLOCK(backing_object);
-       if ((backing_object->flags & OBJ_DEAD) != 0 ||
-           (object->flags & (OBJ_DEAD | OBJ_ANON)) != OBJ_ANON) {
-           VM_OBJECT_WUNLOCK(backing_object);
-           break;
-       }
-
-       if (REFCOUNT_COUNT(object->paging_in_progress) > 0 ||
-           REFCOUNT_COUNT(backing_object->paging_in_progress) > 0) {
-           vm_object_qcollapse(object);
-           VM_OBJECT_WUNLOCK(backing_object);
-           break;
-       }
-
-       /*
-        * We know that we can either collapse the backing object (if
-        * the parent is the only reference to it) or (perhaps) have
+        * We know that we can either collapse the backing object if
+        * the parent is the only reference to it, or (perhaps) have
         * the parent bypass the object if the parent happens to shadow
         * all the resident pages in the entire backing object.
-        *
-        * This is ignoring pager-backed pages such as swap pages.
-        * vm_object_collapse_scan fails the shadowing test in this
-        * case.
         */
        if (backing_object->ref_count == 1) {
+           KASSERT(backing_object->shadow_count == 1,
+               ("vm_object_collapse: shadow_count: %d",
+               backing_object->shadow_count));
            vm_object_pip_add(object, 1);
+           vm_object_set_flag(object, OBJ_COLLAPSING);
            vm_object_pip_add(backing_object, 1);
+           vm_object_set_flag(backing_object, OBJ_DEAD);

            /*
             * If there is exactly one reference to the backing
             * object, we can collapse it into the parent.
             */
-           vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
+           vm_object_collapse_scan(object);

 #if VM_NRESERVLEVEL > 0
            /*
@@ -1866,31 +1925,24 @@
             * the backing_object's and object's locks are
             * released and reacquired.
             * Since swap_pager_copy() is being asked to
-            * destroy the source, it will change the
-            * backing_object's type to OBJT_DEFAULT.
+            * destroy backing_object, it will change the
+            * type to OBJT_DEFAULT.
             */
            swap_pager_copy(
                backing_object,
                object,
                OFF_TO_IDX(object->backing_object_offset), TRUE);
            }

            /*
             * Object now shadows whatever backing_object did.
-            * Note that the reference to
-            * backing_object->backing_object moves from within
-            * backing_object to within object.
             */
-           vm_object_backing_remove_locked(object);
-           new_backing_object = backing_object->backing_object;
-           if (new_backing_object != NULL) {
-               VM_OBJECT_WLOCK(new_backing_object);
-               vm_object_backing_remove_locked(backing_object);
-               vm_object_backing_insert_locked(object,
-                   new_backing_object);
-               VM_OBJECT_WUNLOCK(new_backing_object);
-           }
+           vm_object_clear_flag(object, OBJ_COLLAPSING);
+           vm_object_backing_transfer(object, backing_object);
            object->backing_object_offset +=
                backing_object->backing_object_offset;
+           VM_OBJECT_WUNLOCK(object);
+           vm_object_pip_wakeup(object);

            /*
             * Discard backing_object.
@@ -1903,17 +1955,17 @@
            "backing_object %p was somehow re-referenced during collapse!",
                backing_object));
            vm_object_pip_wakeup(backing_object);
-           backing_object->type = OBJT_DEAD;
-           refcount_release(&backing_object->ref_count);
-           VM_OBJECT_WUNLOCK(backing_object);
-           vm_object_destroy(backing_object);
-
-           vm_object_pip_wakeup(object);
+           (void)refcount_release(&backing_object->ref_count);
+           vm_object_terminate(backing_object);
            counter_u64_add(object_collapses, 1);
+           VM_OBJECT_WLOCK(object);
        } else {
            /*
             * If we do not entirely shadow the backing object,
             * there is nothing we can do so we give up.
+            *
+            * The object lock and backing_object lock must not
+            * be dropped during this sequence.
             */
            if (!vm_object_scan_all_shadowed(object)) {
                VM_OBJECT_WUNLOCK(backing_object);
@@ -1926,21 +1978,22 @@
             * it, since its reference count is at least 2.
             */
            vm_object_backing_remove_locked(object);
            new_backing_object = backing_object->backing_object;
            if (new_backing_object != NULL) {
-               vm_object_backing_insert(object,
+               vm_object_backing_insert_ref(object,
                    new_backing_object);
-               vm_object_reference(new_backing_object);
                object->backing_object_offset +=
                    backing_object->backing_object_offset;
            }

            /*
             * Drop the reference count on backing_object.  Since
             * its ref_count was at least 2, it will not vanish.
             */
-           refcount_release(&backing_object->ref_count);
+           (void)refcount_release(&backing_object->ref_count);
+           KASSERT(backing_object->ref_count >= 1, (
+"backing_object %p was somehow dereferenced during collapse!",
+               backing_object));
            VM_OBJECT_WUNLOCK(backing_object);
            counter_u64_add(object_bypasses, 1);
@@ -2155,7 +2208,7 @@ vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
    VM_OBJECT_WLOCK(prev_object);
    /*
-    * Try to collapse the object first
+    * Try to collapse the object first.
     */
    vm_object_collapse(prev_object);
sys/vm/vm_object.h

@@ -190,6 +190,8 @@ struct vm_object {
 #define OBJ_SIZEVNLOCK 0x0040  /* lock vnode to check obj size */
 #define OBJ_PG_DTOR    0x0080  /* dont reset object, leave that for dtor */
 #define OBJ_TMPFS_NODE 0x0200  /* object belongs to tmpfs VREG node */
+#define OBJ_SPLIT      0x0400  /* object is being split */
+#define OBJ_COLLAPSING 0x0800  /* Parent of collapse. */
 #define OBJ_COLORED    0x1000  /* pg_color is defined */
 #define OBJ_ONEMAPPING 0x2000  /* One USE (a single, non-forked) mapping flag */
 #define OBJ_SHADOWLIST 0x4000  /* Object is on the shadow list. */