Generalize lazy dequeue logic for wired pages.

Some recent work aims to remove the use of the page lock for
synchronizing updates to page queue state.  This change adds a mechanism
to preserve the existing behaviour of lazily dequeuing wired pages,
which was previously synchronized using the page lock.

Handle this by setting PGA_DEQUEUE when a managed page's wire count
transitions from 0 to 1.  When the page daemon encounters a page with a
flag in PGA_QUEUE_OP_MASK set, it creates a batch queue entry for that
page, but in so doing it does not modify the page itself and thus racing
with a concurrent free of the page is harmless.  The flag is advisory;
the page daemon still checks for wirings after acquiring the object and
page xbusy locks.

vm_page_unwire_managed() now clears PGA_DEQUEUE on a 1->0 transition.
It must do this before dropping the reference to avoid a use-after-free
but also handles races with concurrent wirings to ensure that
PGA_DEQUEUE is not left unset on a wired page.

Reviewed by:	jeff
Tested by:	pho
Sponsored by:	Netflix, Intel
Differential Revision:	https://reviews.freebsd.org/D22882
This commit is contained in:
Mark Johnston 2019-12-28 19:03:46 +00:00
parent f3f38e2580
commit b7f30bff2f
2 changed files with 73 additions and 99 deletions

View File

@ -3530,8 +3530,6 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
KASSERT((m->oflags & VPO_UNMANAGED) == 0, KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("page %p is unmanaged", m)); ("page %p is unmanaged", m));
KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL,
("missing synchronization for page %p", m));
KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
domain = vm_phys_domain(m); domain = vm_phys_domain(m);
@ -3904,8 +3902,11 @@ vm_page_wire(vm_page_t m)
old = atomic_fetchadd_int(&m->ref_count, 1); old = atomic_fetchadd_int(&m->ref_count, 1);
KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX,
("vm_page_wire: counter overflow for page %p", m)); ("vm_page_wire: counter overflow for page %p", m));
if (VPRC_WIRE_COUNT(old) == 0) if (VPRC_WIRE_COUNT(old) == 0) {
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_aflag_set(m, PGA_DEQUEUE);
vm_wire_add(1); vm_wire_add(1);
}
} }
/* /*
@ -3928,8 +3929,11 @@ vm_page_wire_mapped(vm_page_t m)
return (false); return (false);
} while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1));
if (VPRC_WIRE_COUNT(old) == 0) if (VPRC_WIRE_COUNT(old) == 0) {
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_aflag_set(m, PGA_DEQUEUE);
vm_wire_add(1); vm_wire_add(1);
}
return (true); return (true);
} }
@ -3942,37 +3946,43 @@ static void
vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse) vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
{ {
u_int old; u_int old;
bool locked;
KASSERT((m->oflags & VPO_UNMANAGED) == 0, KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("%s: page %p is unmanaged", __func__, m)); ("%s: page %p is unmanaged", __func__, m));
/* /*
* Update LRU state before releasing the wiring reference. * Update LRU state before releasing the wiring reference.
* We only need to do this once since we hold the page lock.
* Use a release store when updating the reference count to * Use a release store when updating the reference count to
* synchronize with vm_page_free_prep(). * synchronize with vm_page_free_prep().
*/ */
old = m->ref_count; old = m->ref_count;
locked = false;
do { do {
KASSERT(VPRC_WIRE_COUNT(old) > 0, KASSERT(VPRC_WIRE_COUNT(old) > 0,
("vm_page_unwire: wire count underflow for page %p", m)); ("vm_page_unwire: wire count underflow for page %p", m));
if (!locked && VPRC_WIRE_COUNT(old) == 1) {
vm_page_lock(m); if (old > VPRC_OBJREF + 1) {
locked = true; /*
* The page has at least one other wiring reference. An
* earlier iteration of this loop may have called
* vm_page_release_toq() and cleared PGA_DEQUEUE, so
* re-set it if necessary.
*/
if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
vm_page_aflag_set(m, PGA_DEQUEUE);
} else if (old == VPRC_OBJREF + 1) {
/*
* This is the last wiring. Clear PGA_DEQUEUE and
* update the page's queue state to reflect the
* reference. If the page does not belong to an object
* (i.e., the VPRC_OBJREF bit is clear), we only need to
* clear leftover queue state.
*/
vm_page_release_toq(m, nqueue, false); vm_page_release_toq(m, nqueue, false);
} else if (old == 1) {
vm_page_aflag_clear(m, PGA_DEQUEUE);
} }
} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));
/*
* Release the lock only after the wiring is released, to ensure that
* the page daemon does not encounter and dequeue the page while it is
* still wired.
*/
if (locked)
vm_page_unlock(m);
if (VPRC_WIRE_COUNT(old) == 1) { if (VPRC_WIRE_COUNT(old) == 1) {
vm_wire_sub(1); vm_wire_sub(1);
if (old == 1) if (old == 1)
@ -4026,6 +4036,8 @@ vm_page_unwire_noq(vm_page_t m)
if (VPRC_WIRE_COUNT(old) > 1) if (VPRC_WIRE_COUNT(old) > 1)
return (false); return (false);
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_aflag_clear(m, PGA_DEQUEUE);
vm_wire_sub(1); vm_wire_sub(1);
return (true); return (true);
} }
@ -4035,10 +4047,6 @@ vm_page_unwire_noq(vm_page_t m)
* active or being moved to the active queue, ensure that its act_count is * active or being moved to the active queue, ensure that its act_count is
* at least ACT_INIT but do not otherwise mess with it. * at least ACT_INIT but do not otherwise mess with it.
* *
* The page may be wired. The caller should release its wiring reference
* before releasing the page lock, otherwise the page daemon may immediately
* dequeue the page.
*
* A managed page must be locked. * A managed page must be locked.
*/ */
static __always_inline void static __always_inline void
@ -4046,16 +4054,18 @@ vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
{ {
vm_page_astate_t old, new; vm_page_astate_t old, new;
vm_page_assert_locked(m);
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("%s: page %p is unmanaged", __func__, m));
KASSERT(m->ref_count > 0, KASSERT(m->ref_count > 0,
("%s: page %p does not carry any references", __func__, m)); ("%s: page %p does not carry any references", __func__, m));
KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
("%s: invalid flags %x", __func__, nflag)); ("%s: invalid flags %x", __func__, nflag));
if ((m->oflags & VPO_UNMANAGED) != 0)
return;
old = vm_page_astate_load(m); old = vm_page_astate_load(m);
do { do {
if ((old.flags & PGA_DEQUEUE) != 0)
break;
new = old; new = old;
if (nqueue == PQ_ACTIVE) if (nqueue == PQ_ACTIVE)
new.act_count = max(old.act_count, ACT_INIT); new.act_count = max(old.act_count, ACT_INIT);
@ -4076,8 +4086,6 @@ void
vm_page_activate(vm_page_t m) vm_page_activate(vm_page_t m)
{ {
if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
return;
vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
} }
@ -4089,8 +4097,6 @@ void
vm_page_deactivate(vm_page_t m) vm_page_deactivate(vm_page_t m)
{ {
if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
return;
vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
} }
@ -4098,11 +4104,6 @@ void
vm_page_deactivate_noreuse(vm_page_t m) vm_page_deactivate_noreuse(vm_page_t m)
{ {
KASSERT(m->object != NULL,
("vm_page_deactivate_noreuse: page %p has no object", m));
if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
return;
vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
} }
@ -4113,8 +4114,6 @@ void
vm_page_launder(vm_page_t m) vm_page_launder(vm_page_t m)
{ {
if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
return;
vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
} }
@ -4142,8 +4141,6 @@ vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
vm_page_astate_t old, new; vm_page_astate_t old, new;
uint16_t nflag; uint16_t nflag;
vm_page_assert_locked(m);
/* /*
* Use a check of the valid bits to determine whether we should * Use a check of the valid bits to determine whether we should
* accelerate reclamation of the page. The object lock might not be * accelerate reclamation of the page. The object lock might not be

View File

@ -317,6 +317,26 @@ vm_pageout_next(struct scan_state *ss, const bool dequeue)
return (vm_batchqueue_pop(&ss->bq)); return (vm_batchqueue_pop(&ss->bq));
} }
/*
* Determine whether processing of a page should be deferred and ensure that any
* outstanding queue operations are processed.
*/
static __always_inline bool
vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued)
{
vm_page_astate_t as;
as = vm_page_astate_load(m);
if (__predict_false(as.queue != queue ||
((as.flags & PGA_ENQUEUED) != 0) != enqueued))
return (true);
if ((as.flags & PGA_QUEUE_OP_MASK) != 0) {
vm_page_pqbatch_submit(m, queue);
return (true);
}
return (false);
}
/* /*
* Scan for pages at adjacent offsets within the given page's object that are * Scan for pages at adjacent offsets within the given page's object that are
* eligible for laundering, form a cluster of these pages and the given page, * eligible for laundering, form a cluster of these pages and the given page,
@ -755,35 +775,14 @@ scan:
recheck: recheck:
/* /*
* The page may have been disassociated from the queue * Don't touch a page that was removed from the queue after the
* or even freed while locks were dropped. We thus must be * page queue lock was released. Otherwise, ensure that any
* careful whenever modifying page state. Once the object lock * pending queue operations, such as dequeues for wired pages,
* has been acquired, we have a stable reference to the page. * are handled.
*/ */
if (vm_page_queue(m) != queue) if (vm_pageout_defer(m, queue, true))
continue; continue;
/*
* A requeue was requested, so this page gets a second
* chance.
*/
if ((m->a.flags & PGA_REQUEUE) != 0) {
vm_page_pqbatch_submit(m, queue);
continue;
}
/*
* Wired pages may not be freed. Complete their removal
* from the queue now to avoid needless revisits during
* future scans. This check is racy and must be reverified once
* we hold the object lock and have verified that the page
* is not busy.
*/
if (vm_page_wired(m)) {
vm_page_dequeue_deferred(m);
continue;
}
if (object != m->object) { if (object != m->object) {
if (object != NULL) if (object != NULL)
VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(object);
@ -813,9 +812,9 @@ recheck:
continue; continue;
/* /*
* Re-check for wirings now that we hold the object lock and * Check for wirings now that we hold the object lock and have
* have verified that the page is unbusied. If the page is * verified that the page is unbusied. If the page is mapped,
* mapped, it may still be wired by pmap lookups. The call to * it may still be wired by pmap lookups. The call to
* vm_page_try_remove_all() below atomically checks for such * vm_page_try_remove_all() below atomically checks for such
* wirings and removes mappings. If the page is unmapped, the * wirings and removes mappings. If the page is unmapped, the
* wire count is guaranteed not to increase. * wire count is guaranteed not to increase.
@ -1259,22 +1258,14 @@ act_scan:
vm_page_change_lock(m, &mtx); vm_page_change_lock(m, &mtx);
/* /*
* The page may have been disassociated from the queue * Don't touch a page that was removed from the queue after the
* or even freed while locks were dropped. We thus must be * page queue lock was released. Otherwise, ensure that any
* careful whenever modifying page state. Once the object lock * pending queue operations, such as dequeues for wired pages,
* has been acquired, we have a stable reference to the page. * are handled.
*/ */
if (vm_page_queue(m) != PQ_ACTIVE) if (vm_pageout_defer(m, PQ_ACTIVE, true))
continue; continue;
/*
* Wired pages are dequeued lazily.
*/
if (vm_page_wired(m)) {
vm_page_dequeue_deferred(m);
continue;
}
/* /*
* A page's object pointer may be set to NULL before * A page's object pointer may be set to NULL before
* the object lock is acquired. * the object lock is acquired.
@ -1515,28 +1506,14 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
recheck: recheck:
/* /*
* The page may have been disassociated from the queue * Don't touch a page that was removed from the queue after the
* or even freed while locks were dropped. We thus must be * page queue lock was released. Otherwise, ensure that any
* careful whenever modifying page state. Once the object lock * pending queue operations, such as dequeues for wired pages,
* has been acquired, we have a stable reference to the page. * are handled.
*/ */
old = vm_page_astate_load(m); if (vm_pageout_defer(m, PQ_INACTIVE, false))
if (old.queue != PQ_INACTIVE ||
(old.flags & PGA_QUEUE_STATE_MASK) != 0)
continue; continue;
/*
* Wired pages may not be freed. Complete their removal
* from the queue now to avoid needless revisits during
* future scans. This check is racy and must be reverified once
* we hold the object lock and have verified that the page
* is not busy.
*/
if (vm_page_wired(m)) {
vm_page_dequeue_deferred(m);
continue;
}
if (object != m->object) { if (object != m->object) {
if (object != NULL) if (object != NULL)
VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(object);