Generalize lazy dequeue logic for wired pages.

Some recent work aims to remove the use of the page lock for synchronizing updates to page queue state. This change adds a mechanism to preserve the existing behaviour of lazily dequeuing wired pages, which was previously synchronized using the page lock. Handle this by setting PGA_DEQUEUE when a managed page's wire count transitions from 0 to 1. When the page daemon encounters a page with a flag in PGA_QUEUE_OP_MASK set, it creates a batch queue entry for that page, but in so doing it does not modify the page itself and thus racing with a concurrent free of the page is harmless. The flag is advisory; the page daemon still checks for wirings after acquiring the object and page xbusy locks. vm_page_unwire_managed() now clears PGA_DEQUEUE on a 1->0 transition. It must do this before dropping the reference to avoid a use-after-free but also handles races with concurrent wirings to ensure that PGA_DEQUEUE is not left unset on a wired page. Reviewed by: jeff Tested by: pho Sponsored by: Netflix, Intel Differential Revision: https://reviews.freebsd.org/D22882
2019-12-28 19:03:46 +00:00 · 2019-12-28 19:03:46 +00:00 · b7f30bff2f
commit b7f30bff2f
parent f3f38e2580
2 changed files with 73 additions and 99 deletions
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@ -3530,8 +3530,6 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("page %p is unmanaged", m));
 	KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL,
 	    ("missing synchronization for page %p", m));
 	KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
 	domain = vm_phys_domain(m);
@ -3904,8 +3902,11 @@ vm_page_wire(vm_page_t m)
 	old = atomic_fetchadd_int(&m->ref_count, 1);
 	KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX,
 	    ("vm_page_wire: counter overflow for page %p", m));
-	if (VPRC_WIRE_COUNT(old) == 0)
+	if (VPRC_WIRE_COUNT(old) == 0) {
 		if ((m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_set(m, PGA_DEQUEUE);
 		vm_wire_add(1);
 	}
 }
 /*
@ -3928,8 +3929,11 @@ vm_page_wire_mapped(vm_page_t m)
 			return (false);
 	} while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1));
-	if (VPRC_WIRE_COUNT(old) == 0)
+	if (VPRC_WIRE_COUNT(old) == 0) {
 		if ((m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_set(m, PGA_DEQUEUE);
 		vm_wire_add(1);
 	}
 	return (true);
 }
@ -3942,37 +3946,43 @@ static void
 vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
 {
 	u_int old;
 	bool locked;
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("%s: page %p is unmanaged", __func__, m));
 	/*
 	 * Update LRU state before releasing the wiring reference.
 	 * We only need to do this once since we hold the page lock.
 	 * Use a release store when updating the reference count to
 	 * synchronize with vm_page_free_prep().
 	 */
 	old = m->ref_count;
 	locked = false;
 	do {
 		KASSERT(VPRC_WIRE_COUNT(old) > 0,
 		    ("vm_page_unwire: wire count underflow for page %p", m));
-		if (!locked && VPRC_WIRE_COUNT(old) == 1) {
+
-			vm_page_lock(m);
+		if (old > VPRC_OBJREF + 1) {
-			locked = true;
+			/*
 			 * The page has at least one other wiring reference.  An
 			 * earlier iteration of this loop may have called
 			 * vm_page_release_toq() and cleared PGA_DEQUEUE, so
 			 * re-set it if necessary.
 			 */
 			if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
 				vm_page_aflag_set(m, PGA_DEQUEUE);
 		} else if (old == VPRC_OBJREF + 1) {
 			/*
 			 * This is the last wiring.  Clear PGA_DEQUEUE and
 			 * update the page's queue state to reflect the
 			 * reference.  If the page does not belong to an object
 			 * (i.e., the VPRC_OBJREF bit is clear), we only need to
 			 * clear leftover queue state.
 			 */
 			vm_page_release_toq(m, nqueue, false);
 		} else if (old == 1) {
 			vm_page_aflag_clear(m, PGA_DEQUEUE);
 		}
 	} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));
 	/*
 	 * Release the lock only after the wiring is released, to ensure that
 	 * the page daemon does not encounter and dequeue the page while it is
 	 * still wired.
 	 */
 	if (locked)
 		vm_page_unlock(m);
 	if (VPRC_WIRE_COUNT(old) == 1) {
 		vm_wire_sub(1);
 		if (old == 1)
@ -4026,6 +4036,8 @@ vm_page_unwire_noq(vm_page_t m)
 	if (VPRC_WIRE_COUNT(old) > 1)
 		return (false);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_aflag_clear(m, PGA_DEQUEUE);
 	vm_wire_sub(1);
 	return (true);
 }
@ -4035,10 +4047,6 @@ vm_page_unwire_noq(vm_page_t m)
 * active or being moved to the active queue, ensure that its act_count is
 * at least ACT_INIT but do not otherwise mess with it.
 *
 * The page may be wired.  The caller should release its wiring reference
 * before releasing the page lock, otherwise the page daemon may immediately
 * dequeue the page.
 *
 * A managed page must be locked.
 */
 static __always_inline void
@ -4046,16 +4054,18 @@ vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
 {
 	vm_page_astate_t old, new;
 	vm_page_assert_locked(m);
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("%s: page %p is unmanaged", __func__, m));
 	KASSERT(m->ref_count > 0,
 	    ("%s: page %p does not carry any references", __func__, m));
 	KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
 	    ("%s: invalid flags %x", __func__, nflag));
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return;
 	old = vm_page_astate_load(m);
 	do {
 		if ((old.flags & PGA_DEQUEUE) != 0)
 			break;
 		new = old;
 		if (nqueue == PQ_ACTIVE)
 			new.act_count = max(old.act_count, ACT_INIT);
@ -4076,8 +4086,6 @@ void
 vm_page_activate(vm_page_t m)
 {
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
 		return;
 	vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
 }
@ -4089,8 +4097,6 @@ void
 vm_page_deactivate(vm_page_t m)
 {
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
 		return;
 	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
 }
@ -4098,11 +4104,6 @@ void
 vm_page_deactivate_noreuse(vm_page_t m)
 {
 	KASSERT(m->object != NULL,
 	    ("vm_page_deactivate_noreuse: page %p has no object", m));
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
 		return;
 	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
 }
@ -4113,8 +4114,6 @@ void
 vm_page_launder(vm_page_t m)
 {
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
 		return;
 	vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
 }
@ -4142,8 +4141,6 @@ vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
 	vm_page_astate_t old, new;
 	uint16_t nflag;
 	vm_page_assert_locked(m);
 	/*
 	 * Use a check of the valid bits to determine whether we should
 	 * accelerate reclamation of the page.  The object lock might not be
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@ -317,6 +317,26 @@ vm_pageout_next(struct scan_state *ss, const bool dequeue)
 	return (vm_batchqueue_pop(&ss->bq));
 }
 /*
 * Determine whether processing of a page should be deferred and ensure that any
 * outstanding queue operations are processed.
 */
 static __always_inline bool
 vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued)
 {
 	vm_page_astate_t as;
 	as = vm_page_astate_load(m);
 	if (__predict_false(as.queue != queue ||
 	    ((as.flags & PGA_ENQUEUED) != 0) != enqueued))
 		return (true);
 	if ((as.flags & PGA_QUEUE_OP_MASK) != 0) {
 		vm_page_pqbatch_submit(m, queue);
 		return (true);
 	}
 	return (false);
 }
 /*
 * Scan for pages at adjacent offsets within the given page's object that are
 * eligible for laundering, form a cluster of these pages and the given page,
@ -755,35 +775,14 @@ scan:
 recheck:
 		/*
-		 * The page may have been disassociated from the queue
+		 * Don't touch a page that was removed from the queue after the
-		 * or even freed while locks were dropped.  We thus must be
+		 * page queue lock was released.  Otherwise, ensure that any
-		 * careful whenever modifying page state.  Once the object lock
+		 * pending queue operations, such as dequeues for wired pages,
-		 * has been acquired, we have a stable reference to the page.
+		 * are handled.
 		 */
-		if (vm_page_queue(m) != queue)
+		if (vm_pageout_defer(m, queue, true))
 			continue;
 		/*
 		 * A requeue was requested, so this page gets a second
 		 * chance.
 		 */
 		if ((m->a.flags & PGA_REQUEUE) != 0) {
 			vm_page_pqbatch_submit(m, queue);
 			continue;
 		}
 		/*
 		 * Wired pages may not be freed.  Complete their removal
 		 * from the queue now to avoid needless revisits during
 		 * future scans.  This check is racy and must be reverified once
 		 * we hold the object lock and have verified that the page
 		 * is not busy.
 		 */
 		if (vm_page_wired(m)) {
 			vm_page_dequeue_deferred(m);
 			continue;
 		}
 		if (object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
@ -813,9 +812,9 @@ recheck:
 			continue;
 		/*
-		 * Re-check for wirings now that we hold the object lock and
+		 * Check for wirings now that we hold the object lock and have
-		 * have verified that the page is unbusied.  If the page is
+		 * verified that the page is unbusied.  If the page is mapped,
-		 * mapped, it may still be wired by pmap lookups.  The call to
+		 * it may still be wired by pmap lookups.  The call to
 		 * vm_page_try_remove_all() below atomically checks for such
 		 * wirings and removes mappings.  If the page is unmapped, the
 		 * wire count is guaranteed not to increase.
@ -1259,22 +1258,14 @@ act_scan:
 		vm_page_change_lock(m, &mtx);
 		/*
-		 * The page may have been disassociated from the queue
+		 * Don't touch a page that was removed from the queue after the
-		 * or even freed while locks were dropped.  We thus must be
+		 * page queue lock was released.  Otherwise, ensure that any
-		 * careful whenever modifying page state.  Once the object lock
+		 * pending queue operations, such as dequeues for wired pages,
-		 * has been acquired, we have a stable reference to the page.
+		 * are handled.
 		 */
-		if (vm_page_queue(m) != PQ_ACTIVE)
+		if (vm_pageout_defer(m, PQ_ACTIVE, true))
 			continue;
 		/*
 		 * Wired pages are dequeued lazily.
 		 */
 		if (vm_page_wired(m)) {
 			vm_page_dequeue_deferred(m);
 			continue;
 		}
 		/*
 		 * A page's object pointer may be set to NULL before
 		 * the object lock is acquired.
@ -1515,28 +1506,14 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 recheck:
 		/*
-		 * The page may have been disassociated from the queue
+		 * Don't touch a page that was removed from the queue after the
-		 * or even freed while locks were dropped.  We thus must be
+		 * page queue lock was released.  Otherwise, ensure that any
-		 * careful whenever modifying page state.  Once the object lock
+		 * pending queue operations, such as dequeues for wired pages,
-		 * has been acquired, we have a stable reference to the page.
+		 * are handled.
 		 */
-		old = vm_page_astate_load(m);
+		if (vm_pageout_defer(m, PQ_INACTIVE, false))
 		if (old.queue != PQ_INACTIVE ||
 		    (old.flags & PGA_QUEUE_STATE_MASK) != 0)
 			continue;
 		/*
 		 * Wired pages may not be freed.  Complete their removal
 		 * from the queue now to avoid needless revisits during
 		 * future scans.  This check is racy and must be reverified once
 		 * we hold the object lock and have verified that the page
 		 * is not busy.
 		 */
 		if (vm_page_wired(m)) {
 			vm_page_dequeue_deferred(m);
 			continue;
 		}
 		if (object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);