Improve VM page queue scalability.

Currently both the page lock and a page queue lock must be held in
order to enqueue, dequeue or requeue a page in a given page queue.
The queue locks are a scalability bottleneck in many workloads. This
change reduces page queue lock contention by batching queue operations.
To detangle the page and page queue locks, per-CPU batch queues are
used to reference pages with pending queue operations. The requested
operation is encoded in the page's aflags field with the page lock
held, after which the page is enqueued for a deferred batch operation.
Page queue scans are similarly optimized to minimize the amount of
work performed with a page queue lock held.

Reviewed by:	kib, jeff (previous versions)
Tested by:	pho
Sponsored by:	Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D14893
Mark Johnston 2018-04-24 21:15:54 +00:00
parent 55ba21d4fd
commit 5cd29d0f3c
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=332974
9 changed files with 905 additions and 559 deletions
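In outline, the deferred-operation scheme works as sketched below. This is a minimal userspace model only: the names (struct pq_batch, pq_submit(), OP_DEQUEUE, and so on) are invented for illustration, and the per-CPU batch lookup, critical sections and trylock handling of the real vm_pqbatch_submit_page() in the diff are omitted.

#include <pthread.h>

#define	BATCH_SIZE	7
#define	OP_DEQUEUE	0x01	/* analogous to PGA_DEQUEUE */
#define	OP_REQUEUE	0x02	/* analogous to PGA_REQUEUE */

struct page {
	unsigned int	aflags;	/* pending operation, set with the page lock held */
};

struct pagequeue {
	pthread_mutex_t	lock;	/* the contended per-queue lock */
	int		cnt;
};

struct pq_batch {
	struct page	*pages[BATCH_SIZE];
	int		cnt;
};

/* Apply one deferred operation; called with the queue lock held. */
static void
pq_process_page(struct pagequeue *pq, struct page *m)
{
	if ((m->aflags & OP_DEQUEUE) != 0)
		pq->cnt--;
	else if ((m->aflags & OP_REQUEUE) != 0)
		;	/* move the page to the queue tail (elided) */
	m->aflags = 0;
}

/* Flush an entire batch with a single queue lock acquisition. */
static void
pq_flush(struct pagequeue *pq, struct pq_batch *bq)
{
	pthread_mutex_lock(&pq->lock);
	for (int i = 0; i < bq->cnt; i++)
		pq_process_page(pq, bq->pages[i]);
	pthread_mutex_unlock(&pq->lock);
	bq->cnt = 0;
}

/*
 * Record the requested operation in the page's flags (done with only the
 * page lock held) and defer the queue update until the batch fills up.
 */
static void
pq_submit(struct pagequeue *pq, struct pq_batch *bq, struct page *m,
    unsigned int op)
{
	m->aflags |= op;
	if (bq->cnt < BATCH_SIZE) {
		bq->pages[bq->cnt++] = m;
		return;
	}
	pq_flush(pq, bq);
	bq->pages[bq->cnt++] = m;
}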


@ -227,4 +227,10 @@
#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */
/*
* Use a fairly large batch size since we expect amd64 systems to have lots of
* memory.
*/
#define VM_BATCHQUEUE_SIZE 31
#endif /* _MACHINE_VMPARAM_H_ */


@ -603,7 +603,6 @@ static struct witness_order_list_entry order_lists[] = {
* CDEV
*/
{ "vm map (system)", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
@ -613,11 +612,11 @@ static struct witness_order_list_entry order_lists[] = {
{ "vm map (user)", &lock_class_sx },
{ "vm object", &lock_class_rw },
{ "vm page", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "pmap pv global", &lock_class_rw },
{ "pmap", &lock_class_mtx_sleep },
{ "pmap pv list", &lock_class_rw },
{ "vm page free queue", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* kqueue/VFS interaction


@ -720,14 +720,11 @@ static void
vm_object_terminate_pages(vm_object_t object)
{
vm_page_t p, p_next;
struct mtx *mtx, *mtx1;
struct vm_pagequeue *pq, *pq1;
int dequeued;
struct mtx *mtx;
VM_OBJECT_ASSERT_WLOCKED(object);
mtx = NULL;
pq = NULL;
/*
* Free any remaining pageable pages. This also removes them from the
@ -737,60 +734,21 @@ vm_object_terminate_pages(vm_object_t object)
*/
TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
vm_page_assert_unbusied(p);
if ((object->flags & OBJ_UNMANAGED) == 0) {
if ((object->flags & OBJ_UNMANAGED) == 0)
/*
* vm_page_free_prep() only needs the page
* lock for managed pages.
*/
mtx1 = vm_page_lockptr(p);
if (mtx1 != mtx) {
if (mtx != NULL)
mtx_unlock(mtx);
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
pq = NULL;
}
mtx = mtx1;
mtx_lock(mtx);
}
}
vm_page_change_lock(p, &mtx);
p->object = NULL;
if (p->wire_count != 0)
goto unlist;
VM_CNT_INC(v_pfree);
p->flags &= ~PG_ZERO;
if (p->queue != PQ_NONE) {
KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
"page %p is not queued", p));
pq1 = vm_page_pagequeue(p);
if (pq != pq1) {
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
}
pq = pq1;
vm_pagequeue_lock(pq);
dequeued = 0;
}
p->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, p, plinks.q);
dequeued--;
}
if (vm_page_free_prep(p, true))
continue;
unlist:
TAILQ_REMOVE(&object->memq, p, listq);
}
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
VM_CNT_INC(v_pfree);
vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
vm_page_free_phys_pglist(&object->memq);
/*
* If the object contained any pages, then reset it to an empty state.
* None of the object's fields, including "resident_page_count", were
@ -1973,7 +1931,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
{
vm_page_t p, next;
struct mtx *mtx;
struct pglist pgl;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@ -1982,7 +1939,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
if (object->resident_page_count == 0)
return;
vm_object_pip_add(object, 1);
TAILQ_INIT(&pgl);
again:
p = vm_page_find_least(object, start);
mtx = NULL;
@ -2036,13 +1992,10 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
}
if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
pmap_remove_all(p);
p->flags &= ~PG_ZERO;
if (vm_page_free_prep(p, false))
TAILQ_INSERT_TAIL(&pgl, p, listq);
vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
vm_page_free_phys_pglist(&pgl);
vm_object_pip_wakeup(object);
}


@ -102,6 +102,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
@ -131,13 +132,10 @@ extern int uma_startup_count(int);
extern void uma_startup(void *, int);
extern int vmem_startup_count(void);
/*
* Associated with page of user-allocatable memory is a
* page structure.
*/
struct vm_domain vm_dom[MAXMEMDOM];
static DPCPU_DEFINE(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
@ -176,7 +174,8 @@ static uma_zone_t fakepg_zone;
static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(uint8_t queue, vm_page_t m);
static void vm_page_dequeue_complete(vm_page_t m);
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
@ -443,12 +442,13 @@ sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
* Nonetheless, it write busies and initializes the hold count to one as
* safety precautions.
*/
void
vm_page_init_marker(vm_page_t marker, int queue)
static void
vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags)
{
bzero(marker, sizeof(*marker));
marker->flags = PG_MARKER;
marker->aflags = aflags;
marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
marker->queue = queue;
marker->hold_count = 1;
@ -481,14 +481,32 @@ vm_page_domain_init(int domain)
TAILQ_INIT(&pq->pq_pl);
mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
MTX_DEF | MTX_DUPOK);
vm_page_init_marker(&vmd->vmd_markers[i], i);
vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
}
mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE);
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
/*
* inacthead is used to provide FIFO ordering for LRU-bypassing
* insertions.
*/
vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
&vmd->vmd_inacthead, plinks.q);
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
/*
* The clock pages are used to implement active queue scanning without
* requeues. Scans start at clock[0], which is advanced after the scan
* ends. When the two clock hands meet, they are reset and scanning
* resumes from the head of the queue.
*/
vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
&vmd->vmd_clock[0], plinks.q);
TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
&vmd->vmd_clock[1], plinks.q);
}
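The two-hand active queue scan described in the comment above can be illustrated with a simplified, single-threaded sketch using the <sys/queue.h> macros. The names here (struct entry, clock_scan()) are invented for the example, and the locking and batching performed by the real scan code are ignored.

#include <sys/queue.h>
#include <stdbool.h>
#include <stddef.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	bool		marker;		/* true for the two clock hands */
};
TAILQ_HEAD(entryq, entry);

/*
 * Scan up to maxscan real entries starting at the hand clock[0], which
 * precedes clock[1] in the queue. If the hands meet, move them back to the
 * head and tail so that the next scan resumes from the head; otherwise
 * advance clock[0] to the point where this scan stopped.
 */
static int
clock_scan(struct entryq *q, struct entry clock[2], int maxscan)
{
	struct entry *e;
	int scanned;

	scanned = 0;
	for (e = TAILQ_NEXT(&clock[0], link); e != NULL && scanned < maxscan;
	    e = TAILQ_NEXT(e, link)) {
		if (e == &clock[1]) {
			TAILQ_REMOVE(q, &clock[0], link);
			TAILQ_REMOVE(q, &clock[1], link);
			TAILQ_INSERT_HEAD(q, &clock[0], link);
			TAILQ_INSERT_TAIL(q, &clock[1], link);
			return (scanned);
		}
		if (e->marker)
			continue;
		scanned++;	/* a real scan would process the entry here */
	}

	/* Advance clock[0]; e is the first entry left unscanned, if any. */
	TAILQ_REMOVE(q, &clock[0], link);
	if (e != NULL)
		TAILQ_INSERT_BEFORE(e, &clock[0], link);
	else
		TAILQ_INSERT_HEAD(q, &clock[0], link);
	return (scanned);
}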
/*
@ -1847,6 +1865,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
KASSERT(m != NULL, ("missing page"));
found:
vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@ -2043,8 +2062,10 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
#if VM_NRESERVLEVEL > 0
found:
#endif
for (m = m_ret; m < &m_ret[npages]; m++)
for (m = m_ret; m < &m_ret[npages]; m++) {
vm_page_dequeue(m);
vm_page_alloc_check(m);
}
/*
* Initialize the pages. Only the PG_ZERO flag is inherited.
@ -2188,6 +2209,7 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req)
goto again;
return (NULL);
}
vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@ -2381,7 +2403,7 @@ vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
vm_reserv_size(level)) - pa);
#endif
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
m->queue != PQ_NONE && !vm_page_busied(m)) {
vm_page_enqueued(m) && !vm_page_busied(m)) {
/*
* The page is allocated but eligible for
* relocation. Extend the current run by one
@ -2532,7 +2554,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
error = EINVAL;
else if (object->memattr != VM_MEMATTR_DEFAULT)
error = EINVAL;
else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
else if (vm_page_enqueued(m) && !vm_page_busied(m)) {
KASSERT(pmap_page_get_memattr(m) ==
VM_MEMATTR_DEFAULT,
("page %p has an unexpected memattr", m));
@ -2592,7 +2614,8 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
*/
if (object->ref_count != 0)
pmap_remove_all(m);
m_new->aflags = m->aflags;
m_new->aflags = m->aflags &
~PGA_QUEUE_STATE_MASK;
KASSERT(m_new->oflags == VPO_UNMANAGED,
("page %p is managed", m_new));
m_new->oflags = m->oflags & VPO_NOSYNC;
@ -2604,7 +2627,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
vm_page_remque(m);
vm_page_replace_checked(m_new, object,
m->pindex, m);
if (vm_page_free_prep(m, false))
if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
@ -2618,7 +2641,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
m->flags &= ~PG_ZERO;
vm_page_remque(m);
vm_page_remove(m);
if (vm_page_free_prep(m, false))
if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
KASSERT(m->dirty == 0,
@ -3061,113 +3084,297 @@ vm_page_pagequeue(vm_page_t m)
return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]);
}
static struct mtx *
vm_page_pagequeue_lockptr(vm_page_t m)
{
if (m->queue == PQ_NONE)
return (NULL);
return (&vm_page_pagequeue(m)->pq_mutex);
}
static inline void
vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m)
{
struct vm_domain *vmd;
uint8_t aflags;
vm_pagequeue_assert_locked(pq);
KASSERT(pq == vm_page_pagequeue(m),
("page %p doesn't belong to %p", m, pq));
aflags = m->aflags;
if ((aflags & PGA_DEQUEUE) != 0) {
if (__predict_true((aflags & PGA_ENQUEUED) != 0)) {
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
}
vm_page_dequeue_complete(m);
} else if ((aflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) {
if ((aflags & PGA_ENQUEUED) != 0)
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
else {
vm_pagequeue_cnt_inc(pq);
vm_page_aflag_set(m, PGA_ENQUEUED);
}
if ((aflags & PGA_REQUEUE_HEAD) != 0) {
KASSERT(m->queue == PQ_INACTIVE,
("head enqueue not supported for page %p", m));
vmd = vm_pagequeue_domain(m);
TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
} else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
/*
* PGA_REQUEUE and PGA_REQUEUE_HEAD must be cleared after
* setting PGA_ENQUEUED in order to synchronize with the
* page daemon.
*/
vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
}
}
static void
vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
uint8_t queue)
{
vm_page_t m;
int i;
for (i = 0; i < bq->bq_cnt; i++) {
m = bq->bq_pa[i];
if (__predict_false(m->queue != queue))
continue;
vm_pqbatch_process_page(pq, m);
}
vm_batchqueue_init(bq);
}
static void
vm_pqbatch_submit_page(vm_page_t m, uint8_t queue)
{
struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
int domain;
vm_page_assert_locked(m);
KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
domain = vm_phys_domain(m);
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
critical_enter();
bq = DPCPU_PTR(pqbatch[domain][queue]);
if (vm_batchqueue_insert(bq, m)) {
critical_exit();
return;
}
if (!vm_pagequeue_trylock(pq)) {
critical_exit();
vm_pagequeue_lock(pq);
critical_enter();
bq = DPCPU_PTR(pqbatch[domain][queue]);
}
vm_pqbatch_process(pq, bq, queue);
/*
* The page may have been logically dequeued before we acquired the
* page queue lock. In this case, the page lock prevents the page
* from being logically enqueued elsewhere.
*/
if (__predict_true(m->queue == queue))
vm_pqbatch_process_page(pq, m);
else {
KASSERT(m->queue == PQ_NONE,
("invalid queue transition for page %p", m));
KASSERT((m->aflags & PGA_ENQUEUED) == 0,
("page %p is enqueued with invalid queue index", m));
vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
}
vm_pagequeue_unlock(pq);
critical_exit();
}
/*
* vm_page_dequeue:
* vm_page_drain_pqbatch: [ internal use only ]
*
* Remove the given page from its current page queue.
* Force all per-CPU page queue batch queues to be drained. This is
* intended for use in severe memory shortages, to ensure that pages
* do not remain stuck in the batch queues.
*/
void
vm_page_drain_pqbatch(void)
{
struct thread *td;
struct vm_domain *vmd;
struct vm_pagequeue *pq;
int cpu, domain, queue;
td = curthread;
CPU_FOREACH(cpu) {
thread_lock(td);
sched_bind(td, cpu);
thread_unlock(td);
for (domain = 0; domain < vm_ndomains; domain++) {
vmd = VM_DOMAIN(domain);
for (queue = 0; queue < PQ_COUNT; queue++) {
pq = &vmd->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
critical_enter();
vm_pqbatch_process(pq,
DPCPU_PTR(pqbatch[domain][queue]), queue);
critical_exit();
vm_pagequeue_unlock(pq);
}
}
}
thread_lock(td);
sched_unbind(td);
thread_unlock(td);
}
/*
* Complete the logical removal of a page from a page queue. We must be
* careful to synchronize with the page daemon, which may be concurrently
* examining the page with only the page lock held. The page must not be
* in a state where it appears to be logically enqueued.
*/
static void
vm_page_dequeue_complete(vm_page_t m)
{
m->queue = PQ_NONE;
atomic_thread_fence_rel();
vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
}
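The release fence above pairs with the acquire fence in vm_page_enqueued() and vm_pageout_page_queued() below. A standalone sketch of the pattern in C11 atomics follows; the names are hypothetical, and the kernel itself uses the atomic(9) fence primitives rather than <stdatomic.h>.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define	Q_NONE		0xff	/* stand-in for PQ_NONE */
#define	F_DEQUEUE	0x10	/* stand-in for PGA_DEQUEUE */

struct mpage {
	_Atomic uint8_t	queue;
	_Atomic uint8_t	aflags;
};

/* Writer side, with the page queue lock held (cf. vm_page_dequeue_complete). */
static void
mpage_dequeue_complete(struct mpage *m)
{
	atomic_store_explicit(&m->queue, Q_NONE, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_fetch_and_explicit(&m->aflags, (uint8_t)~F_DEQUEUE,
	    memory_order_relaxed);
}

/* Reader side, with only the page lock held (cf. vm_page_enqueued). */
static bool
mpage_enqueued(struct mpage *m)
{
	/* A pending dequeue means the page is not logically enqueued. */
	if ((atomic_load_explicit(&m->aflags, memory_order_relaxed) &
	    F_DEQUEUE) != 0)
		return (false);
	/*
	 * If the flag was observed clear because a dequeue completed, the
	 * fence pairing guarantees that the queue index reads as Q_NONE.
	 */
	atomic_thread_fence(memory_order_acquire);
	return (atomic_load_explicit(&m->queue, memory_order_relaxed) !=
	    Q_NONE);
}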
/*
* vm_page_dequeue_deferred: [ internal use only ]
*
* Request removal of the given page from its current page
* queue. Physical removal from the queue may be deferred
* indefinitely.
*
* The page must be locked.
*/
void
vm_page_dequeue(vm_page_t m)
vm_page_dequeue_deferred(vm_page_t m)
{
struct vm_pagequeue *pq;
int queue;
vm_page_assert_locked(m);
KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
m));
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
vm_pagequeue_unlock(pq);
queue = m->queue;
if (queue == PQ_NONE) {
KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
("page %p has queue state", m));
return;
}
if ((m->aflags & PGA_DEQUEUE) == 0)
vm_page_aflag_set(m, PGA_DEQUEUE);
vm_pqbatch_submit_page(m, queue);
}
/*
* vm_page_dequeue_locked:
*
* Remove the given page from its current page queue.
* Remove the page from its page queue, which must be locked.
* If the page lock is not held, there is no guarantee that the
* page will not be enqueued by another thread before this function
* returns. In this case, it is up to the caller to ensure that
* no other threads hold a reference to the page.
*
* The page and page queue must be locked.
* The page queue lock must be held. If the page is not already
* logically dequeued, the page lock must be held as well.
*/
void
vm_page_dequeue_locked(vm_page_t m)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
pq = vm_page_pagequeue(m);
KASSERT(m->queue != PQ_NONE,
("%s: page %p queue field is PQ_NONE", __func__, m));
vm_pagequeue_assert_locked(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
KASSERT((m->aflags & PGA_DEQUEUE) != 0 ||
mtx_owned(vm_page_lockptr(m)),
("%s: queued unlocked page %p", __func__, m));
if ((m->aflags & PGA_ENQUEUED) != 0) {
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
}
vm_page_dequeue_complete(m);
}
/*
* vm_page_enqueue:
* vm_page_dequeue:
*
* Add the given page to the specified page queue.
*
* The page must be locked.
* Remove the page from whichever page queue it's in, if any.
* If the page lock is not held, there is no guarantee that the
* page will not be enqueued by another thread before this function
* returns. In this case, it is up to the caller to ensure that
* no other threads hold a reference to the page.
*/
void
vm_page_dequeue(vm_page_t m)
{
struct mtx *lock, *lock1;
lock = vm_page_pagequeue_lockptr(m);
for (;;) {
if (lock == NULL)
return;
mtx_lock(lock);
if ((lock1 = vm_page_pagequeue_lockptr(m)) == lock)
break;
mtx_unlock(lock);
lock = lock1;
}
KASSERT(lock == vm_page_pagequeue_lockptr(m),
("%s: page %p migrated directly between queues", __func__, m));
vm_page_dequeue_locked(m);
mtx_unlock(lock);
}
/*
* Schedule the given page for insertion into the specified page queue.
* Physical insertion of the page may be deferred indefinitely.
*/
static void
vm_page_enqueue(uint8_t queue, vm_page_t m)
vm_page_enqueue(vm_page_t m, uint8_t queue)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
KASSERT(queue < PQ_COUNT,
("vm_page_enqueue: invalid queue %u request for page %p",
queue, m));
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
vm_page_assert_locked(m);
KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0,
("%s: page %p is already enqueued", __func__, m));
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
if ((m->aflags & PGA_REQUEUE) == 0)
vm_page_aflag_set(m, PGA_REQUEUE);
vm_pqbatch_submit_page(m, queue);
}
/*
* vm_page_requeue:
* vm_page_requeue: [ internal use only ]
*
* Move the given page to the tail of its current page queue.
* Schedule a requeue of the given page.
*
* The page must be locked.
*/
void
vm_page_requeue(vm_page_t m)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
vm_page_assert_locked(m);
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue: page %p is not queued", m));
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_unlock(pq);
}
("%s: page %p is not logically enqueued", __func__, m));
/*
* vm_page_requeue_locked:
*
* Move the given page to the tail of its current page queue.
*
* The page queue must be locked.
*/
void
vm_page_requeue_locked(vm_page_t m)
{
struct vm_pagequeue *pq;
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue_locked: page %p is not queued", m));
pq = vm_page_pagequeue(m);
vm_pagequeue_assert_locked(pq);
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
if ((m->aflags & PGA_REQUEUE) == 0)
vm_page_aflag_set(m, PGA_REQUEUE);
vm_pqbatch_submit_page(m, m->queue);
}
/*
@ -3185,18 +3392,18 @@ vm_page_activate(vm_page_t m)
int queue;
vm_page_lock_assert(m, MA_OWNED);
if ((queue = m->queue) != PQ_ACTIVE) {
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
if (m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
if (queue != PQ_NONE)
vm_page_dequeue(m);
vm_page_enqueue(PQ_ACTIVE, m);
}
} else {
if (m->act_count < ACT_INIT)
if ((queue = m->queue) == PQ_ACTIVE || m->wire_count > 0 ||
(m->oflags & VPO_UNMANAGED) != 0) {
if (queue == PQ_ACTIVE && m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
return;
}
vm_page_remque(m);
if (m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
vm_page_enqueue(m, PQ_ACTIVE);
}
/*
@ -3207,11 +3414,10 @@ vm_page_activate(vm_page_t m)
* the page to the free list only if this function returns true.
*
* The object must be locked. The page must be locked if it is
* managed. For a queued managed page, the pagequeue_locked
* argument specifies whether the page queue is already locked.
* managed.
*/
bool
vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
vm_page_free_prep(vm_page_t m)
{
#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
@ -3227,14 +3433,14 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
if ((m->oflags & VPO_UNMANAGED) == 0) {
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
("vm_page_free_toq: freeing mapped page %p", m));
("vm_page_free_prep: freeing mapped page %p", m));
} else
KASSERT(m->queue == PQ_NONE,
("vm_page_free_toq: unmanaged page %p is queued", m));
("vm_page_free_prep: unmanaged page %p is queued", m));
VM_CNT_INC(v_tfree);
if (vm_page_sbusied(m))
panic("vm_page_free: freeing busy page %p", m);
panic("vm_page_free_prep: freeing busy page %p", m);
vm_page_remove(m);
@ -3250,21 +3456,23 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
return (false);
}
if (m->queue != PQ_NONE) {
if (pagequeue_locked)
vm_page_dequeue_locked(m);
else
vm_page_dequeue(m);
}
/*
* Pages need not be dequeued before they are returned to the physical
* memory allocator, but they must at least be marked for a deferred
* dequeue.
*/
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_dequeue_deferred(m);
m->valid = 0;
vm_page_undirty(m);
if (m->wire_count != 0)
panic("vm_page_free: freeing wired page %p", m);
panic("vm_page_free_prep: freeing wired page %p", m);
if (m->hold_count != 0) {
m->flags &= ~PG_ZERO;
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m));
m->flags |= PG_UNHOLDFREE;
return (false);
}
@ -3283,36 +3491,6 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
return (true);
}
void
vm_page_free_phys_pglist(struct pglist *tq)
{
struct vm_domain *vmd;
vm_page_t m;
int cnt;
if (TAILQ_EMPTY(tq))
return;
vmd = NULL;
cnt = 0;
TAILQ_FOREACH(m, tq, listq) {
if (vmd != vm_pagequeue_domain(m)) {
if (vmd != NULL) {
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
cnt = 0;
}
vmd = vm_pagequeue_domain(m);
vm_domain_free_lock(vmd);
}
vm_phys_free_pages(m, 0);
cnt++;
}
if (vmd != NULL) {
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
}
}
/*
* vm_page_free_toq:
*
@ -3327,7 +3505,7 @@ vm_page_free_toq(vm_page_t m)
{
struct vm_domain *vmd;
if (!vm_page_free_prep(m, false))
if (!vm_page_free_prep(m))
return;
vmd = vm_pagequeue_domain(m);
@ -3425,22 +3603,25 @@ vm_page_unwire(vm_page_t m, uint8_t queue)
KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
("vm_page_unwire: invalid queue %u request for page %p",
queue, m));
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_assert_locked(m);
unwired = vm_page_unwire_noq(m);
if (unwired && (m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL) {
if (m->queue == queue) {
if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL)
return (unwired);
if (m->queue == queue) {
if (queue == PQ_ACTIVE)
vm_page_reference(m);
else if (queue != PQ_NONE)
vm_page_requeue(m);
} else {
vm_page_dequeue(m);
if (queue != PQ_NONE) {
vm_page_enqueue(m, queue);
if (queue == PQ_ACTIVE)
vm_page_reference(m);
else if (queue != PQ_NONE)
vm_page_requeue(m);
} else {
vm_page_remque(m);
if (queue != PQ_NONE) {
vm_page_enqueue(queue, m);
if (queue == PQ_ACTIVE)
/* Initialize act_count. */
vm_page_activate(m);
}
/* Initialize act_count. */
vm_page_activate(m);
}
}
return (unwired);
@ -3476,52 +3657,8 @@ vm_page_unwire_noq(vm_page_t m)
}
/*
* Move the specified page to the inactive queue, or requeue the page if it is
* already in the inactive queue.
*
* Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
* queue. However, setting "noreuse" to TRUE will accelerate the specified
* page's reclamation, but it will not unmap the page from any address space.
* This is implemented by inserting the page near the head of the inactive
* queue, using a marker page to guide FIFO insertion ordering.
*
* The page must be locked.
*/
static inline void
_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
{
struct vm_pagequeue *pq;
int queue;
vm_page_assert_locked(m);
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE];
/* Avoid multiple acquisitions of the inactive queue lock. */
queue = m->queue;
if (queue == PQ_INACTIVE) {
vm_pagequeue_lock(pq);
vm_page_dequeue_locked(m);
} else {
if (queue != PQ_NONE)
vm_page_dequeue(m);
vm_pagequeue_lock(pq);
}
m->queue = PQ_INACTIVE;
if (noreuse)
TAILQ_INSERT_BEFORE(
&vm_pagequeue_domain(m)->vmd_inacthead, m,
plinks.q);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
}
}
/*
* Move the specified page to the inactive queue, or requeue the page if it is
* already in the inactive queue.
* Move the specified page to the tail of the inactive queue, or requeue
* the page if it is already in the inactive queue.
*
* The page must be locked.
*/
@ -3529,12 +3666,23 @@ void
vm_page_deactivate(vm_page_t m)
{
_vm_page_deactivate(m, FALSE);
vm_page_assert_locked(m);
if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
return;
if (!vm_page_inactive(m)) {
vm_page_remque(m);
vm_page_enqueue(m, PQ_INACTIVE);
} else
vm_page_requeue(m);
}
/*
* Move the specified page to the inactive queue with the expectation
* that it is unlikely to be reused.
* Move the specified page close to the head of the inactive queue,
* bypassing LRU. A marker page is used to maintain FIFO ordering.
* As with regular enqueues, we use a per-CPU batch queue to reduce
* contention on the page queue lock.
*
* The page must be locked.
*/
@ -3542,7 +3690,17 @@ void
vm_page_deactivate_noreuse(vm_page_t m)
{
_vm_page_deactivate(m, TRUE);
vm_page_assert_locked(m);
if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
return;
if (!vm_page_inactive(m))
vm_page_remque(m);
m->queue = PQ_INACTIVE;
if ((m->aflags & PGA_REQUEUE_HEAD) == 0)
vm_page_aflag_set(m, PGA_REQUEUE_HEAD);
vm_pqbatch_submit_page(m, PQ_INACTIVE);
}
/*
@ -3555,13 +3713,14 @@ vm_page_launder(vm_page_t m)
{
vm_page_assert_locked(m);
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
if (m->queue == PQ_LAUNDRY)
vm_page_requeue(m);
else {
vm_page_remque(m);
vm_page_enqueue(PQ_LAUNDRY, m);
}
if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
return;
if (m->queue == PQ_LAUNDRY)
vm_page_requeue(m);
else {
vm_page_remque(m);
vm_page_enqueue(m, PQ_LAUNDRY);
}
}
@ -3577,9 +3736,9 @@ vm_page_unswappable(vm_page_t m)
vm_page_assert_locked(m);
KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0,
("page %p already unswappable", m));
if (m->queue != PQ_NONE)
vm_page_dequeue(m);
vm_page_enqueue(PQ_UNSWAPPABLE, m);
vm_page_remque(m);
vm_page_enqueue(m, PQ_UNSWAPPABLE);
}
/*


@ -93,8 +93,11 @@
*
* In general, operations on this structure's mutable fields are
* synchronized using either one of or a combination of the lock on the
* object that the page belongs to (O), the pool lock for the page (P),
* or the lock for either the free or paging queue (Q). If a field is
* object that the page belongs to (O), the page lock (P),
* the per-domain lock for the free queues (F), or the page's queue
* lock (Q). The physical address of a page is used to select its page
* lock from a pool. The queue lock for a page depends on the value of
its queue field and is described in detail below. If a field is
* annotated below with two of these locks, then holding either lock is
* sufficient for read access, but both locks are required for write
* access. An annotation of (C) indicates that the field is immutable.
@ -143,6 +146,29 @@
* causing the thread to block. vm_page_sleep_if_busy() can be used to
* sleep until the page's busy state changes, after which the caller
* must re-lookup the page and re-evaluate its state.
*
* The queue field is the index of the page queue containing the
* page, or PQ_NONE if the page is not enqueued. The queue lock of a
* page is the page queue lock corresponding to the page queue index,
* or the page lock (P) for the page if it is not enqueued. To modify
* the queue field, the queue lock for the old value of the field must
* be held. It is invalid for a page's queue field to transition
* between two distinct page queue indices. That is, when updating
* the queue field, either the new value or the old value must be
* PQ_NONE.
*
* To avoid contention on page queue locks, page queue operations
* (enqueue, dequeue, requeue) are batched using per-CPU queues.
* A deferred operation is requested by inserting an entry into a
* batch queue; the entry is simply a pointer to the page, and the
* request type is encoded in the page's aflags field using the values
in PGA_QUEUE_STATE_MASK. The type-stability of struct vm_page is
* crucial to this scheme since the processing of entries in a given
* batch queue may be deferred indefinitely. In particular, a page
* may be freed before its pending batch queue entries have been
* processed. The page lock (P) must be held to schedule a batched
* queue operation, and the page queue lock must be held in order to
* process batch queue entries for the page queue.
*/
#if PAGE_SIZE == 4096
@ -174,7 +200,7 @@ struct vm_page {
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
vm_object_t object; /* which object am I in (O,P) */
vm_pindex_t pindex; /* offset into object (O,P) */
vm_paddr_t phys_addr; /* physical address of page */
vm_paddr_t phys_addr; /* physical address of page (C) */
struct md_page md; /* machine dependent stuff */
u_int wire_count; /* wired down maps refs (P) */
volatile u_int busy_lock; /* busy owners lock */
@ -182,11 +208,11 @@ struct vm_page {
uint16_t flags; /* page PG_* flags (P) */
uint8_t aflags; /* access is atomic */
uint8_t oflags; /* page VPO_* flags (O) */
uint8_t queue; /* page queue index (P,Q) */
uint8_t queue; /* page queue index (Q) */
int8_t psind; /* pagesizes[] index (O) */
int8_t segind; /* vm_phys segment index (C) */
uint8_t order; /* index of the buddy queue */
uint8_t pool; /* vm_phys freepool index (Q) */
uint8_t order; /* index of the buddy queue (F) */
uint8_t pool; /* vm_phys freepool index (F) */
u_char act_count; /* page usage count (P) */
/* NOTE that these must support one bit per DEV_BSIZE in a page */
/* so, on normal X86 kernels, they must be at least 8 bits wide */
@ -314,10 +340,38 @@ extern struct mtx_padalign pa_lock[];
*
* PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
* at least one executable mapping. It is not consumed by the MI VM layer.
*
* PGA_ENQUEUED is set and cleared when a page is inserted into or removed
* from a page queue, respectively. It determines whether the plinks.q field
* of the page is valid. To set or clear this flag, the queue lock for the
* page must be held: the page queue lock corresponding to the page's "queue"
* field if its value is not PQ_NONE, and the page lock otherwise.
*
* PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page
* queue, and cleared when the dequeue request is processed. A page may
* have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue
* is requested after the page is scheduled to be enqueued but before it is
* actually inserted into the page queue. The page lock must be held to set
* this flag, and the queue lock for the page must be held to clear it.
*
* PGA_REQUEUE is set when the page is scheduled to be enqueued or requeued
* in its page queue. The page lock must be held to set this flag, and the
* queue lock for the page must be held to clear it.
*
* PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of
* the inactive queue, thus bypassing LRU. The page lock must be held to
* set this flag, and the queue lock for the page must be held to clear it.
*/
#define PGA_WRITEABLE 0x01 /* page may be mapped writeable */
#define PGA_REFERENCED 0x02 /* page has been referenced */
#define PGA_EXECUTABLE 0x04 /* page may be mapped executable */
#define PGA_ENQUEUED 0x08 /* page is enqueued in a page queue */
#define PGA_DEQUEUE 0x10 /* page is due to be dequeued */
#define PGA_REQUEUE 0x20 /* page is due to be requeued */
#define PGA_REQUEUE_HEAD 0x40 /* page requeue should bypass LRU */
#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE | \
PGA_REQUEUE_HEAD)
/*
* Page flags. If changed at any other time than page allocation or
@ -484,13 +538,13 @@ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
void vm_page_deactivate(vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
void vm_page_dequeue_deferred(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
void vm_page_drain_pqbatch(void);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
void vm_page_free_phys_pglist(struct pglist *tq);
bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
bool vm_page_free_prep(vm_page_t m);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_init_marker(vm_page_t m, int queue);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
void vm_page_launder(vm_page_t m);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
@ -752,6 +806,24 @@ vm_page_in_laundry(vm_page_t m)
return (m->queue == PQ_LAUNDRY || m->queue == PQ_UNSWAPPABLE);
}
/*
* vm_page_enqueued:
*
* Return true if the page is logically enqueued and no deferred
* dequeue is pending.
*/
static inline bool
vm_page_enqueued(vm_page_t m)
{
vm_page_assert_locked(m);
if ((m->aflags & PGA_DEQUEUE) != 0)
return (false);
atomic_thread_fence_acq();
return (m->queue != PQ_NONE);
}
/*
* vm_page_held:
*


@ -201,103 +201,134 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
static u_int isqrt(u_int num);
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
/*
* vm_pageout_fallback_object_lock:
*
* Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
* known to have failed and page queue must be either PQ_ACTIVE or
* PQ_INACTIVE. To avoid lock order violation, unlock the page queue
* while locking the vm object. Use marker page to detect page queue
* changes and maintain notion of next page on page queue. Return
* TRUE if no changes were detected, FALSE otherwise. vm object is
* locked on return.
*
* This function depends on both the lock portion of struct vm_object
* and normal struct vm_page being type stable.
*/
static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
struct vm_page marker;
struct scan_state {
struct vm_batchqueue bq;
struct vm_pagequeue *pq;
boolean_t unchanged;
vm_object_t object;
int queue;
vm_page_t marker;
int maxscan;
int scanned;
};
queue = m->queue;
vm_page_init_marker(&marker, queue);
pq = vm_page_pagequeue(m);
object = m->object;
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
static void
vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
vm_page_t marker, vm_page_t after, int maxscan)
{
vm_pagequeue_assert_locked(pq);
KASSERT((marker->aflags & PGA_ENQUEUED) == 0,
("marker %p already enqueued", marker));
if (after == NULL)
TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
else
TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
vm_page_aflag_set(marker, PGA_ENQUEUED);
vm_batchqueue_init(&ss->bq);
ss->pq = pq;
ss->marker = marker;
ss->maxscan = maxscan;
ss->scanned = 0;
vm_pagequeue_unlock(pq);
vm_page_unlock(m);
VM_OBJECT_WLOCK(object);
vm_page_lock(m);
vm_pagequeue_lock(pq);
}
/*
* The page's object might have changed, and/or the page might
* have moved from its original position in the queue. If the
* page's object has changed, then the caller should abandon
* processing the page because the wrong object lock was
* acquired. Use the marker's plinks.q, not the page's, to
* determine if the page has been moved. The state of the
* page's plinks.q can be indeterminate; whereas, the marker's
* plinks.q must be valid.
*/
*next = TAILQ_NEXT(&marker, plinks.q);
unchanged = m->object == object &&
m == TAILQ_PREV(&marker, pglist, plinks.q);
KASSERT(!unchanged || m->queue == queue,
("page %p queue %d %d", m, queue, m->queue));
TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
return (unchanged);
static void
vm_pageout_end_scan(struct scan_state *ss)
{
struct vm_pagequeue *pq;
pq = ss->pq;
vm_pagequeue_assert_locked(pq);
KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
("marker %p not enqueued", ss->marker));
TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
VM_CNT_ADD(v_pdpages, ss->scanned);
}
/*
* Lock the page while holding the page queue lock. Use marker page
* to detect page queue changes and maintain notion of next page on
* page queue. Return TRUE if no changes were detected, FALSE
* otherwise. The page is locked on return. The page queue lock might
* be dropped and reacquired.
*
* This function depends on normal struct vm_page being type stable.
* Ensure that the page has not been dequeued after a pageout batch was
* collected. See vm_page_dequeue_complete().
*/
static boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
static inline bool
vm_pageout_page_queued(vm_page_t m, int queue)
{
vm_page_assert_locked(m);
if ((m->aflags & PGA_DEQUEUE) != 0)
return (false);
atomic_thread_fence_acq();
return (m->queue == queue);
}
/*
* Add a small number of queued pages to a batch queue for later processing
* without the corresponding queue lock held. The caller must have enqueued a
* marker page at the desired start point for the scan. Pages will be
* physically dequeued if the caller so requests. Otherwise, the returned
* batch may contain marker pages, and it is up to the caller to handle them.
*
* When processing the batch queue, vm_pageout_page_queued() must be used to
* determine whether the page was logically dequeued by another thread. Once
* this check is performed, the page lock guarantees that the page will not be
* disassociated from the queue.
*/
static __always_inline void
vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
{
struct vm_page marker;
struct vm_pagequeue *pq;
boolean_t unchanged;
int queue;
vm_page_t m, marker;
vm_page_lock_assert(m, MA_NOTOWNED);
if (vm_page_trylock(m))
return (TRUE);
marker = ss->marker;
pq = ss->pq;
queue = m->queue;
vm_page_init_marker(&marker, queue);
pq = vm_page_pagequeue(m);
KASSERT((marker->aflags & PGA_ENQUEUED) != 0,
("marker %p not enqueued", ss->marker));
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
vm_pagequeue_unlock(pq);
vm_page_lock(m);
vm_pagequeue_lock(pq);
for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
m = TAILQ_NEXT(m, plinks.q), ss->scanned++) {
if ((m->flags & PG_MARKER) == 0) {
KASSERT((m->aflags & PGA_ENQUEUED) != 0,
("page %p not enqueued", m));
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in page queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("Unmanaged page %p cannot be in page queue", m));
} else if (dequeue)
continue;
/* Page queue might have changed. */
*next = TAILQ_NEXT(&marker, plinks.q);
unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
KASSERT(!unchanged || m->queue == queue,
("page %p queue %d %d", m, queue, m->queue));
TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
return (unchanged);
(void)vm_batchqueue_insert(&ss->bq, m);
if (dequeue) {
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_page_aflag_clear(m, PGA_ENQUEUED);
}
}
TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
if (__predict_true(m != NULL))
TAILQ_INSERT_BEFORE(m, marker, plinks.q);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
if (dequeue)
vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
vm_pagequeue_unlock(pq);
}
/* Return the next page to be scanned, or NULL if the scan is complete. */
static __always_inline vm_page_t
vm_pageout_next(struct scan_state *ss, const bool dequeue)
{
if (ss->bq.bq_cnt == 0)
vm_pageout_collect_batch(ss, dequeue);
return (vm_batchqueue_pop(&ss->bq));
}
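For reference, the scan helpers above compose into the following skeleton, condensed from vm_pageout_launder() and vm_pageout_scan() below; the per-page work and error handling are elided, and the function name is invented.

static void
scan_skeleton(struct vm_pagequeue *pq, vm_page_t marker)
{
	struct scan_state ss;
	vm_page_t m;

	vm_pagequeue_lock(pq);
	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
	while ((m = vm_pageout_next(&ss, false)) != NULL) {
		if (__predict_false((m->flags & PG_MARKER) != 0))
			continue;
		/*
		 * Lock the page, revalidate its queue state with
		 * vm_pageout_page_queued(), and only then act on it.
		 */
	}
	vm_pagequeue_lock(pq);
	vm_pageout_end_scan(&ss);
	vm_pagequeue_unlock(pq);
}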
/*
@ -353,12 +384,12 @@ vm_pageout_cluster(vm_page_t m)
break;
}
vm_page_test_dirty(p);
if (p->dirty == 0) {
if (p->dirty == 0 || !vm_page_in_laundry(p)) {
ib = 0;
break;
}
vm_page_lock(p);
if (!vm_page_in_laundry(p) || vm_page_held(p)) {
if (vm_page_held(p)) {
vm_page_unlock(p);
ib = 0;
break;
@ -381,10 +412,10 @@ vm_pageout_cluster(vm_page_t m)
if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
break;
vm_page_test_dirty(p);
if (p->dirty == 0)
if (p->dirty == 0 || !vm_page_in_laundry(p))
break;
vm_page_lock(p);
if (!vm_page_in_laundry(p) || vm_page_held(p)) {
if (vm_page_held(p)) {
vm_page_unlock(p);
break;
}
@ -675,13 +706,18 @@ vm_pageout_clean(vm_page_t m, int *numpagedout)
static int
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
{
struct scan_state ss;
struct vm_pagequeue *pq;
struct mtx *mtx;
vm_object_t object;
vm_page_t m, marker, next;
int act_delta, error, maxscan, numpagedout, queue, starting_target;
vm_page_t m, marker;
int act_delta, error, numpagedout, queue, starting_target;
int vnodes_skipped;
bool pageout_ok, queue_locked;
bool obj_locked, pageout_ok;
mtx = NULL;
obj_locked = false;
object = NULL;
starting_target = launder;
vnodes_skipped = 0;
@ -691,10 +727,6 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
* we've reached the end of the queue. A single iteration of this loop
* may cause more than one page to be laundered because of clustering.
*
* maxscan ensures that we don't re-examine requeued pages. Any
* additional pages written as part of a cluster are subtracted from
* maxscan since they must be taken from the laundry queue.
*
* As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
* swap devices are configured.
*/
@ -704,53 +736,68 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
queue = PQ_LAUNDRY;
scan:
pq = &vmd->vmd_pagequeues[queue];
marker = &vmd->vmd_markers[queue];
pq = &vmd->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
maxscan = pq->pq_cnt;
queue_locked = true;
for (m = TAILQ_FIRST(&pq->pq_pl);
m != NULL && maxscan-- > 0 && launder > 0;
m = next) {
vm_pagequeue_assert_locked(pq);
KASSERT(queue_locked, ("unlocked laundry queue"));
KASSERT(vm_page_in_laundry(m),
("page %p has an inconsistent queue", m));
next = TAILQ_NEXT(m, plinks.q);
if ((m->flags & PG_MARKER) != 0)
vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
if (__predict_false((m->flags & PG_MARKER) != 0))
continue;
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("PG_FICTITIOUS page %p cannot be in laundry queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("VPO_UNMANAGED page %p cannot be in laundry queue", m));
if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
vm_page_unlock(m);
vm_page_change_lock(m, &mtx);
recheck:
/*
* The page may have been disassociated from the queue
* while locks were dropped.
*/
if (!vm_pageout_page_queued(m, queue))
continue;
}
if (m->wire_count != 0) {
vm_page_dequeue_locked(m);
vm_page_unlock(m);
continue;
}
object = m->object;
if ((!VM_OBJECT_TRYWLOCK(object) &&
(!vm_pageout_fallback_object_lock(m, &next) ||
vm_page_held(m))) || vm_page_busied(m)) {
VM_OBJECT_WUNLOCK(object);
if (m->wire_count != 0 && vm_page_pagequeue(m) == pq)
vm_page_dequeue_locked(m);
vm_page_unlock(m);
/*
* A requeue was requested, so this page gets a second
* chance.
*/
if ((m->aflags & PGA_REQUEUE) != 0) {
vm_page_requeue(m);
continue;
}
/*
* Unlock the laundry queue, invalidating the 'next' pointer.
* Use a marker to remember our place in the laundry queue.
* Held pages are essentially stuck in the queue.
*
* Wired pages may not be freed. Complete their removal
* from the queue now to avoid needless revisits during
* future scans.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, marker, plinks.q);
vm_pagequeue_unlock(pq);
queue_locked = false;
if (m->hold_count != 0)
continue;
if (m->wire_count != 0) {
vm_page_dequeue_deferred(m);
continue;
}
if (object != m->object) {
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
object = m->object;
}
if (!obj_locked) {
if (!VM_OBJECT_TRYWLOCK(object)) {
mtx_unlock(mtx);
/* Depends on type-stability. */
VM_OBJECT_WLOCK(object);
obj_locked = true;
mtx_lock(mtx);
goto recheck;
} else
obj_locked = true;
}
if (vm_page_busied(m))
continue;
/*
* Invalid pages can be easily freed. They cannot be
@ -799,9 +846,11 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
*/
if (!in_shortfall)
launder--;
goto drop_page;
} else if ((object->flags & OBJ_DEAD) == 0)
goto requeue_page;
continue;
} else if ((object->flags & OBJ_DEAD) == 0) {
vm_page_requeue(m);
continue;
}
}
/*
@ -836,11 +885,8 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
else
pageout_ok = true;
if (!pageout_ok) {
requeue_page:
vm_pagequeue_lock(pq);
queue_locked = true;
vm_page_requeue_locked(m);
goto drop_page;
vm_page_requeue(m);
continue;
}
/*
@ -859,24 +905,25 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
error = vm_pageout_clean(m, &numpagedout);
if (error == 0) {
launder -= numpagedout;
maxscan -= numpagedout - 1;
ss.scanned += numpagedout;
} else if (error == EDEADLK) {
pageout_lock_miss++;
vnodes_skipped++;
}
goto relock_queue;
mtx = NULL;
obj_locked = false;
}
drop_page:
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
relock_queue:
if (!queue_locked) {
vm_pagequeue_lock(pq);
queue_locked = true;
}
next = TAILQ_NEXT(marker, plinks.q);
TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
}
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
vm_pagequeue_lock(pq);
vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
if (launder > 0 && queue == PQ_UNSWAPPABLE) {
@ -1077,6 +1124,56 @@ vm_pageout_laundry_worker(void *arg)
}
}
static int
vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m)
{
struct vm_domain *vmd;
if (!vm_page_inactive(m) || (m->aflags & PGA_ENQUEUED) != 0)
return (0);
vm_page_aflag_set(m, PGA_ENQUEUED);
if ((m->aflags & PGA_REQUEUE_HEAD) != 0) {
vmd = vm_pagequeue_domain(m);
TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
} else if ((m->aflags & PGA_REQUEUE) != 0) {
TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q);
vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
} else
TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q);
return (1);
}
/*
* Re-add stuck pages to the inactive queue. We will examine them again
* during the next scan. If the queue state of a page has changed since
* it was physically removed from the page queue in
* vm_pageout_collect_batch(), don't do anything with that page.
*/
static void
vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
vm_page_t m)
{
struct vm_pagequeue *pq;
int delta;
delta = 0;
pq = ss->pq;
if (m != NULL) {
if (vm_batchqueue_insert(bq, m))
return;
vm_pagequeue_lock(pq);
delta += vm_pageout_reinsert_inactive_page(ss, m);
} else
vm_pagequeue_lock(pq);
while ((m = vm_batchqueue_pop(bq)) != NULL)
delta += vm_pageout_reinsert_inactive_page(ss, m);
vm_pagequeue_cnt_add(pq, delta);
vm_pagequeue_unlock(pq);
vm_batchqueue_init(bq);
}
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*
@ -1089,13 +1186,16 @@ vm_pageout_laundry_worker(void *arg)
static bool
vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
{
vm_page_t m, marker, next;
struct scan_state ss;
struct vm_batchqueue rq;
struct mtx *mtx;
vm_page_t m, marker;
struct vm_pagequeue *pq;
vm_object_t object;
long min_scan;
int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
int page_shortage, scan_tick, scanned, starting_page_shortage;
boolean_t queue_locked;
int act_delta, addl_page_shortage, deficit, inactq_shortage, max_scan;
int page_shortage, scan_tick, starting_page_shortage;
bool obj_locked;
/*
* If we need to reclaim memory ask kernel caches to return
@ -1136,79 +1236,85 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
page_shortage = deficit = 0;
starting_page_shortage = page_shortage;
mtx = NULL;
obj_locked = false;
object = NULL;
vm_batchqueue_init(&rq);
/*
* Start scanning the inactive queue for pages that we can free. The
* scan will stop when we reach the target or we have scanned the
* entire queue. (Note that m->act_count is not used to make
* decisions for the inactive queue, only for the active queue.)
*/
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
marker = &vmd->vmd_markers[PQ_INACTIVE];
maxscan = pq->pq_cnt;
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
vm_pagequeue_lock(pq);
queue_locked = TRUE;
for (m = TAILQ_FIRST(&pq->pq_pl);
m != NULL && maxscan-- > 0 && page_shortage > 0;
m = next) {
vm_pagequeue_assert_locked(pq);
KASSERT(queue_locked, ("unlocked inactive queue"));
KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
KASSERT((m->flags & PG_MARKER) == 0,
("marker page %p was dequeued", m));
VM_CNT_INC(v_pdpages);
next = TAILQ_NEXT(m, plinks.q);
vm_page_change_lock(m, &mtx);
recheck:
/*
* skip marker pages
* The page may have been disassociated from the queue
* while locks were dropped.
*/
if (m->flags & PG_MARKER)
if (!vm_pageout_page_queued(m, PQ_INACTIVE)) {
addl_page_shortage++;
continue;
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in inactive queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("Unmanaged page %p cannot be in inactive queue", m));
}
/*
* The page or object lock acquisitions fail if the
* page was removed from the queue or moved to a
* different position within the queue. In either
* case, addl_page_shortage should not be incremented.
* The page was re-enqueued after the page queue lock was
* dropped, or a requeue was requested. This page gets a second
* chance.
*/
if (!vm_pageout_page_lock(m, &next))
goto unlock_page;
else if (m->wire_count != 0) {
/*
* Wired pages may not be freed, and unwiring a queued
* page will cause it to be requeued. Thus, remove them
* from the queue now to avoid unnecessary revisits.
*/
vm_page_dequeue_locked(m);
if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE |
PGA_REQUEUE_HEAD)) != 0)
goto reinsert;
/*
* Held pages are essentially stuck in the queue. So,
* they ought to be discounted from the inactive count.
* See the calculation of inactq_shortage before the
* loop over the active queue below.
*
* Wired pages may not be freed. Complete their removal
* from the queue now to avoid needless revisits during
* future scans.
*/
if (m->hold_count != 0) {
addl_page_shortage++;
goto unlock_page;
} else if (m->hold_count != 0) {
/*
* Held pages are essentially stuck in the
* queue. So, they ought to be discounted
* from the inactive count. See the
* calculation of inactq_shortage before the
* loop over the active queue below.
*/
addl_page_shortage++;
goto unlock_page;
goto reinsert;
}
object = m->object;
if (!VM_OBJECT_TRYWLOCK(object)) {
if (!vm_pageout_fallback_object_lock(m, &next))
goto unlock_object;
else if (m->wire_count != 0) {
vm_page_dequeue_locked(m);
addl_page_shortage++;
goto unlock_object;
} else if (m->hold_count != 0) {
addl_page_shortage++;
goto unlock_object;
if (m->wire_count != 0) {
addl_page_shortage++;
vm_page_dequeue_deferred(m);
continue;
}
if (object != m->object) {
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
object = m->object;
}
if (!obj_locked) {
if (!VM_OBJECT_TRYWLOCK(object)) {
mtx_unlock(mtx);
/* Depends on type-stability. */
VM_OBJECT_WLOCK(object);
obj_locked = true;
mtx_lock(mtx);
goto recheck;
} else
obj_locked = true;
}
if (vm_page_busied(m)) {
/*
* Don't mess with busy pages. Leave them at
@ -1219,26 +1325,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
* inactive count.
*/
addl_page_shortage++;
unlock_object:
VM_OBJECT_WUNLOCK(object);
unlock_page:
vm_page_unlock(m);
continue;
goto reinsert;
}
KASSERT(!vm_page_held(m), ("Held page %p", m));
/*
* Dequeue the inactive page and unlock the inactive page
* queue, invalidating the 'next' pointer. Dequeueing the
* page here avoids a later reacquisition (and release) of
* the inactive page queue lock when vm_page_activate(),
* vm_page_free(), or vm_page_launder() is called. Use a
* marker to remember our place in the inactive queue.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, marker, plinks.q);
vm_page_dequeue_locked(m);
vm_pagequeue_unlock(pq);
queue_locked = FALSE;
/*
* Invalid pages can be easily freed. They cannot be
@ -1276,14 +1364,10 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
* queue.
*/
m->act_count += act_delta + ACT_ADVANCE;
goto drop_page;
continue;
} else if ((object->flags & OBJ_DEAD) == 0) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
m->queue = PQ_INACTIVE;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
goto drop_page;
vm_page_aflag_set(m, PGA_REQUEUE);
goto reinsert;
}
}
@ -1309,23 +1393,39 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
*/
if (m->dirty == 0) {
free_page:
/*
* Because we dequeued the page and have already
* checked for concurrent dequeue and enqueue
* requests, we can safely disassociate the page
* from the inactive queue.
*/
KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
("page %p has queue state", m));
m->queue = PQ_NONE;
vm_page_free(m);
VM_CNT_INC(v_dfree);
--page_shortage;
page_shortage--;
} else if ((object->flags & OBJ_DEAD) == 0)
vm_page_launder(m);
drop_page:
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
if (!queue_locked) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
}
next = TAILQ_NEXT(marker, plinks.q);
TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
continue;
reinsert:
vm_pageout_reinsert_inactive(&ss, &rq, m);
}
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
vm_pageout_reinsert_inactive(&ss, &rq, NULL);
vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
vm_pagequeue_lock(pq);
vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
/*
* Wake up the laundry thread so that it can perform any needed
* laundering. If we didn't meet our target, we're in shortfall and
@ -1386,9 +1486,9 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
vm_paging_target(vmd) + deficit + addl_page_shortage;
inactq_shortage *= act_scan_laundry_weight;
marker = &vmd->vmd_markers[PQ_ACTIVE];
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
maxscan = pq->pq_cnt;
/*
* If we're just idle polling attempt to visit every
@ -1401,43 +1501,55 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
min_scan /= hz * vm_pageout_update_period;
} else
min_scan = 0;
if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
if (min_scan > 0 || (inactq_shortage > 0 && pq->pq_cnt > 0))
vmd->vmd_last_active_scan = scan_tick;
/*
* Scan the active queue for pages that can be deactivated. Update
* the per-page activity counter and use it to identify deactivation
* candidates. Held pages may be deactivated.
*
* To avoid requeuing each page that remains in the active queue, we
* implement the CLOCK algorithm. To maintain consistency in the
* generic page queue code, pages are inserted at the tail of the
* active queue. We thus use two hands, represented by marker pages:
* scans begin at the first hand, which precedes the second hand in
* the queue. When the two hands meet, they are moved back to the
* head and tail of the queue, respectively, and scanning resumes.
*/
for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
scanned++) {
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_scan: page %p isn't active", m));
next = TAILQ_NEXT(m, plinks.q);
if ((m->flags & PG_MARKER) != 0)
continue;
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in active queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("Unmanaged page %p cannot be in active queue", m));
if (!vm_pageout_page_lock(m, &next)) {
vm_page_unlock(m);
continue;
max_scan = inactq_shortage > 0 ? pq->pq_cnt : min_scan;
act_scan:
vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
while ((m = vm_pageout_next(&ss, false)) != NULL) {
if (__predict_false(m == &vmd->vmd_clock[1])) {
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
plinks.q);
TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
plinks.q);
max_scan -= ss.scanned;
vm_pageout_end_scan(&ss);
goto act_scan;
}
if (__predict_false((m->flags & PG_MARKER) != 0))
continue;
vm_page_change_lock(m, &mtx);
/*
* The count for page daemon pages is updated after checking
* the page for eligibility.
* The page may have been disassociated from the queue
* while locks were dropped.
*/
VM_CNT_INC(v_pdpages);
if (!vm_pageout_page_queued(m, PQ_ACTIVE))
continue;
/*
* Wired pages are dequeued lazily.
*/
if (m->wire_count != 0) {
vm_page_dequeue_locked(m);
vm_page_unlock(m);
vm_page_dequeue_deferred(m);
continue;
}
@ -1476,14 +1588,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
} else
m->act_count -= min(m->act_count, ACT_DECLINE);
/*
* Move this page to the tail of the active, inactive or laundry
* queue depending on usage.
*/
if (m->act_count == 0) {
/* Dequeue to avoid later lock recursion. */
vm_page_dequeue_locked(m);
/*
* When not short for inactive pages, let dirty pages go
* through the inactive queue before moving to the
@ -1515,11 +1620,18 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
inactq_shortage--;
}
}
} else
vm_page_requeue_locked(m);
vm_page_unlock(m);
}
}
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
if (pass > 0)
vm_swapout_run_idle();
return (page_shortage <= 0);


@ -73,8 +73,17 @@ struct vm_pagequeue {
const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
#include <sys/pidctrl.h>
#ifndef VM_BATCHQUEUE_SIZE
#define VM_BATCHQUEUE_SIZE 7
#endif
struct vm_batchqueue {
vm_page_t bq_pa[VM_BATCHQUEUE_SIZE];
int bq_cnt;
} __aligned(CACHE_LINE_SIZE);
#include <vm/uma.h>
#include <sys/pidctrl.h>
struct sysctl_oid;
/*
@ -82,12 +91,12 @@ struct sysctl_oid;
* and accounting.
*
* Lock Key:
* f vmd_free_mtx
* p vmd_pageout_mtx
* d vm_domainset_lock
* a atomic
* c const after boot
* q page queue lock
* f vmd_free_mtx
* p vmd_pageout_mtx
* d vm_domainset_lock
* a atomic
* c const after boot
* q page queue lock
*/
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
@ -107,8 +116,9 @@ struct vm_domain {
boolean_t vmd_oom;
int vmd_oom_seq;
int vmd_last_active_scan;
struct vm_page vmd_markers[PQ_COUNT]; /* markers for queue scans */
struct vm_page vmd_markers[PQ_COUNT]; /* (q) markers for queue scans */
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
struct vm_page vmd_clock[2]; /* markers for active queue scan */
int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */
int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? */
@ -144,6 +154,7 @@ extern struct vm_domain vm_dom[MAXMEMDOM];
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
#define vm_pagequeue_trylock(pq) mtx_trylock(&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#define vm_domain_free_assert_locked(n) \
@ -154,6 +165,8 @@ extern struct vm_domain vm_dom[MAXMEMDOM];
mtx_lock(vm_domain_free_lockptr((d)))
#define vm_domain_free_lockptr(d) \
(&(d)->vmd_free_mtx)
#define vm_domain_free_trylock(d) \
mtx_trylock(vm_domain_free_lockptr((d)))
#define vm_domain_free_unlock(d) \
mtx_unlock(vm_domain_free_lockptr((d)))
@ -172,14 +185,39 @@ static __inline void
vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
{
#ifdef notyet
vm_pagequeue_assert_locked(pq);
#endif
pq->pq_cnt += addend;
}
#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
static inline void
vm_batchqueue_init(struct vm_batchqueue *bq)
{
bq->bq_cnt = 0;
}
static inline bool
vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
{
if (bq->bq_cnt < nitems(bq->bq_pa)) {
bq->bq_pa[bq->bq_cnt++] = m;
return (true);
}
return (false);
}
static inline vm_page_t
vm_batchqueue_pop(struct vm_batchqueue *bq)
{
if (bq->bq_cnt == 0)
return (NULL);
return (bq->bq_pa[--bq->bq_cnt]);
}
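A small illustrative drain helper (not part of the change) shows how these primitives compose; note that entries are popped in LIFO order.

static inline int
vm_batchqueue_drain(struct vm_batchqueue *bq)
{
	vm_page_t m;
	int n;

	n = 0;
	while ((m = vm_batchqueue_pop(bq)) != NULL) {
		(void)m;	/* a real caller would act on the page here */
		n++;
	}
	return (n);
}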
void vm_domain_set(struct vm_domain *vmd);
void vm_domain_clear(struct vm_domain *vmd);
int vm_domain_allocate(struct vm_domain *vmd, int req, int npages);


@ -354,9 +354,9 @@ vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
m->order = order;
if (tail)
TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
else
TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
fl[order].lcnt++;
}
@ -364,7 +364,7 @@ static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{
TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
TAILQ_REMOVE(&fl[order].pl, m, listq);
fl[order].lcnt--;
m->order = VM_NFREEORDER;
}
@ -1196,7 +1196,7 @@ vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
fl = (*seg->free_queues)[pind];
TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
* Is the size of this allocation request
* larger than the largest block size?


@ -399,8 +399,15 @@ vm_daemon(void)
swapout_flags = vm_pageout_req_swapout;
vm_pageout_req_swapout = 0;
mtx_unlock(&vm_daemon_mtx);
if (swapout_flags)
if (swapout_flags != 0) {
/*
* Drain the per-CPU page queue batches as a deadlock
* avoidance measure.
*/
if ((swapout_flags & VM_SWAP_NORMAL) != 0)
vm_page_drain_pqbatch();
swapout_procs(swapout_flags);
}
/*
* scan the processes for exceeding their rlimits or if