Split the pagequeues per NUMA domain, and split the pagedaemon process
into threads, each processing the queues of a single domain.  The
structure of the pagedaemons and queues is kept intact; most of the
changes come from the need for the code to find the owning page queue
for a given page, calculated from the segment containing the page.
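
A condensed sketch of the two helpers introduced in the diff below
(vm_phys_domain() and vm_page_pagequeue()), with the assertions and the
single-domain fallback omitted, shows how a page is mapped to its
owning queue:

    /* The segment containing the page records the owning domain. */
    static inline struct vm_domain *
    vm_phys_domain(vm_page_t m)
    {
        return (&vm_dom[vm_phys_segs[m->segind].domain]);
    }

    /* The per-domain queue is then selected by the page's queue index. */
    struct vm_pagequeue *
    vm_page_pagequeue(vm_page_t m)
    {
        return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
    }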

The tie between NUMA domains and the pagedaemon thread/pagequeue split
is rather arbitrary: the multithreaded daemon could also be allowed on
single-domain machines, or one NUMA domain might be split into several
page domains to further increase concurrency.

Right now, each pagedaemon thread tries to reach the global target,
which is precalculated at the start of the pass.  This is not optimal,
since it can cause excessive page deactivation and freeing.  The code
should be changed to re-check the global page deficit state in the loop
after some number of iterations.
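
For reference, each worker currently derives its shortage from the
global counters once, at the top of vm_pageout_scan() in the diff
below; a simplified excerpt:

    /*
     * Computed once per pass from the system-wide counters; every
     * domain thread then chases the same global goal using only its
     * own queues.
     */
    page_shortage = vm_paging_target() + addl_page_shortage;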

The pagedaemons now reach a quorum before starting the OOM killer,
since a single thread's inability to meet the target is normal for
split queues.  Only when all pagedaemons fail to produce enough
reusable pages is the OOM killer started, by a single selected thread.
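
A condensed sketch of the voting logic in the new
vm_pageout_mightbe_oom() from the diff below; every domain pagedaemon
that cannot recover pages casts one vote, and only the thread that
completes the quorum runs the OOM selection:

    vmd->vmd_oom = TRUE;
    old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
    if (old_vote != vm_ndomains - 1)
        return;        /* some other domain may still make progress */

    /* Last voter: select and signal the victim process. */
    vm_pageout_oom(VM_OOM_MEM);

    /* Recall the vote so the next pass can vote again if still short. */
    vmd->vmd_oom = FALSE;
    atomic_subtract_int(&vm_pageout_oom_vote, 1);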

The launder code is modified to take the segment layout into account
with regard to the region for which cleaning is performed.
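
Condensed from the reworked vm_pageout_grow_cache() in the diff below:
a domain is laundered for a given physical range only if one of its
segments overlaps that range, as reported by the new
vm_phys_domain_intersects() helper:

    if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, low, high) &&
        vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
        tries, low, high))
            inactl++;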

Based on the preliminary patch by jeff, sponsored by EMC / Isilon
Storage Division.

Reviewed by:	alc
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
Konstantin Belousov	2013-08-07 16:36:38 +00:00
commit 449c2e92c9 (parent 872d995f76)
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=254065
10 changed files with 420 additions and 176 deletions


@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/msgbuf.h>
#include <sys/watchdog.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/pmap.h>


@ -32,6 +32,7 @@
#include <sys/param.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>


@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/vmmeter.h>
#include <sys/_cpuset.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>


@ -64,8 +64,7 @@
* GENERAL RULES ON VM_PAGE MANIPULATION
*
* - A page queue lock is required when adding or removing a page from a
* page queue (vm_pagequeues[]), regardless of other locks or the
* busy state of a page.
* page queue regardless of other locks or the busy state of a page.
*
* * In general, no thread besides the page daemon can acquire or
* hold more than one page queue lock at a time.
@ -124,20 +123,7 @@ __FBSDID("$FreeBSD$");
* page structure.
*/
struct vm_pagequeue vm_pagequeues[PQ_COUNT] = {
[PQ_INACTIVE] = {
.pq_pl = TAILQ_HEAD_INITIALIZER(
vm_pagequeues[PQ_INACTIVE].pq_pl),
.pq_cnt = &cnt.v_inactive_count,
.pq_name = "vm inactive pagequeue"
},
[PQ_ACTIVE] = {
.pq_pl = TAILQ_HEAD_INITIALIZER(
vm_pagequeues[PQ_ACTIVE].pq_pl),
.pq_cnt = &cnt.v_active_count,
.pq_name = "vm active pagequeue"
}
};
struct vm_domain vm_dom[MAXMEMDOM];
struct mtx_padalign vm_page_queue_free_mtx;
struct mtx_padalign pa_lock[PA_LOCK_COUNT];
@ -256,6 +242,34 @@ vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
return (0);
}
static void
vm_page_domain_init(struct vm_domain *vmd)
{
struct vm_pagequeue *pq;
int i;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
"vm inactive pagequeue";
*__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
&cnt.v_inactive_count;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
"vm active pagequeue";
*__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
&cnt.v_active_count;
vmd->vmd_fullintervalcount = 0;
vmd->vmd_page_count = 0;
vmd->vmd_free_count = 0;
vmd->vmd_segs = 0;
vmd->vmd_oom = FALSE;
vmd->vmd_pass = 0;
for (i = 0; i < PQ_COUNT; i++) {
pq = &vmd->vmd_pagequeues[i];
TAILQ_INIT(&pq->pq_pl);
mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
MTX_DEF | MTX_DUPOK);
}
}
/*
* vm_page_startup:
*
@ -319,8 +333,8 @@ vm_page_startup(vm_offset_t vaddr)
mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
for (i = 0; i < PA_LOCK_COUNT; i++)
mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
for (i = 0; i < PQ_COUNT; i++)
vm_pagequeue_init_lock(&vm_pagequeues[i]);
for (i = 0; i < vm_ndomains; i++)
vm_page_domain_init(&vm_dom[i]);
/*
* Allocate memory for use when boot strapping the kernel memory
@ -1055,7 +1069,7 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
("vm_page_cache_free: page %p has inconsistent flags", m));
cnt.v_cache_count--;
cnt.v_free_count++;
vm_phys_freecnt_adj(m, 1);
}
empty = vm_radix_is_empty(&object->cache);
mtx_unlock(&vm_page_queue_free_mtx);
@ -1311,7 +1325,7 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
("vm_page_alloc: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc: free page %p is valid", m));
cnt.v_free_count--;
vm_phys_freecnt_adj(m, -1);
}
/*
@ -1569,7 +1583,7 @@ vm_page_alloc_init(vm_page_t m)
("vm_page_alloc_init: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc_init: free page %p is valid", m));
cnt.v_free_count--;
vm_phys_freecnt_adj(m, -1);
if ((m->flags & PG_ZERO) != 0)
vm_page_zero_count--;
}
@ -1711,6 +1725,13 @@ vm_waitpfault(void)
"pfault", 0);
}
struct vm_pagequeue *
vm_page_pagequeue(vm_page_t m)
{
return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
}
/*
* vm_page_dequeue:
*
@ -1726,11 +1747,11 @@ vm_page_dequeue(vm_page_t m)
vm_page_lock_assert(m, MA_OWNED);
KASSERT(m->queue != PQ_NONE,
("vm_page_dequeue: page %p is not queued", m));
pq = &vm_pagequeues[m->queue];
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
(*pq->pq_cnt)--;
vm_pagequeue_cnt_dec(pq);
vm_pagequeue_unlock(pq);
}
@ -1747,11 +1768,11 @@ vm_page_dequeue_locked(vm_page_t m)
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
pq = &vm_pagequeues[m->queue];
pq = vm_page_pagequeue(m);
vm_pagequeue_assert_locked(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
(*pq->pq_cnt)--;
vm_pagequeue_cnt_dec(pq);
}
/*
@ -1767,11 +1788,11 @@ vm_page_enqueue(int queue, vm_page_t m)
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
pq = &vm_pagequeues[queue];
pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
++*pq->pq_cnt;
vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
}
@ -1790,7 +1811,7 @@ vm_page_requeue(vm_page_t m)
vm_page_lock_assert(m, MA_OWNED);
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue: page %p is not queued", m));
pq = &vm_pagequeues[m->queue];
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
@ -1811,7 +1832,7 @@ vm_page_requeue_locked(vm_page_t m)
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue_locked: page %p is not queued", m));
pq = &vm_pagequeues[m->queue];
pq = vm_page_pagequeue(m);
vm_pagequeue_assert_locked(pq);
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
@ -1948,7 +1969,7 @@ vm_page_free_toq(vm_page_t m)
*/
mtx_lock(&vm_page_queue_free_mtx);
m->flags |= PG_FREE;
cnt.v_free_count++;
vm_phys_freecnt_adj(m, 1);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m))
#else
@ -2081,14 +2102,14 @@ _vm_page_deactivate(vm_page_t m, int athead)
if (queue != PQ_NONE)
vm_page_dequeue(m);
m->flags &= ~PG_WINATCFLS;
pq = &vm_pagequeues[PQ_INACTIVE];
pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
vm_pagequeue_lock(pq);
m->queue = PQ_INACTIVE;
if (athead)
TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
cnt.v_inactive_count++;
vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
}
}
@ -2888,18 +2909,20 @@ DB_SHOW_COMMAND(page, vm_page_print_page_info)
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
db_printf("PQ_FREE:");
db_printf(" %d", cnt.v_free_count);
db_printf("\n");
db_printf("PQ_CACHE:");
db_printf(" %d", cnt.v_cache_count);
db_printf("\n");
int dom;
db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
*vm_pagequeues[PQ_ACTIVE].pq_cnt,
*vm_pagequeues[PQ_INACTIVE].pq_cnt);
db_printf("pq_free %d pq_cache %d\n",
cnt.v_free_count, cnt.v_cache_count);
for (dom = 0; dom < vm_ndomains; dom++) {
db_printf(
"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
dom,
vm_dom[dom].vmd_page_count,
vm_dom[dom].vmd_free_count,
vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
vm_dom[dom].vmd_pass);
}
}
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)


@ -181,18 +181,44 @@ TAILQ_HEAD(pglist, vm_page);
struct vm_pagequeue {
struct mtx pq_mutex;
struct pglist pq_pl;
int *const pq_cnt;
const char *const pq_name;
int pq_cnt;
int * const pq_vcnt;
const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
extern struct vm_pagequeue vm_pagequeues[PQ_COUNT];
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
int vmd_fullintervalcount;
u_int vmd_page_count;
u_int vmd_free_count;
long vmd_segs; /* bitmask of the segments */
boolean_t vmd_oom;
int vmd_pass; /* local pagedaemon pass */
struct vm_page vmd_marker; /* marker for pagedaemon private use */
};
extern struct vm_domain vm_dom[MAXMEMDOM];
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_init_lock(pq) mtx_init(&(pq)->pq_mutex, \
(pq)->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK);
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#ifdef _KERNEL
static __inline void
vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
{
#ifdef notyet
vm_pagequeue_assert_locked(pq);
#endif
pq->pq_cnt += addend;
atomic_add_int(pq->pq_vcnt, addend);
}
#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
#endif /* _KERNEL */
extern struct mtx_padalign vm_page_queue_free_mtx;
extern struct mtx_padalign pa_lock[];
@ -393,6 +419,7 @@ boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
vm_page_t vm_page_prev(vm_page_t m);
void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);


@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
@ -103,6 +104,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@ -114,7 +116,8 @@ __FBSDID("$FreeBSD$");
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_scan(int pass);
static void vm_pageout_scan(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
struct proc *pageproc;
@ -216,14 +219,15 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static boolean_t vm_pageout_launder(int, int, vm_paddr_t, vm_paddr_t);
static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
vm_paddr_t);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
static void vm_pageout_page_stats(void);
static void vm_pageout_page_stats(struct vm_domain *vmd);
/*
* Initialize a dummy page for marking the caller's place in the specified
@ -267,7 +271,7 @@ vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
queue = m->queue;
vm_pageout_init_marker(&marker, queue);
pq = &vm_pagequeues[queue];
pq = vm_page_pagequeue(m);
object = m->object;
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
@ -309,7 +313,7 @@ vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
queue = m->queue;
vm_pageout_init_marker(&marker, queue);
pq = &vm_pagequeues[queue];
pq = vm_page_pagequeue(m);
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
vm_pagequeue_unlock(pq);
@ -567,21 +571,17 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
}
static boolean_t
vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
vm_paddr_t high)
{
struct mount *mp;
struct vm_pagequeue *pq;
struct vnode *vp;
vm_object_t object;
vm_paddr_t pa;
vm_page_t m, m_tmp, next;
pq = &vm_pagequeues[queue];
vm_pagequeue_lock(pq);
TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) {
KASSERT(m->queue == queue,
("vm_pageout_launder: page %p's queue is not %d", m,
queue));
if ((m->flags & PG_MARKER) != 0)
continue;
pa = VM_PAGE_TO_PHYS(m);
@ -661,7 +661,8 @@ vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
void
vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
{
int actl, actmax, inactl, inactmax;
int actl, actmax, inactl, inactmax, dom, initial_dom;
static int start_dom = 0;
if (tries > 0) {
/*
@ -677,19 +678,55 @@ vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
*/
uma_reclaim();
}
/*
* Make the next scan start on the next domain.
*/
initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
inactl = 0;
inactmax = cnt.v_inactive_count;
actl = 0;
actmax = tries < 2 ? 0 : cnt.v_active_count;
dom = initial_dom;
/*
* Scan domains in round-robin order, first inactive queues,
* then active. Since domain usually owns large physically
* contiguous chunk of memory, it makes sense to completely
* exhaust one domain before switching to next, while growing
* the pool of contiguous physical pages.
*
* Do not even start launder a domain which cannot contain
* the specified address range, as indicated by segments
* constituting the domain.
*/
again:
if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low,
high)) {
inactl++;
goto again;
if (inactl < inactmax) {
if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
low, high) &&
vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
tries, low, high)) {
inactl++;
goto again;
}
if (++dom == vm_ndomains)
dom = 0;
if (dom != initial_dom)
goto again;
}
if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) {
actl++;
goto again;
if (actl < actmax) {
if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
low, high) &&
vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
tries, low, high)) {
actl++;
goto again;
}
if (++dom == vm_ndomains)
dom = 0;
if (dom != initial_dom)
goto again;
}
}
@ -861,10 +898,9 @@ vm_pageout_map_deactivate_pages(map, desired)
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
static void
vm_pageout_scan(int pass)
vm_pageout_scan(struct vm_domain *vmd, int pass)
{
vm_page_t m, next;
struct vm_page marker;
struct vm_pagequeue *pq;
int page_shortage, maxscan, pcount;
int addl_page_shortage;
@ -874,8 +910,6 @@ vm_pageout_scan(int pass)
int maxlaunder;
boolean_t queues_locked;
vm_pageout_init_marker(&marker, PQ_INACTIVE);
/*
* Decrease registered cache sizes.
*/
@ -888,7 +922,7 @@ vm_pageout_scan(int pass)
/*
* The addl_page_shortage is the number of temporarily
* stuck pages in the inactive queue. In other words, the
* number of pages from cnt.v_inactive_count that should be
* number of pages from the inactive count that should be
* discounted in setting the target for the active queue scan.
*/
addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit);
@ -914,8 +948,6 @@ vm_pageout_scan(int pass)
if (pass)
maxlaunder = 10000;
maxscan = cnt.v_inactive_count;
/*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
@ -923,7 +955,8 @@ vm_pageout_scan(int pass)
* is not used to form decisions for the inactive queue, only for the
* active queue.
*/
pq = &vm_pagequeues[PQ_INACTIVE];
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
maxscan = pq->pq_cnt;
vm_pagequeue_lock(pq);
queues_locked = TRUE;
for (m = TAILQ_FIRST(&pq->pq_pl);
@ -984,7 +1017,7 @@ vm_pageout_scan(int pass)
* 'next' pointer. Use our marker to remember our
* place.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, pageq);
vm_pagequeue_unlock(pq);
queues_locked = FALSE;
@ -1034,7 +1067,7 @@ vm_pageout_scan(int pass)
/*
* Held pages are essentially stuck in the
* queue. So, they ought to be discounted
* from cnt.v_inactive_count. See the
* from the inactive count. See the
* calculation of the page_shortage for the
* loop over the active queue below.
*/
@ -1178,7 +1211,7 @@ vm_pageout_scan(int pass)
*/
if (m->queue != PQ_INACTIVE ||
m->object != object ||
TAILQ_NEXT(m, pageq) != &marker) {
TAILQ_NEXT(m, pageq) != &vmd->vmd_marker) {
vm_page_unlock(m);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
@ -1248,8 +1281,8 @@ vm_pageout_scan(int pass)
vm_pagequeue_lock(pq);
queues_locked = TRUE;
}
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&pq->pq_pl, &marker, pageq);
next = TAILQ_NEXT(&vmd->vmd_marker, pageq);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, pageq);
}
vm_pagequeue_unlock(pq);
@ -1258,7 +1291,7 @@ vm_pageout_scan(int pass)
* active queue to the inactive queue.
*/
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
/*
@ -1266,8 +1299,8 @@ vm_pageout_scan(int pass)
* track the per-page activity counter and use it to locate
* deactivation candidates.
*/
pcount = cnt.v_active_count;
pq = &vm_pagequeues[PQ_ACTIVE];
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
pcount = pq->pq_cnt;
vm_pagequeue_lock(pq);
m = TAILQ_FIRST(&pq->pq_pl);
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
@ -1393,12 +1426,54 @@ vm_pageout_scan(int pass)
* chance to flush out dirty vnode-backed pages and to allow
* active pages to be moved to the inactive queue and reclaimed.
*/
if (pass != 0 &&
((swap_pager_avail < 64 && vm_page_count_min()) ||
(swap_pager_full && vm_paging_target() > 0)))
vm_pageout_oom(VM_OOM_MEM);
vm_pageout_mightbe_oom(vmd, pass);
}
static int vm_pageout_oom_vote;
/*
* The pagedaemon threads randomly select one to perform the
* OOM. Trying to kill processes before all pagedaemons
* failed to reach free target is premature.
*/
static void
vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
{
int old_vote;
if (pass == 0 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
(swap_pager_full && vm_paging_target() > 0))) {
if (vmd->vmd_oom) {
vmd->vmd_oom = FALSE;
atomic_subtract_int(&vm_pageout_oom_vote, 1);
}
return;
}
if (vmd->vmd_oom)
return;
vmd->vmd_oom = TRUE;
old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
if (old_vote != vm_ndomains - 1)
return;
/*
* The current pagedaemon thread is the last in the quorum to
* start OOM. Initiate the selection and signaling of the
* victim.
*/
vm_pageout_oom(VM_OOM_MEM);
/*
* After one round of OOM terror, recall our vote. On the
* next pass, current pagedaemon would vote again if the low
* memory condition is still there, due to vmd_oom being
* false.
*/
vmd->vmd_oom = FALSE;
atomic_subtract_int(&vm_pageout_oom_vote, 1);
}
void
vm_pageout_oom(int shortage)
@ -1501,14 +1576,13 @@ vm_pageout_oom(int shortage)
* helps the situation where paging just starts to occur.
*/
static void
vm_pageout_page_stats(void)
vm_pageout_page_stats(struct vm_domain *vmd)
{
struct vm_pagequeue *pq;
vm_object_t object;
vm_page_t m, next;
int pcount, tpcount; /* Number of pages to check */
static int fullintervalcount = 0;
int page_shortage;
int actcount, page_shortage;
page_shortage =
(cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
@ -1517,25 +1591,30 @@ vm_pageout_page_stats(void)
if (page_shortage <= 0)
return;
pcount = cnt.v_active_count;
fullintervalcount += vm_pageout_stats_interval;
if (fullintervalcount < vm_pageout_full_stats_interval) {
vm_pageout_stats++;
tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
cnt.v_page_count;
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
/*
* pcount limits the depth of the queue scan. In particular,
* for the full scan, it prevents the iteration from looking
* into the requeued pages. The limit is not exact since the
* page queue lock is dropped during the iteration.
*/
pcount = pq->pq_cnt;
vmd->vmd_fullintervalcount += vm_pageout_stats_interval;
if (vmd->vmd_fullintervalcount < vm_pageout_full_stats_interval) {
atomic_add_int(&vm_pageout_stats, 1);
tpcount = (int64_t)vm_pageout_stats_max * pcount /
vmd->vmd_page_count;
if (pcount > tpcount)
pcount = tpcount;
} else {
vm_pageout_full_stats++;
fullintervalcount = 0;
atomic_add_int(&vm_pageout_full_stats, 1);
vmd->vmd_fullintervalcount = 0;
}
pq = &vm_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
m = TAILQ_FIRST(&pq->pq_pl);
while ((m != NULL) && (pcount-- > 0)) {
int actcount;
while (m != NULL && pcount-- > 0) {
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_page_stats: page %p isn't active", m));
@ -1560,11 +1639,11 @@ vm_pageout_page_stats(void)
}
/*
* Don't deactivate pages that are busy.
* Don't deactivate pages that are busy or held.
*/
if ((m->busy != 0) ||
(m->oflags & VPO_BUSY) ||
(m->hold_count != 0)) {
if (m->busy != 0 ||
(m->oflags & VPO_BUSY) != 0 ||
m->hold_count != 0) {
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
vm_page_requeue_locked(m);
@ -1579,7 +1658,7 @@ vm_pageout_page_stats(void)
}
actcount += pmap_ts_referenced(m);
if (actcount) {
if (actcount != 0) {
m->act_count += ACT_ADVANCE + actcount;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
@ -1611,13 +1690,105 @@ vm_pageout_page_stats(void)
vm_pagequeue_unlock(pq);
}
static void
vm_pageout_worker(void *arg)
{
struct vm_domain *domain;
struct pcpu *pc;
int cpu, error, domidx;
domidx = (uintptr_t)arg;
domain = &vm_dom[domidx];
/*
* XXXKIB The bind is rather arbitrary. With some minor
* complications, we could assign the cpuset consisting of all
* CPUs in the same domain. In fact, it even does not matter
* if the CPU we bind to is in the affinity domain of this
* page queue, we only need to establish the fair distribution
* of pagedaemon threads among CPUs.
*
* XXXKIB It would be useful to allocate vm_pages for the
* domain from the domain, and put pcpu area into the page
* owned by the domain.
*/
if (mem_affinity != NULL) {
CPU_FOREACH(cpu) {
pc = pcpu_find(cpu);
if (pc->pc_domain == domidx) {
thread_lock(curthread);
sched_bind(curthread, cpu);
thread_unlock(curthread);
break;
}
}
}
KASSERT(domain->vmd_segs != 0, ("domain without segments"));
vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
/*
* The pageout daemon worker is never done, so loop forever.
*/
while (TRUE) {
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
mtx_lock(&vm_page_queue_free_mtx);
if (vm_pages_needed && !vm_page_count_min()) {
if (!vm_paging_needed())
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
if (vm_pages_needed) {
/*
* Still not done, take a second pass without waiting
* (unlimited dirty cleaning), otherwise sleep a bit
* and try again.
*/
++(domain->vmd_pass);
if (domain->vmd_pass > 1)
msleep(&vm_pages_needed,
&vm_page_queue_free_mtx, PVM, "psleep",
hz / 2);
} else {
/*
* Good enough, sleep & handle stats. Prime the pass
* for the next run.
*/
if (domain->vmd_pass > 1)
domain->vmd_pass = 1;
else
domain->vmd_pass = 0;
error = msleep(&vm_pages_needed,
&vm_page_queue_free_mtx, PVM, "psleep",
vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
mtx_unlock(&vm_page_queue_free_mtx);
domain->vmd_pass = 0;
vm_pageout_page_stats(domain);
continue;
}
}
if (vm_pages_needed)
cnt.v_pdwakeups++;
mtx_unlock(&vm_page_queue_free_mtx);
vm_pageout_scan(domain, domain->vmd_pass);
}
}
/*
* vm_pageout is the high level pageout daemon.
*/
static void
vm_pageout(void)
{
int error, pass;
#if MAXMEMDOM > 1
int error, i;
#endif
/*
* Initialize some paging parameters.
@ -1687,58 +1858,17 @@ vm_pageout(void)
vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
swap_pager_swap_init();
pass = 0;
/*
* The pageout daemon is never done, so loop forever.
*/
while (TRUE) {
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
mtx_lock(&vm_page_queue_free_mtx);
if (vm_pages_needed && !vm_page_count_min()) {
if (!vm_paging_needed())
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
#if MAXMEMDOM > 1
for (i = 1; i < vm_ndomains; i++) {
error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
curproc, NULL, 0, 0, "dom%d", i);
if (error != 0) {
panic("starting pageout for domain %d, error %d\n",
i, error);
}
if (vm_pages_needed) {
/*
* Still not done, take a second pass without waiting
* (unlimited dirty cleaning), otherwise sleep a bit
* and try again.
*/
++pass;
if (pass > 1)
msleep(&vm_pages_needed,
&vm_page_queue_free_mtx, PVM, "psleep",
hz / 2);
} else {
/*
* Good enough, sleep & handle stats. Prime the pass
* for the next run.
*/
if (pass > 1)
pass = 1;
else
pass = 0;
error = msleep(&vm_pages_needed,
&vm_page_queue_free_mtx, PVM, "psleep",
vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
mtx_unlock(&vm_page_queue_free_mtx);
pass = 0;
vm_pageout_page_stats();
continue;
}
}
if (vm_pages_needed)
cnt.v_pdwakeups++;
mtx_unlock(&vm_page_queue_free_mtx);
vm_pageout_scan(pass);
}
#endif
vm_pageout_worker((uintptr_t)0);
}
/*


@ -65,26 +65,15 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
struct vm_freelist {
struct pglist pl;
int lcnt;
};
struct vm_phys_seg {
vm_paddr_t start;
vm_paddr_t end;
vm_page_t first_page;
int domain;
struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
"Too many physsegs.");
struct mem_affinity *mem_affinity;
int vm_ndomains = 1;
static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
static int vm_phys_nsegs;
struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
int vm_phys_nsegs;
#define VM_PHYS_FICTITIOUS_NSEGS 8
static struct vm_phys_fictitious_seg {
@ -140,6 +129,22 @@ vm_rr_selectdomain(void)
#endif
}
boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
struct vm_phys_seg *s;
int idx;
while ((idx = ffsl(mask)) != 0) {
idx--; /* ffsl counts from 1 */
mask &= ~(1UL << idx);
s = &vm_phys_segs[idx];
if (low < s->end && high > s->start)
return (TRUE);
}
return (FALSE);
}
/*
* Outputs the state of the physical memory allocator, specifically,
* the amount of physical memory in each free list.
@ -378,12 +383,16 @@ void
vm_phys_add_page(vm_paddr_t pa)
{
vm_page_t m;
struct vm_domain *vmd;
cnt.v_page_count++;
m = vm_phys_paddr_to_vm_page(pa);
m->phys_addr = pa;
m->queue = PQ_NONE;
m->segind = vm_phys_paddr_to_segind(pa);
vmd = vm_phys_domain(m);
vmd->vmd_page_count++;
vmd->vmd_segs |= 1UL << m->segind;
m->flags = PG_FREE;
KASSERT(m->order == VM_NFREEORDER,
("vm_phys_add_page: page %p has unexpected order %d",
@ -391,7 +400,7 @@ vm_phys_add_page(vm_paddr_t pa)
m->pool = VM_FREEPOOL_DEFAULT;
pmap_page_init(m);
mtx_lock(&vm_page_queue_free_mtx);
cnt.v_free_count++;
vm_phys_freecnt_adj(m, 1);
vm_phys_free_pages(m, 0);
mtx_unlock(&vm_page_queue_free_mtx);
}
@ -813,12 +822,12 @@ vm_phys_zero_pages_idle(void)
for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
vm_phys_unfree_page(m_tmp);
cnt.v_free_count--;
vm_phys_freecnt_adj(m, -1);
mtx_unlock(&vm_page_queue_free_mtx);
pmap_zero_page_idle(m_tmp);
m_tmp->flags |= PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
cnt.v_free_count++;
vm_phys_freecnt_adj(m, 1);
vm_phys_free_pages(m_tmp, 0);
vm_page_zero_count++;
cnt_prezero++;


@ -47,8 +47,23 @@ struct mem_affinity {
int domain;
};
struct vm_freelist {
struct pglist pl;
int lcnt;
};
struct vm_phys_seg {
vm_paddr_t start;
vm_paddr_t end;
vm_page_t first_page;
int domain;
struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};
extern struct mem_affinity *mem_affinity;
extern int vm_ndomains;
extern struct vm_phys_seg vm_phys_segs[];
extern int vm_phys_nsegs;
/*
* The following functions are only to be used by the virtual memory system.
@ -58,6 +73,7 @@ vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary);
vm_page_t vm_phys_alloc_freelist_pages(int flind, int pool, int order);
vm_page_t vm_phys_alloc_pages(int pool, int order);
boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high);
int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
vm_memattr_t memattr);
void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end);
@ -70,5 +86,36 @@ void vm_phys_set_pool(int pool, vm_page_t m, int order);
boolean_t vm_phys_unfree_page(vm_page_t m);
boolean_t vm_phys_zero_pages_idle(void);
/*
* vm_phys_domain:
*
* Return the memory domain the page belongs to.
*/
static inline struct vm_domain *
vm_phys_domain(vm_page_t m)
{
#if MAXMEMDOM > 1
int domn, segind;
/* XXXKIB try to assert that the page is managed */
segind = m->segind;
KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m));
domn = vm_phys_segs[segind].domain;
KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m));
return (&vm_dom[domn]);
#else
return (&vm_dom[0]);
#endif
}
static inline void
vm_phys_freecnt_adj(vm_page_t m, int adj)
{
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
cnt.v_free_count += adj;
vm_phys_domain(m)->vmd_free_count += adj;
}
#endif /* _KERNEL */
#endif /* !_VM_PHYS_H_ */


@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <sys/unistd.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>


@ -31,10 +31,14 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <contrib/dev/acpica/include/acpi.h>