Extend uma_reclaim() to permit different reclamation targets.
The page daemon periodically invokes uma_reclaim() to reclaim cached
items from each zone when the system is under memory pressure.  This is
important since the size of these caches is unbounded by default.
However, it also results in bursts of high latency when allocating from
heavily used zones, as threads miss in the per-CPU caches and must
access the keg in order to allocate new items.  With r340405 we maintain
an estimate of each zone's usage of its (per-NUMA domain) cache of full
buckets.  Start making use of this estimate to avoid reclaiming the
entire cache when under memory pressure.  In particular, introduce TRIM,
DRAIN and DRAIN_CPU verbs for uma_reclaim() and uma_zone_reclaim().
When trimming, only items in excess of the estimate are reclaimed.
Draining a zone reclaims all of the cached full buckets (the previous
behaviour of uma_reclaim()), and may further drain the per-CPU caches in
extreme cases.

Now, when under memory pressure, the page daemon will trim zones rather
than draining them.  As a result, heavily used zones do not incur bursts
of bucket cache misses following reclamation, but large, unused caches
will be reclaimed as before.

Reviewed by:	jeff
Tested by:	pho (an earlier version)
MFC after:	2 months
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D16667
parent b1573a081f
commit 5451b35f06
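The change boils down to a three-verb reclamation API.  The sketch below is
illustrative only: the UMA_RECLAIM_* constants, uma_reclaim() and
uma_zone_reclaim() are the interfaces introduced by this commit, while the
handler and the zone name are hypothetical.

/*
 * Illustrative sketch only.  The reclamation verbs and the two functions
 * come from this commit (sys/vm/uma.h); example_zone and the handler are
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

static uma_zone_t example_zone;		/* hypothetical zone */

static void
example_lowmem(bool severe)
{
	/* Routine pressure: release only items beyond the working-set estimate. */
	uma_zone_reclaim(example_zone, UMA_RECLAIM_TRIM);

	/* Severe shortage: flush the bucket cache and the per-CPU caches too. */
	if (severe)
		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
}

Under routine pressure the page daemon now issues the TRIM request, so heavily
used zones keep their working set; DRAIN_CPU is reserved for severe shortages.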
@@ -474,9 +474,9 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle)
 				ret = kread(kvm, &uz.uz_domain[i], &uzd,
 				    sizeof(uzd), 0);
 				for (ubp =
-				    LIST_FIRST(&uzd.uzd_buckets);
+				    TAILQ_FIRST(&uzd.uzd_buckets);
 				    ubp != NULL;
-				    ubp = LIST_NEXT(&ub, ub_link)) {
+				    ubp = TAILQ_NEXT(&ub, ub_link)) {
 					ret = kread(kvm, ubp, &ub,
 					    sizeof(ub), 0);
 					mtp->mt_zonefree += ub.ub_cnt;
@@ -2281,6 +2281,8 @@ MLINKS+=vrele.9 vput.9 \
 	vrele.9 vunref.9
 MLINKS+=vslock.9 vsunlock.9
 MLINKS+=zone.9 uma.9 \
+	zone.9 uma_prealloc.9 \
+	zone.9 uma_reclaim.9 \
 	zone.9 uma_zalloc.9 \
 	zone.9 uma_zalloc_arg.9 \
 	zone.9 uma_zalloc_domain.9 \
@@ -2296,7 +2298,7 @@ MLINKS+=zone.9 uma.9 \
 	zone.9 uma_zfree_pcpu_arg.9 \
 	zone.9 uma_zone_get_cur.9 \
 	zone.9 uma_zone_get_max.9 \
-	zone.9 uma_zone_prealloc.9 \
+	zone.9 uma_zone_reclaim.9 \
 	zone.9 uma_zone_reserve.9 \
 	zone.9 uma_zone_reserve_kva.9 \
 	zone.9 uma_zone_set_allocf.9 \
@@ -25,7 +25,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 30, 2019
+.Dd September 1, 2019
 .Dt UMA 9
 .Os
 .Sh NAME
@@ -98,6 +98,10 @@ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 .Ft void
 .Fn uma_zone_reserve_kva "uma_zone_t zone" "int nitems"
+.Ft void
+.Fn uma_reclaim "int req"
+.Ft void
+.Fn uma_zone_reclaim "uma_zone_t zone" "int req"
 .Ft void
 .Fn uma_zone_set_allocf "uma_zone_t zone" "uma_alloc allocf"
 .Ft void
 .Fn uma_zone_set_freef "uma_zone_t zone" "uma_free freef"
@@ -438,6 +442,32 @@ does not restrict the use of the pre-allocation to
 requests.
+.Pp
+The
+.Fn uma_reclaim
+and
+.Fn uma_zone_reclaim
+functions reclaim cached items from UMA zones, releasing unused memory.
+The
+.Fn uma_reclaim
+function reclaims items from all regular zones, while
+.Fn uma_zone_reclaim
+reclaims items only from the specified zone.
+The
+.Fa req
+parameter must be one of three values which specify how aggressively
+items are to be reclaimed:
+.Bl -tag -width indent
+.It Dv UMA_RECLAIM_TRIM
+Reclaim items only in excess of the zone's estimated working set size.
+The working set size is periodically updated and tracks the recent history
+of the zone's usage.
+.It Dv UMA_RECLAIM_DRAIN
+Reclaim all items from the unbounded cache.
+Free items in the per-CPU caches are left alone.
+.It Dv UMA_RECLAIM_DRAIN_CPU
+Reclaim all cached items.
+.El
 .Pp
 The
 .Fn uma_zone_set_allocf
 and
 .Fn uma_zone_set_freef
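To make the request types above concrete, here is a short hypothetical example
of the per-zone interface; foo_zone and its callers are invented, and only
uma_zone_reclaim() and the UMA_RECLAIM_* values come from the documented API.

#include <vm/uma.h>

static uma_zone_t foo_zone;		/* hypothetical zone owned by a subsystem */

static void
foo_cache_invalidated(void)
{
	/* Cached items are stale: give the full-bucket cache back to the keg. */
	uma_zone_reclaim(foo_zone, UMA_RECLAIM_DRAIN);
}

static void
foo_periodic_trim(void)
{
	/* Shed only what exceeds the zone's estimated working set. */
	uma_zone_reclaim(foo_zone, UMA_RECLAIM_TRIM);
}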
@@ -238,14 +238,14 @@ void
 kmem_cache_reap_soon(kmem_cache_t *cache)
 {
 #ifndef KMEM_DEBUG
-	zone_drain(cache->kc_zone);
+	uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
 #endif
 }
 
 void
 kmem_reap(void)
 {
-	uma_reclaim();
+	uma_reclaim(UMA_RECLAIM_TRIM);
 }
 #else
 void
@@ -711,14 +711,14 @@ mb_dtor_pack(void *mem, int size, void *arg)
 #endif
 	/*
 	 * If there are processes blocked on zone_clust, waiting for pages
-	 * to be freed up, * cause them to be woken up by draining the
-	 * packet zone.  We are exposed to a race here * (in the check for
+	 * to be freed up, cause them to be woken up by draining the
+	 * packet zone.  We are exposed to a race here (in the check for
 	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
 	 * is deliberate. We don't want to acquire the zone lock for every
 	 * mbuf free.
 	 */
 	if (uma_zone_exhausted_nolock(zone_clust))
-		zone_drain(zone_pack);
+		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 }
 
 /*
@@ -1362,7 +1362,7 @@ m_clget(struct mbuf *m, int how)
 	 * we might be able to loosen a few clusters up on the drain.
 	 */
 	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
-		zone_drain(zone_pack);
+		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 		uma_zalloc_arg(zone_clust, m, how);
 	}
 	MBUF_PROBE2(m__clget, m, how);
@@ -588,7 +588,7 @@ qc_drain(vmem_t *vm)
 
 	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
 	for (i = 0; i < qcache_idx_max; i++)
-		zone_drain(vm->vm_qcache[i].qc_cache);
+		uma_zone_reclaim(vm->vm_qcache[i].qc_cache, UMA_RECLAIM_DRAIN);
 }
 
 #ifndef UMA_MD_SMALL_ALLOC
@@ -1321,7 +1321,7 @@ vnlru_proc(void)
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
-			uma_reclaim();
+			uma_reclaim(UMA_RECLAIM_DRAIN);
 		if (done == 0) {
 			if (force == 0 || force == 1) {
 				force = 2;
sys/vm/uma.h
@@ -50,8 +50,6 @@ struct uma_zone;
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
-void zone_drain(uma_zone_t);
-
 /*
  * Item constructor
  *
@@ -438,17 +436,18 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain,
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
 /*
- * Reclaims unused memory for all zones
+ * Reclaims unused memory
  *
  * Arguments:
- *	None
+ *	req	Reclamation request type.
  * Returns:
  *	None
- *
- * This should only be called by the page out daemon.
  */
-
-void uma_reclaim(void);
+#define	UMA_RECLAIM_DRAIN	1	/* release bucket cache */
+#define	UMA_RECLAIM_DRAIN_CPU	2	/* release bucket and per-CPU caches */
+#define	UMA_RECLAIM_TRIM	3	/* trim bucket cache to WSS */
+void uma_reclaim(int req);
+void uma_zone_reclaim(uma_zone_t, int req);
 
 /*
  * Sets the alignment mask to be used for all zones requesting cache
@@ -142,7 +142,7 @@ static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 static char *bootmem;
 static int boot_pages;
 
-static struct sx uma_drain_lock;
+static struct sx uma_reclaim_lock;
 
 /*
  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
@@ -250,7 +250,7 @@ static void pcpu_page_free(void *, vm_size_t, uint8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
-static void bucket_cache_drain(uma_zone_t zone);
+static void bucket_cache_reclaim(uma_zone_t zone, bool);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
@@ -467,27 +467,36 @@ bucket_zone_drain(void)
 	struct uma_bucket_zone *ubz;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
-		zone_drain(ubz->ubz_zone);
+		uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
 }
 
+/*
+ * Attempt to satisfy an allocation by retrieving a full bucket from one of the
+ * zone's caches.
+ */
 static uma_bucket_t
-zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
+zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
 {
 	uma_bucket_t bucket;
 
 	ZONE_LOCK_ASSERT(zone);
 
-	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
+	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
-		LIST_REMOVE(bucket, ub_link);
+		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
 		zdom->uzd_nitems -= bucket->ub_cnt;
-		if (ws && zdom->uzd_imin > zdom->uzd_nitems)
+		if (zdom->uzd_imin > zdom->uzd_nitems)
 			zdom->uzd_imin = zdom->uzd_nitems;
 		zone->uz_bkt_count -= bucket->ub_cnt;
 	}
 	return (bucket);
 }
 
+/*
+ * Insert a full bucket into the specified cache.  The "ws" parameter indicates
+ * whether the bucket's contents should be counted as part of the zone's working
+ * set.
+ */
 static void
 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
     const bool ws)
@@ -497,7 +506,10 @@ zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
 	KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow",
 	    __func__, zone));
 
-	LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
+	if (ws)
+		TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
+	else
+		TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
 		zdom->uzd_imax = zdom->uzd_nitems;
@@ -558,7 +570,7 @@ zone_domain_update_wss(uma_zone_domain_t zdom)
 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
 	wss = zdom->uzd_imax - zdom->uzd_imin;
 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
-	zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
+	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
 }
 
 /*
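The weighting change above makes the working-set estimator favour the latest
interval (4/5 new, 1/5 history, instead of 3/5 and 2/5).  A self-contained
userland sketch of the arithmetic, with invented sample numbers:

/*
 * Userland sketch of the working-set arithmetic, for illustration only.
 * The field names mirror uma_zone_domain; the sample values are invented.
 */
#include <stdio.h>

struct wss_sim {
	long imax;	/* maximum item count this period */
	long imin;	/* minimum item count this period */
	long nitems;	/* current item count */
	long wss;	/* smoothed working-set estimate */
};

static void
update_wss(struct wss_sim *d)
{
	long wss = d->imax - d->imin;	/* items consumed this interval */

	d->imax = d->imin = d->nitems;
	/* New weighting: 4/5 latest interval, 1/5 history (was 3/5 and 2/5). */
	d->wss = (4 * wss + d->wss) / 5;
}

int
main(void)
{
	struct wss_sim d = { .imax = 900, .imin = 100, .nitems = 500, .wss = 200 };

	update_wss(&d);
	/* (4 * 800 + 200) / 5 = 680: the estimate tracks a busy interval quickly. */
	printf("wss = %ld\n", d.wss);
	return (0);
}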
@@ -609,11 +621,12 @@ zone_timeout(uma_zone_t zone)
 			return;
 		}
 	}
+	KEG_UNLOCK(keg);
 
+	ZONE_LOCK(zone);
 	for (int i = 0; i < vm_ndomains; i++)
 		zone_domain_update_wss(&zone->uz_domain[i]);
-
-	KEG_UNLOCK(keg);
+	ZONE_UNLOCK(zone);
 }
 
 /*
@@ -777,7 +790,7 @@ cache_drain(uma_zone_t zone)
 	 * XXX: It would good to be able to assert that the zone is being
 	 * torn down to prevent improper use of cache_drain().
 	 *
-	 * XXX: We lock the zone before passing into bucket_cache_drain() as
+	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
 	 * it is used elsewhere.  Should the tear-down path be made special
 	 * there in some form?
 	 */
@@ -797,7 +810,7 @@ cache_drain(uma_zone_t zone)
 		cache->uc_crossbucket = NULL;
 	}
 	ZONE_LOCK(zone);
-	bucket_cache_drain(zone);
+	bucket_cache_reclaim(zone, true);
 	ZONE_UNLOCK(zone);
 }
 
@@ -869,7 +882,7 @@ cache_drain_safe_cpu(uma_zone_t zone)
  * Zone lock must not be held on call this function.
  */
 static void
-cache_drain_safe(uma_zone_t zone)
+pcpu_cache_drain_safe(uma_zone_t zone)
 {
 	int cpu;
 
@@ -897,22 +910,46 @@ cache_drain_safe(uma_zone_t zone)
 }
 
 /*
- * Drain the cached buckets from a zone.  Expects a locked zone on entry.
+ * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
+ * requested a drain, otherwise the per-domain caches are trimmed to either
+ * estimated working set size.
 */
static void
-bucket_cache_drain(uma_zone_t zone)
+bucket_cache_reclaim(uma_zone_t zone, bool drain)
{
	uma_zone_domain_t zdom;
	uma_bucket_t bucket;
+	long target, tofree;
	int i;

-	/*
-	 * Drain the bucket queues and free the buckets.
-	 */
	for (i = 0; i < vm_ndomains; i++) {
		zdom = &zone->uz_domain[i];
-		while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
-		    NULL) {
+
+		/*
+		 * If we were asked to drain the zone, we are done only once
+		 * this bucket cache is empty.  Otherwise, we reclaim items in
+		 * excess of the zone's estimated working set size.  If the
+		 * difference nitems - imin is larger than the WSS estimate,
+		 * then the estimate will grow at the end of this interval and
+		 * we ignore the historical average.
+		 */
+		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
+		    zdom->uzd_imin);
+		while (zdom->uzd_nitems > target) {
+			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
+			if (bucket == NULL)
+				break;
+			tofree = bucket->ub_cnt;
+			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
+			zdom->uzd_nitems -= tofree;
+
+			/*
+			 * Shift the bounds of the current WSS interval to avoid
+			 * perturbing the estimate.
+			 */
+			zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
+			zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);
+
 			ZONE_UNLOCK(zone);
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, NULL);
@@ -921,8 +958,8 @@ bucket_cache_drain(uma_zone_t zone)
 	}
 
 	/*
-	 * Shrink further bucket sizes.  Price of single zone lock collision
-	 * is probably lower then price of global cache drain.
+	 * Shrink the zone bucket size to ensure that the per-CPU caches
+	 * don't grow too large.
 	 */
 	if (zone->uz_count > zone->uz_count_min)
 		zone->uz_count--;
@@ -1020,7 +1057,7 @@ keg_drain(uma_keg_t keg)
 }
 
 static void
-zone_drain_wait(uma_zone_t zone, int waitok)
+zone_reclaim(uma_zone_t zone, int waitok, bool drain)
 {
 
 	/*
@@ -1030,14 +1067,15 @@ zone_drain_wait(uma_zone_t zone, int waitok)
 	 * when it wakes up.
 	 */
 	ZONE_LOCK(zone);
-	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
+	while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
 		if (waitok == M_NOWAIT)
 			goto out;
 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 	}
-	zone->uz_flags |= UMA_ZFLAG_DRAINING;
-	bucket_cache_drain(zone);
+	zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
+	bucket_cache_reclaim(zone, drain);
 	ZONE_UNLOCK(zone);
+
 	/*
 	 * The DRAINING flag protects us from being freed while
 	 * we're running.  Normally the uma_rwlock would protect us but we
@@ -1045,17 +1083,24 @@ zone_drain_wait(uma_zone_t zone, int waitok)
 	 */
 	keg_drain(zone->uz_keg);
 	ZONE_LOCK(zone);
-	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
+	zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
 	wakeup(zone);
 out:
 	ZONE_UNLOCK(zone);
 }
 
-void
+static void
 zone_drain(uma_zone_t zone)
 {
 
-	zone_drain_wait(zone, M_NOWAIT);
+	zone_reclaim(zone, M_NOWAIT, true);
 }
 
+static void
+zone_trim(uma_zone_t zone)
+{
+
+	zone_reclaim(zone, M_NOWAIT, false);
+}
+
 /*
@@ -1756,6 +1801,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
 	uma_zone_t zone = mem;
 	uma_zone_t z;
 	uma_keg_t keg;
+	int i;
 
 	bzero(zone, size);
 	zone->uz_name = arg->name;
@@ -1783,6 +1829,9 @@ zone_ctor(void *mem, int size, void *udata, int flags)
 		zone->uz_fails = EARLY_COUNTER;
 	}
 
+	for (i = 0; i < vm_ndomains; i++)
+		TAILQ_INIT(&zone->uz_domain[i].uzd_buckets);
+
 	/*
 	 * This is a pure cache zone, no kegs.
 	 */
@@ -1933,7 +1982,7 @@ zone_dtor(void *arg, int size, void *udata)
 	 * released and then refilled before we
 	 * remove it...  we dont care for now
 	 */
-	zone_drain_wait(zone, M_WAITOK);
+	zone_reclaim(zone, M_WAITOK, true);
 	/*
 	 * We only destroy kegs from non secondary/non cache zones.
 	 */
@@ -2138,7 +2187,7 @@ uma_startup2(void)
 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
 	booted = BOOT_BUCKETS;
-	sx_init(&uma_drain_lock, "umadrain");
+	sx_init(&uma_reclaim_lock, "umareclaim");
 	bucket_enable();
 }
 
@@ -2233,12 +2282,12 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 	if (booted < BOOT_BUCKETS) {
 		locked = false;
 	} else {
-		sx_slock(&uma_drain_lock);
+		sx_slock(&uma_reclaim_lock);
 		locked = true;
 	}
 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
-		sx_sunlock(&uma_drain_lock);
+		sx_sunlock(&uma_reclaim_lock);
 	return (res);
 }
 
@@ -2267,13 +2316,13 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
 	if (booted < BOOT_BUCKETS) {
 		locked = false;
 	} else {
-		sx_slock(&uma_drain_lock);
+		sx_slock(&uma_reclaim_lock);
 		locked = true;
 	}
 	/* XXX Attaches only one keg of potentially many. */
 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
-		sx_sunlock(&uma_drain_lock);
+		sx_sunlock(&uma_reclaim_lock);
 	return (res);
 }
 
@@ -2306,9 +2355,9 @@ void
 uma_zdestroy(uma_zone_t zone)
 {
 
-	sx_slock(&uma_drain_lock);
+	sx_slock(&uma_reclaim_lock);
 	zone_free_item(zones, zone, NULL, SKIP_NONE);
-	sx_sunlock(&uma_drain_lock);
+	sx_sunlock(&uma_reclaim_lock);
 }
 
 void
@@ -2521,7 +2570,7 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 		zdom = &zone->uz_domain[0];
 	}
 
-	if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
+	if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 		cache->uc_allocbucket = bucket;
@@ -3672,17 +3721,28 @@ uma_prealloc(uma_zone_t zone, int items)
 }
 
 /* See uma.h */
-static void
-uma_reclaim_locked(bool kmem_danger)
+void
+uma_reclaim(int req)
 {
 
 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
-	sx_assert(&uma_drain_lock, SA_XLOCKED);
+	sx_xlock(&uma_reclaim_lock);
 	bucket_enable();
-	zone_foreach(zone_drain);
-	if (vm_page_count_min() || kmem_danger) {
-		cache_drain_safe(NULL);
+
+	switch (req) {
+	case UMA_RECLAIM_TRIM:
+		zone_foreach(zone_trim);
+		break;
+	case UMA_RECLAIM_DRAIN:
+	case UMA_RECLAIM_DRAIN_CPU:
 		zone_foreach(zone_drain);
+		if (req == UMA_RECLAIM_DRAIN_CPU) {
+			pcpu_cache_drain_safe(NULL);
+			zone_foreach(zone_drain);
+		}
+		break;
+	default:
+		panic("unhandled reclamation request %d", req);
 	}
 
 	/*
@@ -3692,15 +3752,7 @@ uma_reclaim_locked(bool kmem_danger)
 	 */
 	zone_drain(slabzone);
 	bucket_zone_drain();
-}
-
-void
-uma_reclaim(void)
-{
-
-	sx_xlock(&uma_drain_lock);
-	uma_reclaim_locked(false);
-	sx_xunlock(&uma_drain_lock);
+	sx_xunlock(&uma_reclaim_lock);
 }
 
 static volatile int uma_reclaim_needed;
@@ -3718,21 +3770,40 @@ uma_reclaim_worker(void *arg __unused)
 {
 
 	for (;;) {
-		sx_xlock(&uma_drain_lock);
+		sx_xlock(&uma_reclaim_lock);
 		while (atomic_load_int(&uma_reclaim_needed) == 0)
-			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
+			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
 			    hz);
-		sx_xunlock(&uma_drain_lock);
+		sx_xunlock(&uma_reclaim_lock);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
-		sx_xlock(&uma_drain_lock);
-		uma_reclaim_locked(true);
+		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
 		atomic_store_int(&uma_reclaim_needed, 0);
-		sx_xunlock(&uma_drain_lock);
 		/* Don't fire more than once per-second. */
 		pause("umarclslp", hz);
 	}
 }
 
 /* See uma.h */
+void
+uma_zone_reclaim(uma_zone_t zone, int req)
+{
+
+	switch (req) {
+	case UMA_RECLAIM_TRIM:
+		zone_trim(zone);
+		break;
+	case UMA_RECLAIM_DRAIN:
+		zone_drain(zone);
+		break;
+	case UMA_RECLAIM_DRAIN_CPU:
+		pcpu_cache_drain_safe(zone);
+		zone_drain(zone);
+		break;
+	default:
+		panic("unhandled reclamation request %d", req);
+	}
+}
+
+/* See uma.h */
 int
 uma_zone_exhausted(uma_zone_t zone)
@@ -197,7 +197,7 @@ struct uma_hash {
  */
 
 struct uma_bucket {
-	LIST_ENTRY(uma_bucket)	ub_link;	/* Link into the zone */
+	TAILQ_ENTRY(uma_bucket)	ub_link;	/* Link into the zone */
 	int16_t	ub_cnt;				/* Count of items in bucket. */
 	int16_t	ub_entries;			/* Max items. */
 	void	*ub_bucket[];			/* actual allocation storage */
@@ -306,8 +306,10 @@ struct uma_slab {
 
 typedef struct uma_slab * uma_slab_t;
 
+TAILQ_HEAD(uma_bucketlist, uma_bucket);
+
 struct uma_zone_domain {
-	LIST_HEAD(,uma_bucket)	uzd_buckets;	/* full buckets */
+	struct uma_bucketlist	uzd_buckets;	/* full buckets */
 	long		uzd_nitems;	/* total item count */
 	long		uzd_imax;	/* maximum item count this period */
 	long		uzd_imin;	/* minimum item count this period */
@@ -384,7 +386,7 @@ struct uma_zone {
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
  */
 #define	UMA_ZFLAG_CACHE		0x04000000	/* uma_zcache_create()d it */
-#define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
+#define	UMA_ZFLAG_RECLAIMING	0x08000000	/* Running zone_reclaim(). */
 #define	UMA_ZFLAG_BUCKET	0x10000000	/* Bucket zone. */
 #define	UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
 #define	UMA_ZFLAG_CACHEONLY	0x80000000	/* Don't ask VM for buckets. */
@@ -1871,9 +1871,12 @@ vm_pageout_lowmem(void)
 
 		/*
 		 * We do this explicitly after the caches have been
-		 * drained above.
+		 * drained above.  If we have a severe page shortage on
+		 * our hands, completely drain all UMA zones.  Otherwise,
+		 * just prune the caches.
 		 */
-		uma_reclaim();
+		uma_reclaim(vm_page_count_min() ? UMA_RECLAIM_DRAIN_CPU :
+		    UMA_RECLAIM_TRIM);
 		return (true);
 	}
 	return (false);