From aabe13f1450bb4caba66ec2a7a41c0dfefff511d Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 14 Apr 2021 12:57:24 -0400 Subject: [PATCH] uma: Introduce per-domain reclamation functions Make it possible to reclaim items from a specific NUMA domain. - Add uma_zone_reclaim_domain() and uma_reclaim_domain(). - Permit parallel reclamations. Use a counter instead of a flag to synchronize with zone_dtor(). - Use the zone lock to protect cache_shrink() now that parallel reclaims can happen. - Add a sysctl that can be used to trigger reclamation from a specific domain. Currently the new KPIs are unused, so there should be no functional change. Reviewed by: mav MFC after: 2 weeks Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D29685 --- share/man/man9/zone.9 | 14 +++- sys/vm/uma.h | 8 +- sys/vm/uma_core.c | 166 ++++++++++++++++++++++++------------------ sys/vm/uma_int.h | 5 +- sys/vm/vm_kern.c | 29 +++++++- 5 files changed, 144 insertions(+), 78 deletions(-) diff --git a/share/man/man9/zone.9 b/share/man/man9/zone.9 index 7da40b13469b..89d5f3e2640f 100644 --- a/share/man/man9/zone.9 +++ b/share/man/man9/zone.9 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 11, 2021 +.Dd April 14, 2021 .Dt UMA 9 .Os .Sh NAME @@ -98,8 +98,12 @@ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); .Ft void .Fn uma_reclaim "int req" .Ft void +.Fn uma_reclaim_domain "int req" "int domain" +.Ft void .Fn uma_zone_reclaim "uma_zone_t zone" "int req" .Ft void +.Fn uma_zone_reclaim_domain "uma_zone_t zone" "int req" "int domain" +.Ft void .Fn uma_zone_set_allocf "uma_zone_t zone" "uma_alloc allocf" .Ft void .Fn uma_zone_set_freef "uma_zone_t zone" "uma_free freef" @@ -471,6 +475,14 @@ Free items in the per-CPU caches are left alone. .It Dv UMA_RECLAIM_DRAIN_CPU Reclaim all cached items. .El +The +.Fn uma_reclaim_domain +and +.Fn uma_zone_reclaim_domain +functions apply only to items allocated from the specified domain. +In the case of domains using a round-robin NUMA policy, cached items from all +domains are freed to the keg, but only slabs from the specific domain will +be freed. .Pp The .Fn uma_zone_set_allocf diff --git a/sys/vm/uma.h b/sys/vm/uma.h index 361c64900845..5d473ba909b6 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -446,10 +446,12 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); /* - * Reclaims unused memory + * Reclaims unused memory. If no NUMA domain is specified, memory from all + * domains is reclaimed. * * Arguments: - * req Reclamation request type. + * req Reclamation request type. + * domain The target NUMA domain. * Returns: * None */ @@ -457,7 +459,9 @@ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); #define UMA_RECLAIM_DRAIN_CPU 2 /* release bucket and per-CPU caches */ #define UMA_RECLAIM_TRIM 3 /* trim bucket cache to WSS */ void uma_reclaim(int req); +void uma_reclaim_domain(int req, int domain); void uma_zone_reclaim(uma_zone_t, int req); +void uma_zone_reclaim_domain(uma_zone_t, int req, int domain); /* * Sets the alignment mask to be used for all zones requesting cache diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 0348d6468d74..6b0add6b6b07 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -168,17 +168,20 @@ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); -/* This RW lock protects the keg list */ +/* + * Mutex for global lists: uma_kegs, uma_cachezones, and the per-keg list of + * zones. + */ static struct rwlock_padalign __exclusive_cache_line uma_rwlock; +static struct sx uma_reclaim_lock; + /* * First available virual address for boot time allocations. */ static vm_offset_t bootstart; static vm_offset_t bootmem; -static struct sx uma_reclaim_lock; - /* * kmem soft limit, initialized by uma_set_limit(). Ensure that early * allocations don't trigger a wakeup of the reclaim thread. @@ -289,7 +292,7 @@ static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); -static void bucket_cache_reclaim(uma_zone_t zone, bool); +static void bucket_cache_reclaim(uma_zone_t zone, bool, int); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); @@ -315,7 +318,7 @@ static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); -static void bucket_zone_drain(void); +static void bucket_zone_drain(int domain); static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); @@ -525,12 +528,13 @@ bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) } static void -bucket_zone_drain(void) +bucket_zone_drain(int domain) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) - uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN); + uma_zone_reclaim_domain(ubz->ubz_zone, UMA_RECLAIM_DRAIN, + domain); } #ifdef KASAN @@ -1308,7 +1312,7 @@ cache_drain(uma_zone_t zone) bucket_free(zone, bucket, NULL); } } - bucket_cache_reclaim(zone, true); + bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN); } static void @@ -1318,8 +1322,10 @@ cache_shrink(uma_zone_t zone, void *unused) if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; + ZONE_LOCK(zone); zone->uz_bucket_size = (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; + ZONE_UNLOCK(zone); } static void @@ -1442,7 +1448,7 @@ bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, int domain) } static void -bucket_cache_reclaim(uma_zone_t zone, bool drain) +bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain) { int i; @@ -1453,8 +1459,13 @@ bucket_cache_reclaim(uma_zone_t zone, bool drain) if (zone->uz_bucket_size > zone->uz_bucket_size_min) zone->uz_bucket_size--; - for (i = 0; i < vm_ndomains; i++) - bucket_cache_reclaim_domain(zone, drain, i); + if (domain != UMA_ANYDOMAIN && + (zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) { + bucket_cache_reclaim_domain(zone, drain, domain); + } else { + for (i = 0; i < vm_ndomains; i++) + bucket_cache_reclaim_domain(zone, drain, i); + } } static void @@ -1561,63 +1572,65 @@ keg_drain_domain(uma_keg_t keg, int domain) * Returns nothing. */ static void -keg_drain(uma_keg_t keg) +keg_drain(uma_keg_t keg, int domain) { int i; if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0) return; - for (i = 0; i < vm_ndomains; i++) - keg_drain_domain(keg, i); -} - -static void -zone_reclaim(uma_zone_t zone, int waitok, bool drain) -{ - - /* - * Set draining to interlock with zone_dtor() so we can release our - * locks as we go. Only dtor() should do a WAITOK call since it - * is the only call that knows the structure will still be available - * when it wakes up. - */ - ZONE_LOCK(zone); - while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) { - if (waitok == M_NOWAIT) - goto out; - msleep(zone, &ZDOM_GET(zone, 0)->uzd_lock, PVM, "zonedrain", - 1); + if (domain != UMA_ANYDOMAIN) { + keg_drain_domain(keg, domain); + } else { + for (i = 0; i < vm_ndomains; i++) + keg_drain_domain(keg, i); } - zone->uz_flags |= UMA_ZFLAG_RECLAIMING; - ZONE_UNLOCK(zone); - bucket_cache_reclaim(zone, drain); +} +static void +zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain) +{ /* - * The DRAINING flag protects us from being freed while - * we're running. Normally the uma_rwlock would protect us but we - * must be able to release and acquire the right lock for each keg. + * Count active reclaim operations in order to interlock with + * zone_dtor(), which removes the zone from global lists before + * attempting to reclaim items itself. + * + * The zone may be destroyed while sleeping, so only zone_dtor() should + * specify M_WAITOK. */ - if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) - keg_drain(zone->uz_keg); ZONE_LOCK(zone); - zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING; - wakeup(zone); -out: + if (waitok == M_WAITOK) { + while (zone->uz_reclaimers > 0) + msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1); + } + zone->uz_reclaimers++; + ZONE_UNLOCK(zone); + bucket_cache_reclaim(zone, drain, domain); + + if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) + keg_drain(zone->uz_keg, domain); + ZONE_LOCK(zone); + zone->uz_reclaimers--; + if (zone->uz_reclaimers == 0) + wakeup(zone); ZONE_UNLOCK(zone); } static void -zone_drain(uma_zone_t zone, void *unused) +zone_drain(uma_zone_t zone, void *arg) { + int domain; - zone_reclaim(zone, M_NOWAIT, true); + domain = (int)(uintptr_t)arg; + zone_reclaim(zone, domain, M_NOWAIT, true); } static void -zone_trim(uma_zone_t zone, void *unused) +zone_trim(uma_zone_t zone, void *arg) { + int domain; - zone_reclaim(zone, M_NOWAIT, false); + domain = (int)(uintptr_t)arg; + zone_reclaim(zone, domain, M_NOWAIT, false); } /* @@ -2883,7 +2896,7 @@ zone_dtor(void *arg, int size, void *udata) keg = zone->uz_keg; keg->uk_reserve = 0; } - zone_reclaim(zone, M_WAITOK, true); + zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true); /* * We only destroy kegs from non secondary/non cache zones. @@ -3153,9 +3166,9 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, args.flags = flags; args.keg = NULL; - sx_slock(&uma_reclaim_lock); + sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); - sx_sunlock(&uma_reclaim_lock); + sx_xunlock(&uma_reclaim_lock); return (res); } @@ -3181,9 +3194,9 @@ uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; - sx_slock(&uma_reclaim_lock); + sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); - sx_sunlock(&uma_reclaim_lock); + sx_xunlock(&uma_reclaim_lock); return (res); } @@ -3224,9 +3237,9 @@ uma_zdestroy(uma_zone_t zone) if (booted == BOOT_SHUTDOWN && zone->uz_fini == NULL && zone->uz_release == zone_release) return; - sx_slock(&uma_reclaim_lock); + sx_xlock(&uma_reclaim_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); - sx_sunlock(&uma_reclaim_lock); + sx_xunlock(&uma_reclaim_lock); } void @@ -5035,22 +5048,29 @@ uma_zone_memory(uma_zone_t zone) void uma_reclaim(int req) { + uma_reclaim_domain(req, UMA_ANYDOMAIN); +} + +void +uma_reclaim_domain(int req, int domain) +{ + void *arg; - CTR0(KTR_UMA, "UMA: vm asked us to release pages!"); - sx_xlock(&uma_reclaim_lock); bucket_enable(); + arg = (void *)(uintptr_t)domain; + sx_slock(&uma_reclaim_lock); switch (req) { case UMA_RECLAIM_TRIM: - zone_foreach(zone_trim, NULL); + zone_foreach(zone_trim, arg); break; case UMA_RECLAIM_DRAIN: + zone_foreach(zone_drain, arg); + break; case UMA_RECLAIM_DRAIN_CPU: - zone_foreach(zone_drain, NULL); - if (req == UMA_RECLAIM_DRAIN_CPU) { - pcpu_cache_drain_safe(NULL); - zone_foreach(zone_drain, NULL); - } + zone_foreach(zone_drain, arg); + pcpu_cache_drain_safe(NULL); + zone_foreach(zone_drain, arg); break; default: panic("unhandled reclamation request %d", req); @@ -5061,10 +5081,10 @@ uma_reclaim(int req) * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ - zone_drain(slabzones[0], NULL); - zone_drain(slabzones[1], NULL); - bucket_zone_drain(); - sx_xunlock(&uma_reclaim_lock); + zone_drain(slabzones[0], arg); + zone_drain(slabzones[1], arg); + bucket_zone_drain(domain); + sx_sunlock(&uma_reclaim_lock); } static volatile int uma_reclaim_needed; @@ -5099,17 +5119,25 @@ uma_reclaim_worker(void *arg __unused) void uma_zone_reclaim(uma_zone_t zone, int req) { + uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN); +} +void +uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain) +{ + void *arg; + + arg = (void *)(uintptr_t)domain; switch (req) { case UMA_RECLAIM_TRIM: - zone_trim(zone, NULL); + zone_trim(zone, arg); break; case UMA_RECLAIM_DRAIN: - zone_drain(zone, NULL); + zone_drain(zone, arg); break; case UMA_RECLAIM_DRAIN_CPU: pcpu_cache_drain_safe(zone); - zone_drain(zone, NULL); + zone_drain(zone, arg); break; default: panic("unhandled reclamation request %d", req); diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h index 9965e486ca53..93910e78165b 100644 --- a/sys/vm/uma_int.h +++ b/sys/vm/uma_int.h @@ -162,7 +162,6 @@ #define UMA_ZFLAG_CTORDTOR 0x01000000 /* Zone has ctor/dtor set. */ #define UMA_ZFLAG_LIMIT 0x02000000 /* Zone has limit set. */ #define UMA_ZFLAG_CACHE 0x04000000 /* uma_zcache_create()d it */ -#define UMA_ZFLAG_RECLAIMING 0x08000000 /* Running zone_reclaim(). */ #define UMA_ZFLAG_BUCKET 0x10000000 /* Bucket zone. */ #define UMA_ZFLAG_INTERNAL 0x20000000 /* No offpage no PCPU. */ #define UMA_ZFLAG_TRASH 0x40000000 /* Add trash ctor/dtor. */ @@ -175,7 +174,6 @@ "\37TRASH" \ "\36INTERNAL" \ "\35BUCKET" \ - "\34RECLAIMING" \ "\33CACHE" \ "\32LIMIT" \ "\31CTORDTOR" \ @@ -490,7 +488,7 @@ struct uma_zone { char *uz_ctlname; /* sysctl safe name string. */ int uz_namecnt; /* duplicate name count. */ uint16_t uz_bucket_size_min; /* Min number of items in bucket */ - uint16_t uz_pad0; + uint16_t uz_reclaimers; /* pending reclaim operations. */ /* Offset 192, rare read-only. */ struct sysctl_oid *uz_oid; /* sysctl oid pointer. */ @@ -582,6 +580,7 @@ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data); #define ZONE_LOCK(z) ZDOM_LOCK(ZDOM_GET((z), 0)) #define ZONE_UNLOCK(z) ZDOM_UNLOCK(ZDOM_GET((z), 0)) +#define ZONE_LOCKPTR(z) (&ZDOM_GET((z), 0)->uzd_lock) #define ZONE_CROSS_LOCK_INIT(z) \ mtx_init(&(z)->uz_cross_lock, "UMA Cross", NULL, MTX_DEF) diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index a69493d1323f..7ab1fdb8950e 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -907,7 +907,6 @@ debug_vm_lowmem(SYSCTL_HANDLER_ARGS) EVENTHANDLER_INVOKE(vm_lowmem, i); return (0); } - SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0, debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags"); @@ -919,7 +918,7 @@ debug_uma_reclaim(SYSCTL_HANDLER_ARGS) i = 0; error = sysctl_handle_int(oidp, &i, 0, req); - if (error != 0) + if (error != 0 || req->newptr == NULL) return (error); if (i != UMA_RECLAIM_TRIM && i != UMA_RECLAIM_DRAIN && i != UMA_RECLAIM_DRAIN_CPU) @@ -927,7 +926,31 @@ debug_uma_reclaim(SYSCTL_HANDLER_ARGS) uma_reclaim(i); return (0); } - SYSCTL_PROC(_debug, OID_AUTO, uma_reclaim, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0, debug_uma_reclaim, "I", "set to generate request to reclaim uma caches"); + +static int +debug_uma_reclaim_domain(SYSCTL_HANDLER_ARGS) +{ + int domain, error, request; + + request = 0; + error = sysctl_handle_int(oidp, &request, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + + domain = request >> 4; + request &= 0xf; + if (request != UMA_RECLAIM_TRIM && request != UMA_RECLAIM_DRAIN && + request != UMA_RECLAIM_DRAIN_CPU) + return (EINVAL); + if (domain < 0 || domain >= vm_ndomains) + return (EINVAL); + uma_reclaim_domain(request, domain); + return (0); +} +SYSCTL_PROC(_debug, OID_AUTO, uma_reclaim_domain, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, 0, + debug_uma_reclaim_domain, "I", + "");