Further reduce the cacheline footprint of fast allocations by duplicating
the zone size and flags fields in the per-cpu caches.  This allows fast
allocations to proceed only touching the single per-cpu cacheline and
simplifies the common case when no ctor/dtor is specified.

Reviewed by:	markj, rlibby
Differential Revision:	https://reviews.freebsd.org/D22826
Jeff Roberson 2019-12-25 20:57:24 +00:00
parent 376b1ba394
commit cc7ce83ae0
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=356081
2 changed files with 115 additions and 44 deletions
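
The idea, as a minimal userspace C sketch (illustration only, not the committed code: the layout mirrors the real uma_cache/uma_cache_bucket but the names are simplified, and run_ctor() is a hypothetical stand-in for the item_ctor() slow path). Each per-cpu cache occupies exactly one 64-byte cacheline, and otherwise-spare fields in its embedded buckets carry copies of the zone's size and flags, so a cache hit never dereferences the zone header:

/* Minimal sketch, not the committed code; names simplified. */
#include <stdint.h>
#include <string.h>

#define CTORDTOR    0x01u   /* stand-in for UMA_ZFLAG_CTORDTOR */

struct bucket {
    void    *ub_bucket[64];     /* the cached items */
};

struct cache_bucket {           /* 16 bytes */
    struct bucket   *ucb_bucket;
    int16_t         ucb_cnt;    /* items available in ucb_bucket */
    int16_t         ucb_entries;/* bucket capacity */
    uint32_t        ucb_spare;  /* spare bits reused for zone data */
};

/* One per CPU; three buckets plus two counters pad to one 64-byte line. */
struct cache {
    struct cache_bucket uc_freebucket;  /* ucb_spare: zone flags */
    struct cache_bucket uc_allocbucket; /* ucb_spare: zone size */
    struct cache_bucket uc_crossbucket;
    uint64_t            uc_allocs;
    uint64_t            uc_frees;
} __attribute__((aligned(64)));

/* Hypothetical stand-in for the item_ctor() slow path. */
static void *
run_ctor(void *item, uint32_t size, int zero)
{
    if (zero)
        memset(item, 0, size);
    return (item);
}

/* Fast-path allocation: a cache hit touches only the per-cpu line. */
static void *
cache_alloc_fast(struct cache *c, int zero)
{
    struct cache_bucket *b = &c->uc_allocbucket;
    uint32_t size = c->uc_allocbucket.ucb_spare;    /* cache_uz_size() */
    uint32_t flags = c->uc_freebucket.ucb_spare;    /* cache_uz_flags() */
    void *item;

    if (b->ucb_cnt == 0)
        return (NULL);      /* miss: refill the bucket from the zone */
    item = b->ucb_bucket->ub_bucket[--b->ucb_cnt];  /* bucket pop */
    if (flags & CTORDTOR)   /* rare: zones with a ctor/dtor */
        return (run_ctor(item, size, zero));
    if (zero)
        memset(item, 0, size);
    return (item);
}

The cost is a few duplicated bytes per CPU; in exchange the hottest path loses a shared cacheline reference. The zone_update_caches() helper added in the diff below keeps the duplicated copies coherent whenever uz_size or uz_flags change.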

sys/vm/uma_core.c

@@ -281,7 +281,6 @@ static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
 static int zone_import(void *, void **, int, int, int);
 static void zone_release(void *, void **, int);
-static void uma_zero_item(void *, uma_zone_t);
 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);
@@ -2183,6 +2182,17 @@ zone_count(uma_zone_t zone, void *arg)
     zone->uz_namecnt + 1);
 }
 
+static void
+zone_update_caches(uma_zone_t zone)
+{
+    int i;
+
+    for (i = 0; i <= mp_maxid; i++) {
+        cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size);
+        cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags);
+    }
+}
+
 /*
  * Zone header ctor.  This initializes all fields, locks, etc.
  *
@@ -2228,7 +2238,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
 #ifdef INVARIANTS
     if (arg->uminit == trash_init && arg->fini == trash_fini)
-        zone->uz_flags |= UMA_ZFLAG_TRASH;
+        zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR;
 #endif
 
     /*
@@ -2327,6 +2337,9 @@ zone_ctor(void *mem, int size, void *udata, int flags)
     else
         zone->uz_bucket_size = bucket_select(zone->uz_size);
     zone->uz_bucket_size_min = zone->uz_bucket_size;
+    if (zone->uz_dtor != NULL || zone->uz_ctor != NULL)
+        zone->uz_flags |= UMA_ZFLAG_CTORDTOR;
+    zone_update_caches(zone);
 
     return (0);
 }
@@ -2801,8 +2814,14 @@ uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
     uma_zfree_arg(zone, item, udata);
 }
 
+#ifdef INVARIANTS
+#define UMA_ALWAYS_CTORDTOR 1
+#else
+#define UMA_ALWAYS_CTORDTOR 0
+#endif
+
 static void *
-item_ctor(uma_zone_t zone, void *udata, int flags, void *item)
+item_ctor(uma_zone_t zone, int size, void *udata, int flags, void *item)
 {
 #ifdef INVARIANTS
     bool skipdbg;
@@ -2810,10 +2829,10 @@ item_ctor(uma_zone_t zone, void *udata, int flags, void *item)
     skipdbg = uma_dbg_zskip(zone, item);
     if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
         zone->uz_ctor != trash_ctor)
-        trash_ctor(item, zone->uz_size, udata, flags);
+        trash_ctor(item, size, udata, flags);
 #endif
     if (__predict_false(zone->uz_ctor != NULL) &&
-        zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
+        zone->uz_ctor(item, size, udata, flags) != 0) {
         counter_u64_add(zone->uz_fails, 1);
         zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
         return (NULL);
@@ -2823,13 +2842,14 @@ item_ctor(uma_zone_t zone, void *udata, int flags, void *item)
         uma_dbg_alloc(zone, NULL, item);
 #endif
     if (flags & M_ZERO)
-        uma_zero_item(item, zone);
+        bzero(item, size);
 
     return (item);
 }
 
 static inline void
-item_dtor(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
+item_dtor(uma_zone_t zone, void *item, int size, void *udata,
+    enum zfreeskip skip)
 {
 #ifdef INVARIANTS
     bool skipdbg;
@@ -2842,13 +2862,13 @@ item_dtor(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
         uma_dbg_free(zone, NULL, item);
     }
 #endif
-    if (skip < SKIP_DTOR) {
+    if (__predict_true(skip < SKIP_DTOR)) {
         if (zone->uz_dtor != NULL)
-            zone->uz_dtor(item, zone->uz_size, udata);
+            zone->uz_dtor(item, size, udata);
 #ifdef INVARIANTS
         if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
             zone->uz_dtor != trash_dtor)
-            trash_dtor(item, zone->uz_size, udata);
+            trash_dtor(item, size, udata);
 #endif
     }
 }
@@ -2860,7 +2880,7 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
     uma_cache_bucket_t bucket;
     uma_cache_t cache;
     void *item;
-    int cpu, domain;
+    int domain, size, uz_flags;
 
     /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
     random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
@@ -2869,16 +2889,21 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
     CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
         curthread, zone->uz_name, zone, flags);
+#ifdef WITNESS
     if (flags & M_WAITOK) {
         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
             "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
     }
+#endif
+#ifdef INVARIANTS
     KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
     KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
         ("uma_zalloc_arg: called with spinlock or critical section held"));
     if (zone->uz_flags & UMA_ZONE_PCPU)
         KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
             "with M_ZERO passed"));
+#endif
 
 #ifdef DEBUG_MEMGUARD
     if (memguard_cmp_zone(zone)) {
@@ -2912,13 +2937,19 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
      */
     critical_enter();
     do {
-        cpu = curcpu;
-        cache = &zone->uz_cpu[cpu];
+        cache = &zone->uz_cpu[curcpu];
         bucket = &cache->uc_allocbucket;
+        size = cache_uz_size(cache);
+        uz_flags = cache_uz_flags(cache);
         if (__predict_true(bucket->ucb_cnt != 0)) {
             item = cache_bucket_pop(cache, bucket);
             critical_exit();
-            return (item_ctor(zone, udata, flags, item));
+            if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
+                UMA_ALWAYS_CTORDTOR))
+                return (item_ctor(zone, size, udata, flags, item));
+            if (flags & M_ZERO)
+                bzero(item, size);
+            return (item);
         }
     } while (cache_alloc(zone, cache, udata, flags));
     critical_exit();
@@ -2926,7 +2957,7 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
     /*
      * We can not get a bucket so try to return a single item.
      */
-    if (zone->uz_flags & UMA_ZONE_NUMA)
+    if (uz_flags & UMA_ZONE_NUMA)
         domain = PCPU_GET(domain);
     else
         domain = UMA_ANYDOMAIN;
@@ -2945,7 +2976,7 @@ cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
 {
     uma_zone_domain_t zdom;
     uma_bucket_t bucket;
-    int cpu, domain;
+    int domain;
     bool lockfail;
 
     CRITICAL_ASSERT(curthread);
@@ -2988,8 +3019,7 @@ cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
     if (zone->uz_bucket_size == 0 || bucketdisable)
         return (false);
 
-    cpu = curcpu;
-    cache = &zone->uz_cpu[cpu];
+    cache = &zone->uz_cpu[curcpu];
 
     /* See if we lost the race to fill the cache. */
     if (cache->uc_allocbucket.ucb_bucket != NULL) {
@@ -3040,8 +3070,7 @@ cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
      * initialized bucket to make this less likely or claim
      * the memory directly.
      */
-    cpu = curcpu;
-    cache = &zone->uz_cpu[cpu];
+    cache = &zone->uz_cpu[curcpu];
     if (cache->uc_allocbucket.ucb_bucket == NULL &&
         ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
         domain == PCPU_GET(domain))) {
@@ -3437,7 +3466,7 @@ zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
             goto fail_cnt;
         }
     }
-    item = item_ctor(zone, udata, flags, item);
+    item = item_ctor(zone, zone->uz_size, udata, flags, item);
     if (item == NULL)
         goto fail;
@@ -3467,7 +3496,7 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 {
     uma_cache_t cache;
     uma_cache_bucket_t bucket;
-    int cpu, domain, itemdomain;
+    int domain, itemdomain, uz_flags;
 
     /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
     random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
@@ -3491,14 +3520,26 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
         return;
     }
 #endif
-    item_dtor(zone, item, udata, SKIP_NONE);
+
+    /*
+     * We are accessing the per-cpu cache without a critical section to
+     * fetch size and flags.  This is acceptable, if we are preempted we
+     * will simply read another cpu's line.
+     */
+    cache = &zone->uz_cpu[curcpu];
+    uz_flags = cache_uz_flags(cache);
+    if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
+        UMA_ALWAYS_CTORDTOR))
+        item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
 
     /*
      * The race here is acceptable. If we miss it we'll just have to wait
      * a little longer for the limits to be reset.
      */
-    if (zone->uz_sleepers > 0)
-        goto zfree_item;
+    if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) {
+        if (zone->uz_sleepers > 0)
+            goto zfree_item;
+    }
 
     /*
      * If possible, free to the per-CPU cache.  There are two
@@ -3514,16 +3555,14 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
     domain = itemdomain = 0;
     critical_enter();
     do {
-        cpu = curcpu;
-        cache = &zone->uz_cpu[cpu];
+        cache = &zone->uz_cpu[curcpu];
         bucket = &cache->uc_allocbucket;
 #ifdef UMA_XDOMAIN
-        if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
+        if ((uz_flags & UMA_ZONE_NUMA) != 0) {
             itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
             domain = PCPU_GET(domain);
         }
-        if ((zone->uz_flags & UMA_ZONE_NUMA) != 0 &&
-            domain != itemdomain) {
+        if ((uz_flags & UMA_ZONE_NUMA) != 0 && domain != itemdomain) {
             bucket = &cache->uc_crossbucket;
         } else
 #endif
@@ -3615,15 +3654,14 @@ cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
     int itemdomain)
 {
     uma_bucket_t bucket;
-    int cpu, domain;
+    int domain;
 
     CRITICAL_ASSERT(curthread);
     if (zone->uz_bucket_size == 0 || bucketdisable)
         return false;
 
-    cpu = curcpu;
-    cache = &zone->uz_cpu[cpu];
+    cache = &zone->uz_cpu[curcpu];
 
     /*
      * NUMA domains need to free to the correct zdom.  When XDOMAIN
@@ -3660,8 +3698,7 @@ cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
     critical_enter();
     if (bucket == NULL)
         return (false);
-    cpu = curcpu;
-    cache = &zone->uz_cpu[cpu];
+    cache = &zone->uz_cpu[curcpu];
 #ifdef UMA_XDOMAIN
     /*
      * Check to see if we should be populating the cross bucket.  If it
@@ -3783,7 +3820,7 @@ static void
 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
 {
 
-    item_dtor(zone, item, udata, skip);
+    item_dtor(zone, item, zone->uz_size, udata, skip);
 
     if (skip < SKIP_FINI && zone->uz_fini)
         zone->uz_fini(item, zone->uz_size);
@@ -3819,6 +3856,8 @@ uma_zone_set_max(uma_zone_t zone, int nitems)
     if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
         zone->uz_bucket_size_min = zone->uz_bucket_size_max;
     zone->uz_max_items = nitems;
+    zone->uz_flags |= UMA_ZFLAG_LIMIT;
+    zone_update_caches(zone);
     ZONE_UNLOCK(zone);
 
     return (nitems);
@@ -4086,7 +4125,9 @@ uma_zone_reserve_kva(uma_zone_t zone, int count)
 #else
     keg->uk_allocf = noobj_alloc;
 #endif
-    keg->uk_flags |= UMA_ZONE_NOFREE;
+    keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
+    zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
+    zone_update_caches(zone);
     ZONE_UNLOCK(zone);
 
     return (1);
@@ -4233,13 +4274,6 @@ uma_zone_exhausted_nolock(uma_zone_t zone)
     return (zone->uz_sleepers > 0);
 }
 
-static void
-uma_zero_item(void *item, uma_zone_t zone)
-{
-
-    bzero(item, zone->uz_size);
-}
-
 unsigned long
 uma_limit(void)
 {

sys/vm/uma_int.h

@@ -217,6 +217,39 @@ typedef struct uma_cache * uma_cache_t;
 
 LIST_HEAD(slabhead, uma_slab);
 
+/*
+ * The cache structure pads perfectly into 64 bytes so we use spare
+ * bits from the embedded cache buckets to store information from the zone
+ * and keep all fast-path allocations accessing a single per-cpu line.
+ */
+static inline void
+cache_set_uz_flags(uma_cache_t cache, uint32_t flags)
+{
+
+    cache->uc_freebucket.ucb_spare = flags;
+}
+
+static inline void
+cache_set_uz_size(uma_cache_t cache, uint32_t size)
+{
+
+    cache->uc_allocbucket.ucb_spare = size;
+}
+
+static inline uint32_t
+cache_uz_flags(uma_cache_t cache)
+{
+
+    return (cache->uc_freebucket.ucb_spare);
+}
+
+static inline uint32_t
+cache_uz_size(uma_cache_t cache)
+{
+
+    return (cache->uc_allocbucket.ucb_spare);
+}
+
 /*
  * Per-domain slab lists.  Embedded in the kegs.
  */
@@ -442,6 +475,8 @@ struct uma_zone {
 /*
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
  */
+#define UMA_ZFLAG_CTORDTOR  0x01000000  /* Zone has ctor/dtor set. */
+#define UMA_ZFLAG_LIMIT     0x02000000  /* Zone has limit set. */
 #define UMA_ZFLAG_CACHE     0x04000000  /* uma_zcache_create()d it */
 #define UMA_ZFLAG_RECLAIMING    0x08000000  /* Running zone_reclaim(). */
 #define UMA_ZFLAG_BUCKET    0x10000000  /* Bucket zone. */
@@ -459,6 +494,8 @@ struct uma_zone {
     "\35BUCKET" \
     "\34RECLAIMING" \
     "\33CACHE"  \
+    "\32LIMIT"  \
+    "\31CTORDTOR"   \
     "\22MINBUCKET"  \
     "\21NUMA"   \
     "\20PCPU"   \