From ab3185d15ef8958b3a60325620460229f4ae07ce Mon Sep 17 00:00:00 2001 From: Jeff Roberson Date: Fri, 12 Jan 2018 23:25:05 +0000 Subject: [PATCH] Implement NUMA support in uma(9) and malloc(9). Allocations from specific domains can be done by the _domain() API variants. UMA also supports a first-touch policy via the NUMA zone flag. The slab layer is now segregated by VM domains and is precise. It handles iteration for round-robin directly. The per-cpu cache layer remains a mix of domains according to where memory is allocated and freed. Well behaved clients can achieve perfect locality with no performance penalty. The direct domain allocation functions have to visit the slab layer and so require per-zone locks which come at some expense. Reviewed by: Attilio (a slightly older version) Tested by: pho Sponsored by: Netflix, Dell/EMC Isilon --- lib/libmemstat/memstat_uma.c | 26 +- sys/amd64/amd64/uma_machdep.c | 5 +- sys/arm64/arm64/uma_machdep.c | 5 +- sys/i386/i386/pmap.c | 9 +- sys/kern/kern_malloc.c | 260 +++++++++++---- sys/kern/kern_mbuf.c | 7 +- sys/kern/subr_busdma_bufalloc.c | 4 +- sys/kern/subr_vmem.c | 7 +- sys/kern/vfs_bio.c | 4 +- sys/mips/mips/uma_machdep.c | 6 +- sys/powerpc/aim/mmu_oea64.c | 6 +- sys/powerpc/aim/slb.c | 5 +- sys/powerpc/powerpc/uma_machdep.c | 5 +- sys/riscv/riscv/uma_machdep.c | 3 +- sys/sparc64/sparc64/vm_machdep.c | 5 +- sys/sys/malloc.h | 8 + sys/vm/uma.h | 42 ++- sys/vm/uma_core.c | 524 +++++++++++++++++++++--------- sys/vm/uma_int.h | 67 +++- 19 files changed, 729 insertions(+), 269 deletions(-) diff --git a/lib/libmemstat/memstat_uma.c b/lib/libmemstat/memstat_uma.c index 4579a8152b04..423d4711c25e 100644 --- a/lib/libmemstat/memstat_uma.c +++ b/lib/libmemstat/memstat_uma.c @@ -55,6 +55,8 @@ static struct nlist namelist[] = { { .n_name = "_mp_maxid" }, #define X_ALL_CPUS 2 { .n_name = "_all_cpus" }, +#define X_VM_NDOMAINS 3 + { .n_name = "_vm_ndomains" }, { .n_name = "" }, }; @@ -297,11 +299,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle) { LIST_HEAD(, uma_keg) uma_kegs; struct memory_type *mtp; + struct uma_zone_domain uzd; struct uma_bucket *ubp, ub; struct uma_cache *ucp, *ucp_array; struct uma_zone *uzp, uz; struct uma_keg *kzp, kz; - int hint_dontsearch, i, mp_maxid, ret; + int hint_dontsearch, i, mp_maxid, ndomains, ret; char name[MEMTYPE_MAXNAME]; cpuset_t all_cpus; long cpusetsize; @@ -323,6 +326,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle) list->mtl_error = ret; return (-1); } + ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains, + sizeof(ndomains), 0); + if (ret != 0) { + list->mtl_error = ret; + return (-1); + } ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0); if (ret != 0) { list->mtl_error = ret; @@ -447,10 +456,17 @@ skip_percpu: kz.uk_ipers; mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size; mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees; - for (ubp = LIST_FIRST(&uz.uz_buckets); ubp != - NULL; ubp = LIST_NEXT(&ub, ub_link)) { - ret = kread(kvm, ubp, &ub, sizeof(ub), 0); - mtp->mt_zonefree += ub.ub_cnt; + for (i = 0; i < ndomains; i++) { + ret = kread(kvm, &uz.uz_domain[i], &uzd, + sizeof(uzd), 0); + for (ubp = + LIST_FIRST(&uzd.uzd_buckets); + ubp != NULL; + ubp = LIST_NEXT(&ub, ub_link)) { + ret = kread(kvm, ubp, &ub, + sizeof(ub), 0); + mtp->mt_zonefree += ub.ub_cnt; + } } if (!((kz.uk_flags & UMA_ZONE_SECONDARY) && LIST_FIRST(&kz.uk_zones) != uzp)) { diff --git a/sys/amd64/amd64/uma_machdep.c b/sys/amd64/amd64/uma_machdep.c index 41ba8250e5cc..cb960e4b674b 
100644 --- a/sys/amd64/amd64/uma_machdep.c +++ b/sys/amd64/amd64/uma_machdep.c @@ -44,14 +44,15 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); diff --git a/sys/arm64/arm64/uma_machdep.c b/sys/arm64/arm64/uma_machdep.c index 620b8c46828a..597e7e80b062 100644 --- a/sys/arm64/arm64/uma_machdep.c +++ b/sys/arm64/arm64/uma_machdep.c @@ -42,14 +42,15 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 38cc84c19f00..957443863f88 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -338,8 +338,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) -static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, - int wait); +static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait); #endif static void pmap_set_pg(void); @@ -697,12 +697,13 @@ pmap_page_init(vm_page_t m) #if defined(PAE) || defined(PAE_TABLES) static void * -pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { /* Inform UMA that this allocator uses kernel_map/object. 
*/ *flags = UMA_SLAB_KERNEL; - return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, + return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 010b1ee8799b..4a9b9bf27d20 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -96,6 +96,11 @@ __FBSDID("$FreeBSD$"); dtrace_malloc_probe_func_t dtrace_malloc_probe; #endif +#if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) || \ + defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE) +#define MALLOC_DEBUG 1 +#endif + /* * When realloc() is called, if the new size is sufficiently smaller than * the old size, realloc() will allocate a new, smaller block to avoid @@ -417,6 +422,20 @@ contigmalloc(unsigned long size, struct malloc_type *type, int flags, return (ret); } +void * +contigmalloc_domain(unsigned long size, struct malloc_type *type, + int domain, int flags, vm_paddr_t low, vm_paddr_t high, + unsigned long alignment, vm_paddr_t boundary) +{ + void *ret; + + ret = (void *)kmem_alloc_contig_domain(domain, size, flags, low, high, + alignment, boundary, VM_MEMATTR_DEFAULT); + if (ret != NULL) + malloc_type_allocated(type, round_page(size)); + return (ret); +} + /* * contigfree: * @@ -432,26 +451,14 @@ contigfree(void *addr, unsigned long size, struct malloc_type *type) malloc_type_freed(type, round_page(size)); } -/* - * malloc: - * - * Allocate a block of memory. - * - * If M_NOWAIT is set, this routine will not block and return NULL if - * the allocation fails. - */ -void * -malloc(unsigned long size, struct malloc_type *mtp, int flags) +#ifdef MALLOC_DEBUG +static int +malloc_dbg(caddr_t *vap, unsigned long *sizep, struct malloc_type *mtp, + int flags) { - int indx; - struct malloc_type_internal *mtip; - caddr_t va; - uma_zone_t zone; -#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE) - unsigned long osize = size; -#endif - #ifdef INVARIANTS + int indx; + KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic")); /* * Check that exactly one of M_WAITOK or M_NOWAIT is specified. @@ -474,7 +481,8 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) if ((malloc_nowait_count % malloc_failure_rate) == 0) { atomic_add_int(&malloc_failure_count, 1); t_malloc_fail = time_uptime; - return (NULL); + *vap = NULL; + return (EJUSTRETURN); } } #endif @@ -485,16 +493,44 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) ("malloc: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD - if (memguard_cmp_mtp(mtp, size)) { - va = memguard_alloc(size, flags); - if (va != NULL) - return (va); + if (memguard_cmp_mtp(mtp, *sizep)) { + *vap = memguard_alloc(*sizep, flags); + if (*vap != NULL) + return (EJUSTRETURN); /* This is unfortunate but should not be fatal. */ } #endif #ifdef DEBUG_REDZONE - size = redzone_size_ntor(size); + *sizep = redzone_size_ntor(*sizep); +#endif + + return (0); +} +#endif + +/* + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. 
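+ *
+ * A NUMA-aware variant, malloc_domain(), is added below for callers that
+ * need memory backed by a specific domain.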
+ */ +void * +malloc(unsigned long size, struct malloc_type *mtp, int flags) +{ + int indx; + struct malloc_type_internal *mtip; + caddr_t va; + uma_zone_t zone; +#if defined(DEBUG_REDZONE) + unsigned long osize = size; +#endif + +#ifdef MALLOC_DEBUG + if (malloc_dbg(&va, &size, mtp, flags) != 0) + return (va); #endif if (size <= kmem_zmax) { @@ -523,11 +559,55 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); else if (va == NULL) t_malloc_fail = time_uptime; -#ifdef DIAGNOSTIC - if (va != NULL && !(flags & M_ZERO)) { - memset(va, 0x70, osize); - } +#ifdef DEBUG_REDZONE + if (va != NULL) + va = redzone_setup(va, osize); #endif + return ((void *) va); +} + +void * +malloc_domain(unsigned long size, struct malloc_type *mtp, int domain, + int flags) +{ + int indx; + struct malloc_type_internal *mtip; + caddr_t va; + uma_zone_t zone; +#if defined(DEBUG_REDZONE) + unsigned long osize = size; +#endif + +#ifdef MALLOC_DEBUG + if (malloc_dbg(&va, &size, mtp, flags) != 0) + return (va); +#endif + if (size <= kmem_zmax) { + mtip = mtp->ks_handle; + if (size & KMEM_ZMASK) + size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; + indx = kmemsize[size >> KMEM_ZSHIFT]; + KASSERT(mtip->mti_zone < numzones, + ("mti_zone %u out of range %d", + mtip->mti_zone, numzones)); + zone = kmemzones[indx].kz_zone[mtip->mti_zone]; +#ifdef MALLOC_PROFILE + krequests[size >> KMEM_ZSHIFT]++; +#endif + va = uma_zalloc_domain(zone, NULL, domain, flags); + if (va != NULL) + size = zone->uz_size; + malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); + } else { + size = roundup(size, PAGE_SIZE); + zone = NULL; + va = uma_large_malloc_domain(size, domain, flags); + malloc_type_allocated(mtp, va == NULL ? 0 : size); + } + if (flags & M_WAITOK) + KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); + else if (va == NULL) + t_malloc_fail = time_uptime; #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); @@ -545,6 +625,58 @@ mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) return (malloc(size * nmemb, type, flags)); } +#ifdef INVARIANTS +static void +free_save_type(void *addr, struct malloc_type *mtp, u_long size) +{ + struct malloc_type **mtpp = addr; + + /* + * Cache a pointer to the malloc_type that most recently freed + * this memory here. This way we know who is most likely to + * have stepped on it later. + * + * This code assumes that size is a multiple of 8 bytes for + * 64 bit machines + */ + mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR); + mtpp += (size - sizeof(struct malloc_type *)) / + sizeof(struct malloc_type *); + *mtpp = mtp; +} +#endif + +#ifdef MALLOC_DEBUG +static int +free_dbg(void **addrp, struct malloc_type *mtp) +{ + void *addr; + + addr = *addrp; + KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic")); + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("free: called with spinlock or critical section held")); + + /* free(NULL, ...) 
does nothing */ + if (addr == NULL) + return (EJUSTRETURN); + +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(addr)) { + memguard_free(addr); + return (EJUSTRETURN); + } +#endif + +#ifdef DEBUG_REDZONE + redzone_check(addr); + *addrp = redzone_addr_ntor(addr); +#endif + + return (0); +} +#endif + /* * free: * @@ -558,51 +690,23 @@ free(void *addr, struct malloc_type *mtp) uma_slab_t slab; u_long size; - KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic")); - KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), - ("free: called with spinlock or critical section held")); - +#ifdef MALLOC_DEBUG + if (free_dbg(&addr, mtp) != 0) + return; +#endif /* free(NULL, ...) does nothing */ if (addr == NULL) return; -#ifdef DEBUG_MEMGUARD - if (is_memguard_addr(addr)) { - memguard_free(addr); - return; - } -#endif - -#ifdef DEBUG_REDZONE - redzone_check(addr); - addr = redzone_addr_ntor(addr); -#endif - slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); - if (slab == NULL) panic("free: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (!(slab->us_flags & UMA_SLAB_MALLOC)) { -#ifdef INVARIANTS - struct malloc_type **mtpp = addr; -#endif size = slab->us_keg->uk_size; #ifdef INVARIANTS - /* - * Cache a pointer to the malloc_type that most recently freed - * this memory here. This way we know who is most likely to - * have stepped on it later. - * - * This code assumes that size is a multiple of 8 bytes for - * 64 bit machines - */ - mtpp = (struct malloc_type **) - ((unsigned long)mtpp & ~UMA_ALIGN_PTR); - mtpp += (size - sizeof(struct malloc_type *)) / - sizeof(struct malloc_type *); - *mtpp = mtp; + free_save_type(addr, mtp, size); #endif uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab); } else { @@ -612,6 +716,40 @@ free(void *addr, struct malloc_type *mtp) malloc_type_freed(mtp, size); } +void +free_domain(void *addr, struct malloc_type *mtp) +{ + uma_slab_t slab; + u_long size; + +#ifdef MALLOC_DEBUG + if (free_dbg(&addr, mtp) != 0) + return; +#endif + + /* free(NULL, ...) does nothing */ + if (addr == NULL) + return; + + slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); + if (slab == NULL) + panic("free_domain: address %p(%p) has not been allocated.\n", + addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); + + if (!(slab->us_flags & UMA_SLAB_MALLOC)) { + size = slab->us_keg->uk_size; +#ifdef INVARIANTS + free_save_type(addr, mtp, size); +#endif + uma_zfree_domain(LIST_FIRST(&slab->us_keg->uk_zones), + addr, slab); + } else { + size = slab->us_size; + uma_large_free(slab); + } + malloc_type_freed(mtp, size); +} + /* * realloc: change the size of a memory block */ diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index a1ab9229bd1e..aa3fed74697a 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -283,7 +283,7 @@ static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); -static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); @@ -386,12 +386,13 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); * pages. 
*/ static void * -mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; - return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, + return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c index d035153c6dbd..c449e8820e7b 100644 --- a/sys/kern/subr_busdma_bufalloc.c +++ b/sys/kern/subr_busdma_bufalloc.c @@ -149,7 +149,7 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) } void * -busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, +busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, int domain, uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE @@ -157,7 +157,7 @@ busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, /* Inform UMA that this allocator uses kernel_arena/object. */ *pflag = UMA_SLAB_KERNEL; - return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0, + return ((void *)kmem_alloc_attr_domain(domain, size, wait, 0, BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); #else diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 8af283d95c95..ef400c93b8d7 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -500,7 +500,7 @@ bt_insfree(vmem_t *vm, bt_t *bt) * Import from the arena into the quantum cache in UMA. */ static int -qc_import(void *arg, void **store, int cnt, int flags) +qc_import(void *arg, void **store, int cnt, int domain, int flags) { qcache_t *qc; vmem_addr_t addr; @@ -614,13 +614,12 @@ static struct mtx_padalign __exclusive_cache_line vmem_bt_lock; * we are really out of KVA. */ static void * -vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { vmem_addr_t addr; - int domain; *pflag = UMA_SLAB_KERNEL; - domain = 0; /* XXX Temporary. */ /* * Single thread boundary tag allocation so that the address space diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index a9a915301abc..c63579db5a2c 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -133,7 +133,7 @@ static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); -static int buf_import(void *, void **, int, int); +static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); @@ -1419,7 +1419,7 @@ buf_free(struct buf *bp) * only as a per-cpu cache of bufs still maintained on a global list. 
*/ static int -buf_import(void *arg, void **store, int cnt, int flags) +buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; diff --git a/sys/mips/mips/uma_machdep.c b/sys/mips/mips/uma_machdep.c index bfc87a8efcc2..9ffbf523ecf7 100644 --- a/sys/mips/mips/uma_machdep.c +++ b/sys/mips/mips/uma_machdep.c @@ -44,7 +44,8 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_paddr_t pa; vm_page_t m; @@ -59,7 +60,8 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) #endif for (;;) { - m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); + m = vm_page_alloc_freelist_domain(domain, VM_FREELIST_DIRECT, + pflags); #ifndef __mips_n64 if (m == NULL && vm_page_reclaim_contig(pflags, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index c2640087b445..118a1525fd50 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -1504,8 +1504,8 @@ retry: static mmu_t installed_mmu; static void * -moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, - int wait) +moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; @@ -1522,7 +1522,7 @@ moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); diff --git a/sys/powerpc/aim/slb.c b/sys/powerpc/aim/slb.c index 5a725f4860d3..da1f624bdcee 100644 --- a/sys/powerpc/aim/slb.c +++ b/sys/powerpc/aim/slb.c @@ -480,7 +480,8 @@ slb_insert_user(pmap_t pm, struct slb *slb) } static void * -slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + u_int8_t *flags, int wait) { static vm_offset_t realmax = 0; void *va; @@ -490,7 +491,7 @@ slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) realmax = platform_real_maxaddr(); *flags = UMA_SLAB_PRIV; - m = vm_page_alloc_contig(NULL, 0, + m = vm_page_alloc_contig_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, 1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT); if (m == NULL) diff --git a/sys/powerpc/powerpc/uma_machdep.c b/sys/powerpc/powerpc/uma_machdep.c index e121c2bbf637..c9e63c199dbc 100644 --- a/sys/powerpc/powerpc/uma_machdep.c +++ b/sys/powerpc/powerpc/uma_machdep.c @@ -51,7 +51,8 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { void *va; vm_paddr_t pa; @@ -59,7 +60,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); diff --git a/sys/riscv/riscv/uma_machdep.c b/sys/riscv/riscv/uma_machdep.c index ba480713fc75..32cb41d28f49 100644 --- a/sys/riscv/riscv/uma_machdep.c +++ 
b/sys/riscv/riscv/uma_machdep.c @@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { panic("uma_small_alloc"); diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c index 70e11404b3d0..5ba0844e4f5b 100644 --- a/sys/sparc64/sparc64/vm_machdep.c +++ b/sys/sparc64/sparc64/vm_machdep.c @@ -392,7 +392,8 @@ swi_vm(void *v) } void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_paddr_t pa; vm_page_t m; @@ -402,7 +403,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index 47ca13f5ebd6..57a06e8ccfd4 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -175,9 +175,17 @@ void *contigmalloc(unsigned long size, struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) __malloc_like __result_use_check __alloc_size(1) __alloc_align(6); +void *contigmalloc_domain(unsigned long size, struct malloc_type *type, + int domain, int flags, vm_paddr_t low, vm_paddr_t high, + unsigned long alignment, vm_paddr_t boundary) + __malloc_like __result_use_check __alloc_size(1) __alloc_align(6); void free(void *addr, struct malloc_type *type); +void free_domain(void *addr, struct malloc_type *type); void *malloc(unsigned long size, struct malloc_type *type, int flags) __malloc_like __result_use_check __alloc_size(1); +void *malloc_domain(unsigned long size, struct malloc_type *type, + int domain, int flags) + __malloc_like __result_use_check __alloc_size(1); void *mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) __malloc_like __result_use_check __alloc_size(1) __alloc_size(2); diff --git a/sys/vm/uma.h b/sys/vm/uma.h index 1bd366416cbc..b15eaf727bff 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -128,7 +128,8 @@ typedef void (*uma_fini)(void *mem, int size); /* * Import new memory into a cache zone. */ -typedef int (*uma_import)(void *arg, void **store, int count, int flags); +typedef int (*uma_import)(void *arg, void **store, int count, int domain, + int flags); /* * Free memory from a cache zone. @@ -281,6 +282,10 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, * Allocates mp_maxid + 1 slabs sized to * sizeof(struct pcpu). */ +#define UMA_ZONE_NUMA 0x10000 /* + * NUMA aware Zone. Implements a best + * effort first-touch policy. + */ /* * These flags are shared between the keg and zone. In zones wishing to add @@ -325,6 +330,19 @@ void uma_zdestroy(uma_zone_t zone); void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); +/* + * Allocate an item from a specific NUMA domain. This uses a slow path in + * the allocator but is guaranteed to allocate memory from the requested + * domain if M_WAITOK is set. + * + * Arguments: + * zone The zone we are allocating from + * arg This data is passed to the ctor function + * domain The domain to allocate from. + * flags See sys/malloc.h for available flags. 
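+ *
+ * Illustrative sketch only (the zone pointer, the item array and the loop
+ * are assumptions, not part of this change), allocating one item from each
+ * domain:
+ *
+ *	for (i = 0; i < vm_ndomains; i++)
+ *		item[i] = uma_zalloc_domain(foo_zone, NULL, i, M_WAITOK);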
+ */ +void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags); + /* * Allocates an item out of a zone without supplying an argument * @@ -353,6 +371,16 @@ uma_zalloc(uma_zone_t zone, int flags) void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); +/* + * Frees an item back to the specified zone's domain specific pool. + * + * Arguments: + * zone The zone the item was originally allocated out of. + * item The memory to be freed. + * arg Argument passed to the destructor + */ +void uma_zfree_domain(uma_zone_t zone, void *item, void *arg); + /* * Frees an item back to a zone without supplying an argument * @@ -372,11 +400,6 @@ uma_zfree(uma_zone_t zone, void *item) */ void uma_zwait(uma_zone_t zone); -/* - * XXX The rest of the prototypes in this header are h0h0 magic for the VM. - * If you think you need to use it for a normal zone you're probably incorrect. - */ - /* * Backend page supplier routines * @@ -384,14 +407,15 @@ void uma_zwait(uma_zone_t zone); * zone The zone that is requesting pages. * size The number of bytes being requested. * pflag Flags for these memory pages, see below. + * domain The NUMA domain that we prefer for this allocation. * wait Indicates our willingness to block. * * Returns: * A pointer to the allocated memory or NULL on failure. */ -typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, - int wait); +typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, + uint8_t *pflag, int wait); /* * Backend page free routines @@ -406,8 +430,6 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, */ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); - - /* * Sets up the uma allocator. (Called by vm_mem_init) * diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 0dd16dd9c325..86b6030b1935 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -97,17 +98,12 @@ __FBSDID("$FreeBSD$"); #endif /* - * This is the zone and keg from which all zones are spawned. The idea is that - * even the zone & keg heads are allocated from the allocator, so we use the - * bss section to bootstrap us. + * This is the zone and keg from which all zones are spawned. */ -static struct uma_keg masterkeg; -static struct uma_zone masterzone_k; -static struct uma_zone masterzone_z; -static uma_zone_t kegs = &masterzone_k; -static uma_zone_t zones = &masterzone_z; +static uma_zone_t kegs; +static uma_zone_t zones; -/* This is the zone from which all of uma_slab_t's are allocated. */ +/* This is the zone from which all offpage uma_slab_ts are allocated. */ static uma_zone_t slabzone; /* @@ -226,13 +222,15 @@ struct uma_bucket_zone bucket_zones[] = { */ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }; +#define UMA_ANYDOMAIN -1 /* Special value for domain search. */ + /* Prototypes.. 
*/ -static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int); -static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int); -static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); -static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int); +static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); @@ -250,23 +248,23 @@ static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); -static void *zone_alloc_item(uma_zone_t, void *, int); +static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(void); -static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags); -static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags); -static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags); +static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); +static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int); +static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); -static int zone_import(uma_zone_t zone, void **bucket, int max, int flags); -static void zone_release(uma_zone_t zone, void **bucket, int cnt); -static void uma_zero_item(void *item, uma_zone_t zone); +static int zone_import(uma_zone_t, void **, int, int, int); +static void zone_release(uma_zone_t, void **, int); +static void uma_zero_item(void *, uma_zone_t); void uma_print_zone(uma_zone_t); void uma_print_stats(void); @@ -332,7 +330,7 @@ bucket_init(void) size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, - UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET); + UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA); } } @@ -570,7 +568,7 @@ hash_alloc(struct uma_hash *hash) } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, - M_WAITOK); + UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { @@ -736,6 +734,7 @@ cache_drain_safe_cpu(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t b1, b2; + int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; @@ -743,10 +742,14 @@ cache_drain_safe_cpu(uma_zone_t zone) b1 = b2 = NULL; ZONE_LOCK(zone); critical_enter(); + if (zone->uz_flags & UMA_ZONE_NUMA) + domain = PCPU_GET(domain); + else + domain = 0; cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket) { if (cache->uc_allocbucket->ub_cnt != 0) - LIST_INSERT_HEAD(&zone->uz_buckets, + 
LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets, cache->uc_allocbucket, ub_link); else b1 = cache->uc_allocbucket; @@ -754,7 +757,7 @@ cache_drain_safe_cpu(uma_zone_t zone) } if (cache->uc_freebucket) { if (cache->uc_freebucket->ub_cnt != 0) - LIST_INSERT_HEAD(&zone->uz_buckets, + LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets, cache->uc_freebucket, ub_link); else b2 = cache->uc_freebucket; @@ -809,18 +812,22 @@ cache_drain_safe(uma_zone_t zone) static void bucket_cache_drain(uma_zone_t zone) { + uma_zone_domain_t zdom; uma_bucket_t bucket; + int i; /* - * Drain the bucket queues and free the buckets, we just keep two per - * cpu (alloc/free). + * Drain the bucket queues and free the buckets. */ - while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { - LIST_REMOVE(bucket, ub_link); - ZONE_UNLOCK(zone); - bucket_drain(zone, bucket); - bucket_free(zone, bucket, NULL); - ZONE_LOCK(zone); + for (i = 0; i < vm_ndomains; i++) { + zdom = &zone->uz_domain[i]; + while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { + LIST_REMOVE(bucket, ub_link); + ZONE_UNLOCK(zone); + bucket_drain(zone, bucket); + bucket_free(zone, bucket, NULL); + ZONE_LOCK(zone); + } } /* @@ -865,7 +872,9 @@ static void keg_drain(uma_keg_t keg) { struct slabhead freeslabs = { 0 }; + uma_domain_t dom; uma_slab_t slab, tmp; + int i; /* * We don't want to take pages from statically allocated kegs at this @@ -880,20 +889,25 @@ keg_drain(uma_keg_t keg) if (keg->uk_free == 0) goto finished; - LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) { - /* We have nowhere to free these to. */ - if (slab->us_flags & UMA_SLAB_BOOT) - continue; + for (i = 0; i < vm_ndomains; i++) { + dom = &keg->uk_domain[i]; + LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { + /* We have nowhere to free these to. */ + if (slab->us_flags & UMA_SLAB_BOOT) + continue; - LIST_REMOVE(slab, us_link); - keg->uk_pages -= keg->uk_ppera; - keg->uk_free -= keg->uk_ipers; + LIST_REMOVE(slab, us_link); + keg->uk_pages -= keg->uk_ppera; + keg->uk_free -= keg->uk_ipers; - if (keg->uk_flags & UMA_ZONE_HASH) - UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); + if (keg->uk_flags & UMA_ZONE_HASH) + UMA_HASH_REMOVE(&keg->uk_hash, slab, + slab->us_data); - SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); + SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); + } } + finished: KEG_UNLOCK(keg); @@ -953,7 +967,7 @@ zone_drain(uma_zone_t zone) * caller specified M_NOWAIT. */ static uma_slab_t -keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) +keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) { uma_alloc allocf; uma_slab_t slab; @@ -962,6 +976,8 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) uint8_t flags; int i; + KASSERT(domain >= 0 && domain < vm_ndomains, + ("keg_alloc_slab: domain %d out of range", domain)); mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; mem = NULL; @@ -971,7 +987,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) size = keg->uk_ppera * PAGE_SIZE; if (keg->uk_flags & UMA_ZONE_OFFPAGE) { - slab = zone_alloc_item(keg->uk_slabzone, NULL, wait); + slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait); if (slab == NULL) goto out; } @@ -992,7 +1008,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) wait |= M_NODUMP; /* zone is passed for legacy reasons. 
*/ - mem = allocf(zone, size, &flags, wait); + mem = allocf(zone, size, domain, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); @@ -1013,6 +1029,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_flags = flags; + slab->us_domain = domain; BIT_FILL(SLAB_SETSIZE, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree); @@ -1052,7 +1069,8 @@ out: * the VM is ready. */ static void * -startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { uma_keg_t keg; void *mem; @@ -1085,7 +1103,7 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) #else keg->uk_allocf = page_alloc; #endif - return keg->uk_allocf(zone, bytes, pflag, wait); + return keg->uk_allocf(zone, bytes, domain, pflag, wait); } /* @@ -1100,12 +1118,13 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) * NULL if M_NOWAIT is set. */ static void * -page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; - p = (void *) kmem_malloc(kernel_arena, bytes, wait); + p = (void *) kmem_malloc_domain(domain, bytes, wait); return (p); } @@ -1122,7 +1141,8 @@ page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) * NULL if M_NOWAIT is set. */ static void * -noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; @@ -1135,7 +1155,7 @@ noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { - p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | + p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : VM_ALLOC_NOWAIT)); @@ -1407,6 +1427,7 @@ keg_ctor(void *mem, int size, void *udata, int flags) keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; + keg->uk_cursor = 0; keg->uk_free = 0; keg->uk_reserve = 0; keg->uk_pages = 0; @@ -1548,6 +1569,8 @@ zone_ctor(void *mem, int size, void *udata, int flags) zone->uz_count_min = 0; zone->uz_flags = 0; zone->uz_warning = NULL; + /* The domain structures follow the cpu structures. */ + zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; timevalclear(&zone->uz_ratecheck); keg = arg->keg; @@ -1753,22 +1776,43 @@ void uma_startup(void *mem, int npages) { struct uma_zctor_args args; + uma_keg_t masterkeg; + uintptr_t m; + int zsize; + int ksize; rw_init(&uma_rwlock, "UMA lock"); + ksize = sizeof(struct uma_keg) + + (sizeof(struct uma_domain) * vm_ndomains); + zsize = sizeof(struct uma_zone) + + (sizeof(struct uma_cache) * mp_ncpus) + + (sizeof(struct uma_zone_domain) * vm_ndomains); + + /* Use bootpages memory for the zone of zones and zone of kegs. 
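+ * The layout is the zone of zones, the zone of kegs and the master keg,
+ * each rounded up to a cache line; the remainder is page aligned and kept
+ * as boot pages for startup_alloc().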
*/ + m = (uintptr_t)mem; + zones = (uma_zone_t)m; + m += roundup(zsize, CACHE_LINE_SIZE); + kegs = (uma_zone_t)m; + m += roundup(zsize, CACHE_LINE_SIZE); + masterkeg = (uma_keg_t)m; + m += roundup(ksize, CACHE_LINE_SIZE); + m = roundup(m, PAGE_SIZE); + npages -= (m - (uintptr_t)mem) / PAGE_SIZE; + mem = (void *)m; + /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; - args.size = sizeof(struct uma_keg); + args.size = ksize; args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; - args.keg = &masterkeg; + args.keg = masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; - /* The initial zone has no Per cpu queues so it's smaller */ - zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); + zone_ctor(kegs, zsize, &args, M_WAITOK); mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); bootmem = mem; @@ -1776,7 +1820,8 @@ uma_startup(void *mem, int npages) args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + - (sizeof(struct uma_cache) * (mp_maxid + 1)); + (sizeof(struct uma_cache) * (mp_maxid + 1)) + + (sizeof(struct uma_zone_domain) * vm_ndomains); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; @@ -1784,8 +1829,7 @@ uma_startup(void *mem, int npages) args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; - /* The initial zone has no Per cpu queues so it's smaller */ - zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); + zone_ctor(zones, zsize, &args, M_WAITOK); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", @@ -1837,7 +1881,7 @@ uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; - return (zone_alloc_item(kegs, &args, M_WAITOK)); + return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ @@ -1894,7 +1938,7 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, sx_slock(&uma_drain_lock); locked = true; } - res = zone_alloc_item(zones, &args, M_WAITOK); + res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); @@ -1929,7 +1973,7 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, locked = true; } /* XXX Attaches only one keg of potentially many. */ - res = zone_alloc_item(zones, &args, M_WAITOK); + res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); @@ -1956,7 +2000,7 @@ uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, args.align = 0; args.flags = flags; - return (zone_alloc_item(zones, &args, M_WAITOK)); + return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } static void @@ -2061,11 +2105,11 @@ uma_zwait(uma_zone_t zone) void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { - void *item; - uma_cache_t cache; + uma_zone_domain_t zdom; uma_bucket_t bucket; - int lockfail; - int cpu; + uma_cache_t cache; + void *item; + int cpu, domain, lockfail; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); @@ -2162,6 +2206,11 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); + if (zone->uz_flags & UMA_ZONE_NUMA) + domain = PCPU_GET(domain); + else + domain = UMA_ANYDOMAIN; + /* Short-circuit for zones without buckets and low memory. 
*/ if (zone->uz_count == 0 || bucketdisable) goto zalloc_item; @@ -2202,7 +2251,11 @@ zalloc_start: /* * Check the zone's cache of buckets. */ - if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { + if (domain == UMA_ANYDOMAIN) + zdom = &zone->uz_domain[0]; + else + zdom = &zone->uz_domain[domain]; + if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); @@ -2227,7 +2280,7 @@ zalloc_start: * works we'll restart the allocation from the beginning and it * will use the just filled bucket. */ - bucket = zone_alloc_bucket(zone, udata, flags); + bucket = zone_alloc_bucket(zone, udata, domain, flags); CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket != NULL) { @@ -2240,10 +2293,12 @@ zalloc_start: * initialized bucket to make this less likely or claim * the memory directly. */ - if (cache->uc_allocbucket == NULL) - cache->uc_allocbucket = bucket; + if (cache->uc_allocbucket != NULL || + (zone->uz_flags & UMA_ZONE_NUMA && + domain != PCPU_GET(domain))) + LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else - LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); + cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } @@ -2252,38 +2307,103 @@ zalloc_start: * We may not be able to get a bucket so return an actual item. */ zalloc_item: - item = zone_alloc_item(zone, udata, flags); + item = zone_alloc_item(zone, udata, domain, flags); return (item); } -static uma_slab_t -keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) +void * +uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) { + + /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ + random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); + + /* This is the fast path allocation */ + CTR5(KTR_UMA, + "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d", + curthread, zone->uz_name, zone, domain, flags); + + if (flags & M_WAITOK) { + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "uma_zalloc_domain: zone \"%s\"", zone->uz_name); + } + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("uma_zalloc_domain: called with spinlock or critical section held")); + + return (zone_alloc_item(zone, udata, domain, flags)); +} + +/* + * Find a slab with some space. Prefer slabs that are partially used over those + * that are totally full. This helps to reduce fragmentation. + * + * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check + * only 'domain'. 
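+ *
+ * For example, with vm_ndomains == 4 and domain == 2, a round-robin search
+ * visits the domains in the order 2, 3, 0, 1.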
+ */ +static uma_slab_t +keg_first_slab(uma_keg_t keg, int domain, int rr) +{ + uma_domain_t dom; uma_slab_t slab; - int reserve; + int start; + + KASSERT(domain >= 0 && domain < vm_ndomains, + ("keg_first_slab: domain %d out of range", domain)); + + slab = NULL; + start = domain; + do { + dom = &keg->uk_domain[domain]; + if (!LIST_EMPTY(&dom->ud_part_slab)) + return (LIST_FIRST(&dom->ud_part_slab)); + if (!LIST_EMPTY(&dom->ud_free_slab)) { + slab = LIST_FIRST(&dom->ud_free_slab); + LIST_REMOVE(slab, us_link); + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); + return (slab); + } + if (rr) + domain = (domain + 1) % vm_ndomains; + } while (domain != start); + + return (NULL); +} + +static uma_slab_t +keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags) +{ + uma_domain_t dom; + uma_slab_t slab; + int allocflags, domain, reserve, rr, start; mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; reserve = 0; + allocflags = flags; if ((flags & M_USE_RESERVE) == 0) reserve = keg->uk_reserve; - for (;;) { - /* - * Find a slab with some space. Prefer slabs that are partially - * used over those that are totally full. This helps to reduce - * fragmentation. - */ - if (keg->uk_free > reserve) { - if (!LIST_EMPTY(&keg->uk_part_slab)) { - slab = LIST_FIRST(&keg->uk_part_slab); - } else { - slab = LIST_FIRST(&keg->uk_free_slab); - LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, - us_link); - } + /* + * Round-robin for non first-touch zones when there is more than one + * domain. + */ + if (vm_ndomains == 1) + rdomain = 0; + rr = rdomain == UMA_ANYDOMAIN; + if (rr) { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = start = keg->uk_cursor; + /* Only block on the second pass. */ + if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK) + allocflags = (allocflags & ~M_WAITOK) | M_NOWAIT; + } else + domain = start = rdomain; + +again: + do { + if (keg->uk_free > reserve && + (slab = keg_first_slab(keg, domain, rr)) != NULL) { MPASS(slab->us_keg == keg); return (slab); } @@ -2306,12 +2426,12 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) zone_maxaction(zone); } if (flags & M_NOWAIT) - break; + return (NULL); zone->uz_sleeps++; msleep(keg, &keg->uk_lock, PVM, "keglimit", 0); continue; } - slab = keg_alloc_slab(keg, zone, flags); + slab = keg_alloc_slab(keg, zone, domain, allocflags); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove @@ -2319,21 +2439,37 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) */ if (slab) { MPASS(slab->us_keg == keg); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } - /* - * We might not have been able to get a slab but another cpu - * could have while we were unlocked. Check again before we - * fail. - */ - flags |= M_NOVM; + if (rr) { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } + } while (domain != start); + + /* Retry domain scan with blocking. */ + if (allocflags != flags) { + allocflags = flags; + goto again; } - return (slab); + + /* + * We might not have been able to get a slab but another cpu + * could have while we were unlocked. Check again before we + * fail. 
+ */ + if (keg->uk_free > reserve && + (slab = keg_first_slab(keg, domain, rr)) != NULL) { + MPASS(slab->us_keg == keg); + return (slab); + } + return (NULL); } static uma_slab_t -zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) +zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags) { uma_slab_t slab; @@ -2343,7 +2479,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) } for (;;) { - slab = keg_fetch_slab(keg, zone, flags); + slab = keg_fetch_slab(keg, zone, domain, flags); if (slab) return (slab); if (flags & (M_NOWAIT | M_NOVM)) @@ -2360,7 +2496,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) * The last pointer is used to seed the search. It is not required. */ static uma_slab_t -zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) +zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags) { uma_klink_t klink; uma_slab_t slab; @@ -2380,7 +2516,7 @@ zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) * the search. */ if (last != NULL) { - slab = keg_fetch_slab(last, zone, flags); + slab = keg_fetch_slab(last, zone, domain, flags); if (slab) return (slab); KEG_UNLOCK(last); @@ -2401,7 +2537,7 @@ zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) keg = klink->kl_keg; KEG_LOCK(keg); if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) { - slab = keg_fetch_slab(keg, zone, flags); + slab = keg_fetch_slab(keg, zone, domain, flags); if (slab) return (slab); } @@ -2437,6 +2573,7 @@ zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { + uma_domain_t dom; void *item; uint8_t freei; @@ -2452,32 +2589,48 @@ slab_alloc_item(uma_keg_t keg, uma_slab_t slab) /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int -zone_import(uma_zone_t zone, void **bucket, int max, int flags) +zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags) { uma_slab_t slab; uma_keg_t keg; + int stripe; int i; slab = NULL; keg = NULL; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { - if ((slab = zone->uz_slab(zone, keg, flags)) == NULL) + if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL) break; keg = slab->us_keg; + stripe = howmany(max, vm_ndomains); while (slab->us_freecount && i < max) { bucket[i++] = slab_alloc_item(keg, slab); if (keg->uk_free <= keg->uk_reserve) break; +#if MAXMEMDOM > 1 + /* + * If the zone is striped we pick a new slab for every + * N allocations. Eliminating this conditional will + * instead pick a new domain for each bucket rather + * than stripe within each bucket. The current option + * produces more fragmentation and requires more cpu + * time but yields better distribution. + */ + if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 && + vm_ndomains > 1 && --stripe == 0) + break; +#endif } - /* Don't grab more than one slab at a time. */ + /* Don't block if we allocated any successfully. 
*/ flags &= ~M_WAITOK; flags |= M_NOWAIT; } @@ -2488,7 +2641,7 @@ zone_import(uma_zone_t zone, void **bucket, int max, int flags) } static uma_bucket_t -zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) +zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int max; @@ -2500,7 +2653,7 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) max = MIN(bucket->ub_entries, zone->uz_count); bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, - max, flags); + max, domain, flags); /* * Initialize the memory if necessary. @@ -2542,6 +2695,7 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. + * domain The domain to allocate from or UMA_ANYDOMAIN. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns @@ -2550,13 +2704,13 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) */ static void * -zone_alloc_item(uma_zone_t zone, void *udata, int flags) +zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; item = NULL; - if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1) + if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); @@ -2602,8 +2756,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_bucket_t bucket; - int lockfail; - int cpu; + uma_zone_domain_t zdom; + int cpu, domain, lockfail; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); @@ -2717,6 +2871,12 @@ zfree_start: /* We are no longer associated with this CPU. */ critical_exit(); + if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) + domain = PCPU_GET(domain); + else + domain = 0; + zdom = &zone->uz_domain[0]; + /* Can we throw this on the zone full list? */ if (bucket != NULL) { CTR3(KTR_UMA, @@ -2725,7 +2885,7 @@ zfree_start: /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); - LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); + LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); } /* @@ -2743,7 +2903,9 @@ zfree_start: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; - if (cache->uc_freebucket == NULL) { + if (cache->uc_freebucket == NULL && + ((zone->uz_flags & UMA_ZONE_NUMA) == 0 || + domain == PCPU_GET(domain))) { cache->uc_freebucket = bucket; goto zfree_start; } @@ -2765,21 +2927,43 @@ zfree_item: return; } +void +uma_zfree_domain(uma_zone_t zone, void *item, void *udata) +{ + + /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ + random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); + + CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread, + zone->uz_name); + + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("uma_zfree_domain: called with spinlock or critical section held")); + + /* uma_zfree(..., NULL) does nothing, to match free(9). */ + if (item == NULL) + return; + zone_free_item(zone, item, udata, SKIP_NONE); +} + static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item) { + uma_domain_t dom; uint8_t freei; mtx_assert(&keg->uk_lock, MA_OWNED); MPASS(keg == slab->us_keg); + dom = &keg->uk_domain[slab->us_domain]; + /* Do we need to remove from any lists? 
*/ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ @@ -3102,24 +3286,28 @@ uma_zone_reserve_kva(uma_zone_t zone, int count) void uma_prealloc(uma_zone_t zone, int items) { - int slabs; + uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; + int domain, slabs; keg = zone_first_keg(zone); if (keg == NULL) return; KEG_LOCK(keg); slabs = items / keg->uk_ipers; + domain = 0; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { - slab = keg_alloc_slab(keg, zone, M_WAITOK); + slab = keg_alloc_slab(keg, zone, domain, M_WAITOK); if (slab == NULL) break; MPASS(slab->us_keg == keg); - LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); slabs--; + domain = (domain + 1) % vm_ndomains; } KEG_UNLOCK(keg); } @@ -3204,34 +3392,47 @@ uma_zone_exhausted_nolock(uma_zone_t zone) } void * -uma_large_malloc(vm_size_t size, int wait) +uma_large_malloc_domain(vm_size_t size, int domain, int wait) { - void *mem; + vm_offset_t addr; uma_slab_t slab; - uint8_t flags; - slab = zone_alloc_item(slabzone, NULL, wait); + slab = zone_alloc_item(slabzone, NULL, domain, wait); if (slab == NULL) return (NULL); - mem = page_alloc(NULL, size, &flags, wait); - if (mem) { - vsetslab((vm_offset_t)mem, slab); - slab->us_data = mem; - slab->us_flags = flags | UMA_SLAB_MALLOC; + if (domain == UMA_ANYDOMAIN) + addr = kmem_malloc(kernel_arena, size, wait); + else + addr = kmem_malloc_domain(domain, size, wait); + if (addr != 0) { + vsetslab(addr, slab); + slab->us_data = (void *)addr; + slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC; slab->us_size = size; + slab->us_domain = vm_phys_domidx(PHYS_TO_VM_PAGE( + pmap_kextract(addr))); uma_total_inc(size); } else { zone_free_item(slabzone, slab, NULL, SKIP_NONE); } - return (mem); + return ((void *)addr); +} + +void * +uma_large_malloc(vm_size_t size, int wait) +{ + + return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait); } void uma_large_free(uma_slab_t slab) { - page_free(slab->us_data, slab->us_size, slab->us_flags); + KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0, + ("uma_large_free: Memory not allocated with uma_large_malloc.")); + kmem_free(kernel_arena, (vm_offset_t)slab->us_data, slab->us_size); uma_total_dec(slab->us_size); zone_free_item(slabzone, slab, NULL, SKIP_NONE); } @@ -3302,7 +3503,9 @@ cache_print(uma_cache_t cache) static void uma_print_keg(uma_keg_t keg) { + uma_domain_t dom; uma_slab_t slab; + int i; printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d " "out %d free %d limit %d\n", @@ -3310,15 +3513,18 @@ uma_print_keg(uma_keg_t keg) keg->uk_ipers, keg->uk_ppera, (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers); - printf("Part slabs:\n"); - LIST_FOREACH(slab, &keg->uk_part_slab, us_link) - slab_print(slab); - printf("Free slabs:\n"); - LIST_FOREACH(slab, &keg->uk_free_slab, us_link) - slab_print(slab); - printf("Full slabs:\n"); - LIST_FOREACH(slab, &keg->uk_full_slab, us_link) - slab_print(slab); + for (i = 0; i < vm_ndomains; i++) { + dom = &keg->uk_domain[i]; + printf("Part slabs:\n"); + LIST_FOREACH(slab, &dom->ud_part_slab, us_link) 
@@ -3302,7 +3503,9 @@ cache_print(uma_cache_t cache)
 static void
 uma_print_keg(uma_keg_t keg)
 {
+	uma_domain_t dom;
 	uma_slab_t slab;
+	int i;
 
 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
 	    "out %d free %d limit %d\n",
@@ -3310,15 +3513,18 @@ uma_print_keg(uma_keg_t keg)
 	    keg->uk_ipers, keg->uk_ppera,
 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
 	    keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
-	printf("Part slabs:\n");
-	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
-		slab_print(slab);
-	printf("Free slabs:\n");
-	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
-		slab_print(slab);
-	printf("Full slabs:\n");
-	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
-		slab_print(slab);
+	for (i = 0; i < vm_ndomains; i++) {
+		dom = &keg->uk_domain[i];
+		printf("Part slabs:\n");
+		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
+			slab_print(slab);
+		printf("Free slabs:\n");
+		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
+			slab_print(slab);
+		printf("Full slabs:\n");
+		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
+			slab_print(slab);
+	}
 }
 
 void
@@ -3408,6 +3614,7 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 	struct uma_type_header uth;
 	struct uma_percpu_stat ups;
 	uma_bucket_t bucket;
+	uma_zone_domain_t zdom;
 	struct sbuf sbuf;
 	uma_cache_t cache;
 	uma_klink_t kl;
@@ -3463,8 +3670,12 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 
-			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
-				uth.uth_zone_free += bucket->ub_cnt;
+			for (i = 0; i < vm_ndomains; i++) {
+				zdom = &z->uz_domain[i];
+				LIST_FOREACH(bucket, &zdom->uzd_buckets,
+				    ub_link)
+					uth.uth_zone_free += bucket->ub_cnt;
+			}
 			uth.uth_allocs = z->uz_allocs;
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
@@ -3630,11 +3841,12 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 #ifdef DDB
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
-	uint64_t allocs, frees, sleeps;
 	uma_bucket_t bucket;
 	uma_keg_t kz;
 	uma_zone_t z;
-	int cachefree;
+	uma_zone_domain_t zdom;
+	uint64_t allocs, frees, sleeps;
+	int cachefree, i;
 
 	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
 	    "Free", "Requests", "Sleeps", "Bucket");
@@ -3651,8 +3863,12 @@ DB_SHOW_COMMAND(uma, db_show_uma)
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
-			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
-				cachefree += bucket->ub_cnt;
+			for (i = 0; i < vm_ndomains; i++) {
+				zdom = &z->uz_domain[i];
+				LIST_FOREACH(bucket, &zdom->uzd_buckets,
+				    ub_link)
+					cachefree += bucket->ub_cnt;
+			}
 			db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
 			    z->uz_name, (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
@@ -3665,17 +3881,21 @@ DB_SHOW_COMMAND(uma, db_show_uma)
 
 DB_SHOW_COMMAND(umacache, db_show_umacache)
 {
-	uint64_t allocs, frees;
 	uma_bucket_t bucket;
 	uma_zone_t z;
-	int cachefree;
+	uma_zone_domain_t zdom;
+	uint64_t allocs, frees;
+	int cachefree, i;
 
 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 	    "Requests", "Bucket");
 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
-		LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
-			cachefree += bucket->ub_cnt;
+		for (i = 0; i < vm_ndomains; i++) {
+			zdom = &z->uz_domain[i];
+			LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link)
+				cachefree += bucket->ub_cnt;
+		}
 		db_printf("%18s %8ju %8jd %8d %12ju %8u\n", z->uz_name,
 		    (uintmax_t)z->uz_size, (intmax_t)(allocs - frees),
 		    cachefree,
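All of the statistics consumers above (the sysctl handler and both DDB commands) change in the same way: the single uz_buckets list becomes one bucket list per domain, so every walk now loops over vm_ndomains. The sketch below factors that summation into a helper using only fields introduced by this patch; the helper name is hypothetical and locking is assumed to be handled by the caller, as in the hunks above.

/*
 * Illustrative sketch only (not part of the patch).  Sums the items held
 * in a zone's full buckets across all memory domains, mirroring the loops
 * added to the sysctl and DDB code above.
 */
static long
example_zone_bucket_count(uma_zone_t zone)
{
	uma_zone_domain_t zdom;
	uma_bucket_t bucket;
	long count;
	int i;

	count = 0;
	for (i = 0; i < vm_ndomains; i++) {
		zdom = &zone->uz_domain[i];
		LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link)
			count += bucket->ub_cnt;
	}
	return (count);
}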
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index a8aca454f066..5836900863e4 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -39,7 +39,22 @@
  */
 
 /*
- * Here's a quick description of the relationship between the objects:
+ * The brief summary: Zones describe unique allocation types. Zones are
+ * organized into per-CPU caches which are filled by buckets. Buckets are
+ * organized according to memory domains. Buckets are filled from kegs which
+ * are also organized according to memory domains. Kegs describe a unique
+ * allocation type, backend memory provider, and layout. Kegs are associated
+ * with one or more zones and zones reference one or more kegs. Kegs provide
+ * slabs which are virtually contiguous collections of pages. Each slab is
+ * broken down into one or more items that will satisfy an individual allocation.
+ *
+ * Allocation is satisfied in the following order:
+ * 1) Per-CPU cache
+ * 2) Per-domain cache of buckets
+ * 3) Slab from any of N kegs
+ * 4) Backend page provider
+ *
+ * More detail on individual objects is contained below:
  *
  * Kegs contain lists of slabs which are stored in either the full bin, empty
  * bin, or partially allocated bin, to reduce fragmentation. They also contain
@@ -47,6 +62,13 @@
  * and rsize is the result of that. The Keg also stores information for
 * managing a hash of page addresses that maps pages to uma_slab_t structures
 * for pages that don't have embedded uma_slab_t's.
+ *
+ * Keg slab lists are organized by memory domain to support NUMA allocation
+ * policies. By default allocations are spread across domains to reduce the
+ * potential for hotspots. Special keg creation flags may be specified to
+ * prefer local allocation. However, there is no strict enforcement, as frees
+ * may happen on any CPU and freed items are returned to the CPU-local cache
+ * regardless of the originating domain.
  *
  * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
  * be allocated off the page from a special slab zone. The free list within a
@@ -181,6 +203,17 @@ struct uma_cache {
 
 typedef struct uma_cache * uma_cache_t;
 
+/*
+ * Per-domain memory list. Embedded in the kegs.
+ */
+struct uma_domain {
+	LIST_HEAD(,uma_slab)	ud_part_slab;	/* partially allocated slabs */
+	LIST_HEAD(,uma_slab)	ud_free_slab;	/* empty slab list */
+	LIST_HEAD(,uma_slab)	ud_full_slab;	/* full slabs */
+};
+
+typedef struct uma_domain * uma_domain_t;
+
 /*
  * Keg management structure
  *
@@ -192,10 +225,8 @@ struct uma_keg {
 	struct uma_hash	uk_hash;
 	LIST_HEAD(,uma_zone)	uk_zones;	/* Keg's zones */
 
-	LIST_HEAD(,uma_slab)	uk_part_slab;	/* partially allocated slabs */
-	LIST_HEAD(,uma_slab)	uk_free_slab;	/* empty slab list */
-	LIST_HEAD(,uma_slab)	uk_full_slab;	/* full slabs */
+	uint32_t	uk_cursor;	/* Domain alloc cursor. */
 	uint32_t	uk_align;	/* Alignment mask */
 	uint32_t	uk_pages;	/* Total page count */
 	uint32_t	uk_free;	/* Count of items free in slabs */
@@ -221,6 +252,9 @@ struct uma_keg {
 	/* Least used fields go to the last cache line. */
 	const char	*uk_name;		/* Name of creating zone. */
 	LIST_ENTRY(uma_keg)	uk_link;	/* List of all kegs */
+
+	/* Must be last, variable sized. */
+	struct uma_domain	uk_domain[];	/* Keg's slab lists. */
 };
 typedef struct uma_keg * uma_keg_t;
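Because uk_domain[] is a variable-length array at the end of struct uma_keg, the keg must be allocated with room for vm_ndomains entries, and the per-domain slab lists are then reached by index. The sketch below shows the implied sizing and lookup; the actual sizing is done in uma_core.c when the keg zone is created, and the helper names here are hypothetical.

/*
 * Illustrative sketch only (not part of the patch).  Shows how the
 * trailing uk_domain[] array above is sized and indexed.
 */
static size_t
example_keg_size(void)
{

	return (sizeof(struct uma_keg) +
	    vm_ndomains * sizeof(struct uma_domain));
}

static uma_domain_t
example_keg_domain(uma_keg_t keg, int domain)
{

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("example_keg_domain: bad domain %d", domain));
	return (&keg->uk_domain[domain]);
}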
@@ -248,14 +282,18 @@ struct uma_slab {
 #endif
 	uint16_t	us_freecount;	/* How many are free? */
 	uint8_t		us_flags;	/* Page flags see uma.h */
-	uint8_t		us_pad;		/* Pad to 32bits, unused. */
+	uint8_t		us_domain;	/* Backing NUMA domain. */
 };
 
 #define	us_link	us_type._us_link
 #define	us_size	us_type._us_size
 
+#if MAXMEMDOM >= 255
+#error "Slab domain type insufficient"
+#endif
+
 typedef struct uma_slab * uma_slab_t;
 
-typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
+typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int, int);
 
 struct uma_klink {
 	LIST_ENTRY(uma_klink) kl_link;
@@ -263,6 +301,12 @@ struct uma_klink {
 };
 typedef struct uma_klink *uma_klink_t;
 
+struct uma_zone_domain {
+	LIST_HEAD(,uma_bucket)	uzd_buckets;	/* full buckets */
+};
+
+typedef struct uma_zone_domain * uma_zone_domain_t;
+
 /*
  * Zone management structure
  *
@@ -275,7 +319,7 @@ struct uma_zone {
 	const char	*uz_name;	/* Text name of the zone */
 	LIST_ENTRY(uma_zone)	uz_link;	/* List of all zones in keg */
-	LIST_HEAD(,uma_bucket)	uz_buckets;	/* full buckets */
+	struct uma_zone_domain	*uz_domain;	/* per-domain buckets */
 
 	LIST_HEAD(,uma_klink)	uz_kegs;	/* List of kegs. */
 	struct uma_klink	uz_klink;	/* klink for first keg. */
@@ -309,7 +353,9 @@ struct uma_zone {
 	 * This HAS to be the last item because we adjust the zone size
 	 * based on NCPU and then allocate the space for the zones.
 	 */
-	struct uma_cache	uz_cpu[1];	/* Per cpu caches */
+	struct uma_cache	uz_cpu[];	/* Per cpu caches */
+
+	/* uz_domain follows here. */
 };
 
 /*
@@ -340,6 +386,7 @@ zone_first_keg(uma_zone_t zone)
 /* Internal prototypes */
 static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
 void *uma_large_malloc(vm_size_t size, int wait);
+void *uma_large_malloc_domain(vm_size_t size, int domain, int wait);
 void uma_large_free(uma_slab_t slab);
 
 /* Lock Macros */
@@ -422,8 +469,8 @@ vsetslab(vm_offset_t va, uma_slab_t slab)
  * if they can provide more efficient allocation functions.  This is useful
  * for using direct mapped addresses.
  */
-void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
-    int wait);
+void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+    uint8_t *pflag, int wait);
 void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
 
 /* Set a global soft limit on UMA managed memory. */
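Taken together, the header changes above give each keg per-domain slab lists, each zone per-domain bucket lists, and an explicit domain argument pushed down to the page-level back ends (uma_small_alloc() and the uma_slaballoc hook). The sketch below shows how a consumer might drive the domain-aware allocation path; uma_zalloc_domain() is the public variant this patch adds in uma.h, which is not shown in this hunk, so treat its exact name and argument order as an assumption.

/*
 * Illustrative sketch only (not part of the patch).  Tries to allocate
 * from a specific domain and falls back to the default policy.  Assumes
 * a uma_zalloc_domain(zone, udata, domain, flags) variant is added to
 * uma.h by this patch.
 */
static void *
example_alloc_local(uma_zone_t zone, int domain)
{
	void *item;

	item = uma_zalloc_domain(zone, NULL, domain, M_NOWAIT);
	if (item == NULL)
		item = uma_zalloc(zone, M_NOWAIT);
	return (item);
}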