diff --git a/lib/libmemstat/memstat_uma.c b/lib/libmemstat/memstat_uma.c index 4579a8152b04..423d4711c25e 100644 --- a/lib/libmemstat/memstat_uma.c +++ b/lib/libmemstat/memstat_uma.c @@ -55,6 +55,8 @@ static struct nlist namelist[] = { { .n_name = "_mp_maxid" }, #define X_ALL_CPUS 2 { .n_name = "_all_cpus" }, +#define X_VM_NDOMAINS 3 + { .n_name = "_vm_ndomains" }, { .n_name = "" }, }; @@ -297,11 +299,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle) { LIST_HEAD(, uma_keg) uma_kegs; struct memory_type *mtp; + struct uma_zone_domain uzd; struct uma_bucket *ubp, ub; struct uma_cache *ucp, *ucp_array; struct uma_zone *uzp, uz; struct uma_keg *kzp, kz; - int hint_dontsearch, i, mp_maxid, ret; + int hint_dontsearch, i, mp_maxid, ndomains, ret; char name[MEMTYPE_MAXNAME]; cpuset_t all_cpus; long cpusetsize; @@ -323,6 +326,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle) list->mtl_error = ret; return (-1); } + ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains, + sizeof(ndomains), 0); + if (ret != 0) { + list->mtl_error = ret; + return (-1); + } ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0); if (ret != 0) { list->mtl_error = ret; @@ -447,10 +456,17 @@ skip_percpu: kz.uk_ipers; mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size; mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees; - for (ubp = LIST_FIRST(&uz.uz_buckets); ubp != - NULL; ubp = LIST_NEXT(&ub, ub_link)) { - ret = kread(kvm, ubp, &ub, sizeof(ub), 0); - mtp->mt_zonefree += ub.ub_cnt; + for (i = 0; i < ndomains; i++) { + ret = kread(kvm, &uz.uz_domain[i], &uzd, + sizeof(uzd), 0); + for (ubp = + LIST_FIRST(&uzd.uzd_buckets); + ubp != NULL; + ubp = LIST_NEXT(&ub, ub_link)) { + ret = kread(kvm, ubp, &ub, + sizeof(ub), 0); + mtp->mt_zonefree += ub.ub_cnt; + } } if (!((kz.uk_flags & UMA_ZONE_SECONDARY) && LIST_FIRST(&kz.uk_zones) != uzp)) { diff --git a/sys/amd64/amd64/uma_machdep.c b/sys/amd64/amd64/uma_machdep.c index 41ba8250e5cc..cb960e4b674b 100644 --- a/sys/amd64/amd64/uma_machdep.c +++ b/sys/amd64/amd64/uma_machdep.c @@ -44,14 +44,15 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); diff --git a/sys/arm64/arm64/uma_machdep.c b/sys/arm64/arm64/uma_machdep.c index 620b8c46828a..597e7e80b062 100644 --- a/sys/arm64/arm64/uma_machdep.c +++ b/sys/arm64/arm64/uma_machdep.c @@ -42,14 +42,15 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 38cc84c19f00..957443863f88 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -338,8 +338,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || 
defined(PAE_TABLES) -static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, - int wait); +static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait); #endif static void pmap_set_pg(void); @@ -697,12 +697,13 @@ pmap_page_init(vm_page_t m) #if defined(PAE) || defined(PAE_TABLES) static void * -pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; - return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, + return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 010b1ee8799b..4a9b9bf27d20 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -96,6 +96,11 @@ __FBSDID("$FreeBSD$"); dtrace_malloc_probe_func_t dtrace_malloc_probe; #endif +#if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) || \ + defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE) +#define MALLOC_DEBUG 1 +#endif + /* * When realloc() is called, if the new size is sufficiently smaller than * the old size, realloc() will allocate a new, smaller block to avoid @@ -417,6 +422,20 @@ contigmalloc(unsigned long size, struct malloc_type *type, int flags, return (ret); } +void * +contigmalloc_domain(unsigned long size, struct malloc_type *type, + int domain, int flags, vm_paddr_t low, vm_paddr_t high, + unsigned long alignment, vm_paddr_t boundary) +{ + void *ret; + + ret = (void *)kmem_alloc_contig_domain(domain, size, flags, low, high, + alignment, boundary, VM_MEMATTR_DEFAULT); + if (ret != NULL) + malloc_type_allocated(type, round_page(size)); + return (ret); +} + /* * contigfree: * @@ -432,26 +451,14 @@ contigfree(void *addr, unsigned long size, struct malloc_type *type) malloc_type_freed(type, round_page(size)); } -/* - * malloc: - * - * Allocate a block of memory. - * - * If M_NOWAIT is set, this routine will not block and return NULL if - * the allocation fails. - */ -void * -malloc(unsigned long size, struct malloc_type *mtp, int flags) +#ifdef MALLOC_DEBUG +static int +malloc_dbg(caddr_t *vap, unsigned long *sizep, struct malloc_type *mtp, + int flags) { - int indx; - struct malloc_type_internal *mtip; - caddr_t va; - uma_zone_t zone; -#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE) - unsigned long osize = size; -#endif - #ifdef INVARIANTS + int indx; + KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic")); /* * Check that exactly one of M_WAITOK or M_NOWAIT is specified. @@ -474,7 +481,8 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) if ((malloc_nowait_count % malloc_failure_rate) == 0) { atomic_add_int(&malloc_failure_count, 1); t_malloc_fail = time_uptime; - return (NULL); + *vap = NULL; + return (EJUSTRETURN); } } #endif @@ -485,16 +493,44 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) ("malloc: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD - if (memguard_cmp_mtp(mtp, size)) { - va = memguard_alloc(size, flags); - if (va != NULL) - return (va); + if (memguard_cmp_mtp(mtp, *sizep)) { + *vap = memguard_alloc(*sizep, flags); + if (*vap != NULL) + return (EJUSTRETURN); /* This is unfortunate but should not be fatal. 
*/ } #endif #ifdef DEBUG_REDZONE - size = redzone_size_ntor(size); + *sizep = redzone_size_ntor(*sizep); +#endif + + return (0); +} +#endif + +/* + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + */ +void * +malloc(unsigned long size, struct malloc_type *mtp, int flags) +{ + int indx; + struct malloc_type_internal *mtip; + caddr_t va; + uma_zone_t zone; +#if defined(DEBUG_REDZONE) + unsigned long osize = size; +#endif + +#ifdef MALLOC_DEBUG + if (malloc_dbg(&va, &size, mtp, flags) != 0) + return (va); #endif if (size <= kmem_zmax) { @@ -523,11 +559,55 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); else if (va == NULL) t_malloc_fail = time_uptime; -#ifdef DIAGNOSTIC - if (va != NULL && !(flags & M_ZERO)) { - memset(va, 0x70, osize); - } +#ifdef DEBUG_REDZONE + if (va != NULL) + va = redzone_setup(va, osize); #endif + return ((void *) va); +} + +void * +malloc_domain(unsigned long size, struct malloc_type *mtp, int domain, + int flags) +{ + int indx; + struct malloc_type_internal *mtip; + caddr_t va; + uma_zone_t zone; +#if defined(DEBUG_REDZONE) + unsigned long osize = size; +#endif + +#ifdef MALLOC_DEBUG + if (malloc_dbg(&va, &size, mtp, flags) != 0) + return (va); +#endif + if (size <= kmem_zmax) { + mtip = mtp->ks_handle; + if (size & KMEM_ZMASK) + size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; + indx = kmemsize[size >> KMEM_ZSHIFT]; + KASSERT(mtip->mti_zone < numzones, + ("mti_zone %u out of range %d", + mtip->mti_zone, numzones)); + zone = kmemzones[indx].kz_zone[mtip->mti_zone]; +#ifdef MALLOC_PROFILE + krequests[size >> KMEM_ZSHIFT]++; +#endif + va = uma_zalloc_domain(zone, NULL, domain, flags); + if (va != NULL) + size = zone->uz_size; + malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); + } else { + size = roundup(size, PAGE_SIZE); + zone = NULL; + va = uma_large_malloc_domain(size, domain, flags); + malloc_type_allocated(mtp, va == NULL ? 0 : size); + } + if (flags & M_WAITOK) + KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); + else if (va == NULL) + t_malloc_fail = time_uptime; #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); @@ -545,6 +625,58 @@ mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) return (malloc(size * nmemb, type, flags)); } +#ifdef INVARIANTS +static void +free_save_type(void *addr, struct malloc_type *mtp, u_long size) +{ + struct malloc_type **mtpp = addr; + + /* + * Cache a pointer to the malloc_type that most recently freed + * this memory here. This way we know who is most likely to + * have stepped on it later. + * + * This code assumes that size is a multiple of 8 bytes for + * 64 bit machines + */ + mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR); + mtpp += (size - sizeof(struct malloc_type *)) / + sizeof(struct malloc_type *); + *mtpp = mtp; +} +#endif + +#ifdef MALLOC_DEBUG +static int +free_dbg(void **addrp, struct malloc_type *mtp) +{ + void *addr; + + addr = *addrp; + KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic")); + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("free: called with spinlock or critical section held")); + + /* free(NULL, ...) 
does nothing */ + if (addr == NULL) + return (EJUSTRETURN); + +#ifdef DEBUG_MEMGUARD + if (is_memguard_addr(addr)) { + memguard_free(addr); + return (EJUSTRETURN); + } +#endif + +#ifdef DEBUG_REDZONE + redzone_check(addr); + *addrp = redzone_addr_ntor(addr); +#endif + + return (0); +} +#endif + /* * free: * @@ -558,51 +690,23 @@ free(void *addr, struct malloc_type *mtp) uma_slab_t slab; u_long size; - KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic")); - KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), - ("free: called with spinlock or critical section held")); - +#ifdef MALLOC_DEBUG + if (free_dbg(&addr, mtp) != 0) + return; +#endif /* free(NULL, ...) does nothing */ if (addr == NULL) return; -#ifdef DEBUG_MEMGUARD - if (is_memguard_addr(addr)) { - memguard_free(addr); - return; - } -#endif - -#ifdef DEBUG_REDZONE - redzone_check(addr); - addr = redzone_addr_ntor(addr); -#endif - slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); - if (slab == NULL) panic("free: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (!(slab->us_flags & UMA_SLAB_MALLOC)) { -#ifdef INVARIANTS - struct malloc_type **mtpp = addr; -#endif size = slab->us_keg->uk_size; #ifdef INVARIANTS - /* - * Cache a pointer to the malloc_type that most recently freed - * this memory here. This way we know who is most likely to - * have stepped on it later. - * - * This code assumes that size is a multiple of 8 bytes for - * 64 bit machines - */ - mtpp = (struct malloc_type **) - ((unsigned long)mtpp & ~UMA_ALIGN_PTR); - mtpp += (size - sizeof(struct malloc_type *)) / - sizeof(struct malloc_type *); - *mtpp = mtp; + free_save_type(addr, mtp, size); #endif uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab); } else { @@ -612,6 +716,40 @@ free(void *addr, struct malloc_type *mtp) malloc_type_freed(mtp, size); } +void +free_domain(void *addr, struct malloc_type *mtp) +{ + uma_slab_t slab; + u_long size; + +#ifdef MALLOC_DEBUG + if (free_dbg(&addr, mtp) != 0) + return; +#endif + + /* free(NULL, ...) does nothing */ + if (addr == NULL) + return; + + slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); + if (slab == NULL) + panic("free_domain: address %p(%p) has not been allocated.\n", + addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); + + if (!(slab->us_flags & UMA_SLAB_MALLOC)) { + size = slab->us_keg->uk_size; +#ifdef INVARIANTS + free_save_type(addr, mtp, size); +#endif + uma_zfree_domain(LIST_FIRST(&slab->us_keg->uk_zones), + addr, slab); + } else { + size = slab->us_size; + uma_large_free(slab); + } + malloc_type_freed(mtp, size); +} + /* * realloc: change the size of a memory block */ diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index a1ab9229bd1e..aa3fed74697a 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -283,7 +283,7 @@ static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); -static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); @@ -386,12 +386,13 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); * pages. 
*/ static void * -mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; - return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, + return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c index d035153c6dbd..c449e8820e7b 100644 --- a/sys/kern/subr_busdma_bufalloc.c +++ b/sys/kern/subr_busdma_bufalloc.c @@ -149,7 +149,7 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) } void * -busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, +busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, int domain, uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE @@ -157,7 +157,7 @@ busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, /* Inform UMA that this allocator uses kernel_arena/object. */ *pflag = UMA_SLAB_KERNEL; - return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0, + return ((void *)kmem_alloc_attr_domain(domain, size, wait, 0, BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); #else diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c index 8af283d95c95..ef400c93b8d7 100644 --- a/sys/kern/subr_vmem.c +++ b/sys/kern/subr_vmem.c @@ -500,7 +500,7 @@ bt_insfree(vmem_t *vm, bt_t *bt) * Import from the arena into the quantum cache in UMA. */ static int -qc_import(void *arg, void **store, int cnt, int flags) +qc_import(void *arg, void **store, int cnt, int domain, int flags) { qcache_t *qc; vmem_addr_t addr; @@ -614,13 +614,12 @@ static struct mtx_padalign __exclusive_cache_line vmem_bt_lock; * we are really out of KVA. */ static void * -vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { vmem_addr_t addr; - int domain; *pflag = UMA_SLAB_KERNEL; - domain = 0; /* XXX Temporary. */ /* * Single thread boundary tag allocation so that the address space diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index a9a915301abc..c63579db5a2c 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -133,7 +133,7 @@ static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); -static int buf_import(void *, void **, int, int); +static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); @@ -1419,7 +1419,7 @@ buf_free(struct buf *bp) * only as a per-cpu cache of bufs still maintained on a global list. 
*/ static int -buf_import(void *arg, void **store, int cnt, int flags) +buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; diff --git a/sys/mips/mips/uma_machdep.c b/sys/mips/mips/uma_machdep.c index bfc87a8efcc2..9ffbf523ecf7 100644 --- a/sys/mips/mips/uma_machdep.c +++ b/sys/mips/mips/uma_machdep.c @@ -44,7 +44,8 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_paddr_t pa; vm_page_t m; @@ -59,7 +60,8 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) #endif for (;;) { - m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); + m = vm_page_alloc_freelist_domain(domain, VM_FREELIST_DIRECT, + pflags); #ifndef __mips_n64 if (m == NULL && vm_page_reclaim_contig(pflags, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index c2640087b445..118a1525fd50 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -1504,8 +1504,8 @@ retry: static mmu_t installed_mmu; static void * -moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, - int wait) +moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; @@ -1522,7 +1522,7 @@ moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); diff --git a/sys/powerpc/aim/slb.c b/sys/powerpc/aim/slb.c index 5a725f4860d3..da1f624bdcee 100644 --- a/sys/powerpc/aim/slb.c +++ b/sys/powerpc/aim/slb.c @@ -480,7 +480,8 @@ slb_insert_user(pmap_t pm, struct slb *slb) } static void * -slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + u_int8_t *flags, int wait) { static vm_offset_t realmax = 0; void *va; @@ -490,7 +491,7 @@ slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) realmax = platform_real_maxaddr(); *flags = UMA_SLAB_PRIV; - m = vm_page_alloc_contig(NULL, 0, + m = vm_page_alloc_contig_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, 1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT); if (m == NULL) diff --git a/sys/powerpc/powerpc/uma_machdep.c b/sys/powerpc/powerpc/uma_machdep.c index e121c2bbf637..c9e63c199dbc 100644 --- a/sys/powerpc/powerpc/uma_machdep.c +++ b/sys/powerpc/powerpc/uma_machdep.c @@ -51,7 +51,8 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { void *va; vm_paddr_t pa; @@ -59,7 +60,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); diff --git a/sys/riscv/riscv/uma_machdep.c b/sys/riscv/riscv/uma_machdep.c index ba480713fc75..32cb41d28f49 100644 --- a/sys/riscv/riscv/uma_machdep.c +++ 
b/sys/riscv/riscv/uma_machdep.c @@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$"); #include void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { panic("uma_small_alloc"); diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c index 70e11404b3d0..5ba0844e4f5b 100644 --- a/sys/sparc64/sparc64/vm_machdep.c +++ b/sys/sparc64/sparc64/vm_machdep.c @@ -392,7 +392,8 @@ swi_vm(void *v) } void * -uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) +uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, + int wait) { vm_paddr_t pa; vm_page_t m; @@ -402,7 +403,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) *flags = UMA_SLAB_PRIV; - m = vm_page_alloc(NULL, 0, + m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index 47ca13f5ebd6..57a06e8ccfd4 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -175,9 +175,17 @@ void *contigmalloc(unsigned long size, struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) __malloc_like __result_use_check __alloc_size(1) __alloc_align(6); +void *contigmalloc_domain(unsigned long size, struct malloc_type *type, + int domain, int flags, vm_paddr_t low, vm_paddr_t high, + unsigned long alignment, vm_paddr_t boundary) + __malloc_like __result_use_check __alloc_size(1) __alloc_align(6); void free(void *addr, struct malloc_type *type); +void free_domain(void *addr, struct malloc_type *type); void *malloc(unsigned long size, struct malloc_type *type, int flags) __malloc_like __result_use_check __alloc_size(1); +void *malloc_domain(unsigned long size, struct malloc_type *type, + int domain, int flags) + __malloc_like __result_use_check __alloc_size(1); void *mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) __malloc_like __result_use_check __alloc_size(1) __alloc_size(2); diff --git a/sys/vm/uma.h b/sys/vm/uma.h index 1bd366416cbc..b15eaf727bff 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -128,7 +128,8 @@ typedef void (*uma_fini)(void *mem, int size); /* * Import new memory into a cache zone. */ -typedef int (*uma_import)(void *arg, void **store, int count, int flags); +typedef int (*uma_import)(void *arg, void **store, int count, int domain, + int flags); /* * Free memory from a cache zone. @@ -281,6 +282,10 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, * Allocates mp_maxid + 1 slabs sized to * sizeof(struct pcpu). */ +#define UMA_ZONE_NUMA 0x10000 /* + * NUMA aware Zone. Implements a best + * effort first-touch policy. + */ /* * These flags are shared between the keg and zone. In zones wishing to add @@ -325,6 +330,19 @@ void uma_zdestroy(uma_zone_t zone); void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); +/* + * Allocate an item from a specific NUMA domain. This uses a slow path in + * the allocator but is guaranteed to allocate memory from the requested + * domain if M_WAITOK is set. + * + * Arguments: + * zone The zone we are allocating from + * arg This data is passed to the ctor function + * domain The domain to allocate from. + * flags See sys/malloc.h for available flags. 
+ */ +void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags); + /* * Allocates an item out of a zone without supplying an argument * @@ -353,6 +371,16 @@ uma_zalloc(uma_zone_t zone, int flags) void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); +/* + * Frees an item back to the specified zone's domain specific pool. + * + * Arguments: + * zone The zone the item was originally allocated out of. + * item The memory to be freed. + * arg Argument passed to the destructor + */ +void uma_zfree_domain(uma_zone_t zone, void *item, void *arg); + /* * Frees an item back to a zone without supplying an argument * @@ -372,11 +400,6 @@ uma_zfree(uma_zone_t zone, void *item) */ void uma_zwait(uma_zone_t zone); -/* - * XXX The rest of the prototypes in this header are h0h0 magic for the VM. - * If you think you need to use it for a normal zone you're probably incorrect. - */ - /* * Backend page supplier routines * @@ -384,14 +407,15 @@ void uma_zwait(uma_zone_t zone); * zone The zone that is requesting pages. * size The number of bytes being requested. * pflag Flags for these memory pages, see below. + * domain The NUMA domain that we prefer for this allocation. * wait Indicates our willingness to block. * * Returns: * A pointer to the allocated memory or NULL on failure. */ -typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, - int wait); +typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, + uint8_t *pflag, int wait); /* * Backend page free routines @@ -406,8 +430,6 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag, */ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); - - /* * Sets up the uma allocator. (Called by vm_mem_init) * diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 0dd16dd9c325..86b6030b1935 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -97,17 +98,12 @@ __FBSDID("$FreeBSD$"); #endif /* - * This is the zone and keg from which all zones are spawned. The idea is that - * even the zone & keg heads are allocated from the allocator, so we use the - * bss section to bootstrap us. + * This is the zone and keg from which all zones are spawned. */ -static struct uma_keg masterkeg; -static struct uma_zone masterzone_k; -static struct uma_zone masterzone_z; -static uma_zone_t kegs = &masterzone_k; -static uma_zone_t zones = &masterzone_z; +static uma_zone_t kegs; +static uma_zone_t zones; -/* This is the zone from which all of uma_slab_t's are allocated. */ +/* This is the zone from which all offpage uma_slab_ts are allocated. */ static uma_zone_t slabzone; /* @@ -226,13 +222,15 @@ struct uma_bucket_zone bucket_zones[] = { */ enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }; +#define UMA_ANYDOMAIN -1 /* Special value for domain search. */ + /* Prototypes.. 
*/ -static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int); -static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int); -static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int); +static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); -static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int); +static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); @@ -250,23 +248,23 @@ static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); -static void *zone_alloc_item(uma_zone_t, void *, int); +static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(void); -static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags); -static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags); -static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags); +static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); +static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int); +static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); -static int zone_import(uma_zone_t zone, void **bucket, int max, int flags); -static void zone_release(uma_zone_t zone, void **bucket, int cnt); -static void uma_zero_item(void *item, uma_zone_t zone); +static int zone_import(uma_zone_t, void **, int, int, int); +static void zone_release(uma_zone_t, void **, int); +static void uma_zero_item(void *, uma_zone_t); void uma_print_zone(uma_zone_t); void uma_print_stats(void); @@ -332,7 +330,7 @@ bucket_init(void) size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, - UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET); + UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA); } } @@ -570,7 +568,7 @@ hash_alloc(struct uma_hash *hash) } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, - M_WAITOK); + UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { @@ -736,6 +734,7 @@ cache_drain_safe_cpu(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t b1, b2; + int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; @@ -743,10 +742,14 @@ cache_drain_safe_cpu(uma_zone_t zone) b1 = b2 = NULL; ZONE_LOCK(zone); critical_enter(); + if (zone->uz_flags & UMA_ZONE_NUMA) + domain = PCPU_GET(domain); + else + domain = 0; cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket) { if (cache->uc_allocbucket->ub_cnt != 0) - LIST_INSERT_HEAD(&zone->uz_buckets, + 
LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets, cache->uc_allocbucket, ub_link); else b1 = cache->uc_allocbucket; @@ -754,7 +757,7 @@ cache_drain_safe_cpu(uma_zone_t zone) } if (cache->uc_freebucket) { if (cache->uc_freebucket->ub_cnt != 0) - LIST_INSERT_HEAD(&zone->uz_buckets, + LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets, cache->uc_freebucket, ub_link); else b2 = cache->uc_freebucket; @@ -809,18 +812,22 @@ cache_drain_safe(uma_zone_t zone) static void bucket_cache_drain(uma_zone_t zone) { + uma_zone_domain_t zdom; uma_bucket_t bucket; + int i; /* - * Drain the bucket queues and free the buckets, we just keep two per - * cpu (alloc/free). + * Drain the bucket queues and free the buckets. */ - while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { - LIST_REMOVE(bucket, ub_link); - ZONE_UNLOCK(zone); - bucket_drain(zone, bucket); - bucket_free(zone, bucket, NULL); - ZONE_LOCK(zone); + for (i = 0; i < vm_ndomains; i++) { + zdom = &zone->uz_domain[i]; + while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { + LIST_REMOVE(bucket, ub_link); + ZONE_UNLOCK(zone); + bucket_drain(zone, bucket); + bucket_free(zone, bucket, NULL); + ZONE_LOCK(zone); + } } /* @@ -865,7 +872,9 @@ static void keg_drain(uma_keg_t keg) { struct slabhead freeslabs = { 0 }; + uma_domain_t dom; uma_slab_t slab, tmp; + int i; /* * We don't want to take pages from statically allocated kegs at this @@ -880,20 +889,25 @@ keg_drain(uma_keg_t keg) if (keg->uk_free == 0) goto finished; - LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) { - /* We have nowhere to free these to. */ - if (slab->us_flags & UMA_SLAB_BOOT) - continue; + for (i = 0; i < vm_ndomains; i++) { + dom = &keg->uk_domain[i]; + LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { + /* We have nowhere to free these to. */ + if (slab->us_flags & UMA_SLAB_BOOT) + continue; - LIST_REMOVE(slab, us_link); - keg->uk_pages -= keg->uk_ppera; - keg->uk_free -= keg->uk_ipers; + LIST_REMOVE(slab, us_link); + keg->uk_pages -= keg->uk_ppera; + keg->uk_free -= keg->uk_ipers; - if (keg->uk_flags & UMA_ZONE_HASH) - UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); + if (keg->uk_flags & UMA_ZONE_HASH) + UMA_HASH_REMOVE(&keg->uk_hash, slab, + slab->us_data); - SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); + SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); + } } + finished: KEG_UNLOCK(keg); @@ -953,7 +967,7 @@ zone_drain(uma_zone_t zone) * caller specified M_NOWAIT. */ static uma_slab_t -keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) +keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) { uma_alloc allocf; uma_slab_t slab; @@ -962,6 +976,8 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) uint8_t flags; int i; + KASSERT(domain >= 0 && domain < vm_ndomains, + ("keg_alloc_slab: domain %d out of range", domain)); mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; mem = NULL; @@ -971,7 +987,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) size = keg->uk_ppera * PAGE_SIZE; if (keg->uk_flags & UMA_ZONE_OFFPAGE) { - slab = zone_alloc_item(keg->uk_slabzone, NULL, wait); + slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait); if (slab == NULL) goto out; } @@ -992,7 +1008,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) wait |= M_NODUMP; /* zone is passed for legacy reasons. 
*/ - mem = allocf(zone, size, &flags, wait); + mem = allocf(zone, size, domain, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); @@ -1013,6 +1029,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait) slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_flags = flags; + slab->us_domain = domain; BIT_FILL(SLAB_SETSIZE, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree); @@ -1052,7 +1069,8 @@ out: * the VM is ready. */ static void * -startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { uma_keg_t keg; void *mem; @@ -1085,7 +1103,7 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) #else keg->uk_allocf = page_alloc; #endif - return keg->uk_allocf(zone, bytes, pflag, wait); + return keg->uk_allocf(zone, bytes, domain, pflag, wait); } /* @@ -1100,12 +1118,13 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) * NULL if M_NOWAIT is set. */ static void * -page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) +page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; - p = (void *) kmem_malloc(kernel_arena, bytes, wait); + p = (void *) kmem_malloc_domain(domain, bytes, wait); return (p); } @@ -1122,7 +1141,8 @@ page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) * NULL if M_NOWAIT is set. */ static void * -noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) +noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, + int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; @@ -1135,7 +1155,7 @@ noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { - p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | + p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : VM_ALLOC_NOWAIT)); @@ -1407,6 +1427,7 @@ keg_ctor(void *mem, int size, void *udata, int flags) keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; + keg->uk_cursor = 0; keg->uk_free = 0; keg->uk_reserve = 0; keg->uk_pages = 0; @@ -1548,6 +1569,8 @@ zone_ctor(void *mem, int size, void *udata, int flags) zone->uz_count_min = 0; zone->uz_flags = 0; zone->uz_warning = NULL; + /* The domain structures follow the cpu structures. */ + zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; timevalclear(&zone->uz_ratecheck); keg = arg->keg; @@ -1753,22 +1776,43 @@ void uma_startup(void *mem, int npages) { struct uma_zctor_args args; + uma_keg_t masterkeg; + uintptr_t m; + int zsize; + int ksize; rw_init(&uma_rwlock, "UMA lock"); + ksize = sizeof(struct uma_keg) + + (sizeof(struct uma_domain) * vm_ndomains); + zsize = sizeof(struct uma_zone) + + (sizeof(struct uma_cache) * mp_ncpus) + + (sizeof(struct uma_zone_domain) * vm_ndomains); + + /* Use bootpages memory for the zone of zones and zone of kegs. 
*/ + m = (uintptr_t)mem; + zones = (uma_zone_t)m; + m += roundup(zsize, CACHE_LINE_SIZE); + kegs = (uma_zone_t)m; + m += roundup(zsize, CACHE_LINE_SIZE); + masterkeg = (uma_keg_t)m; + m += roundup(ksize, CACHE_LINE_SIZE); + m = roundup(m, PAGE_SIZE); + npages -= (m - (uintptr_t)mem) / PAGE_SIZE; + mem = (void *)m; + /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; - args.size = sizeof(struct uma_keg); + args.size = ksize; args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; - args.keg = &masterkeg; + args.keg = masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; - /* The initial zone has no Per cpu queues so it's smaller */ - zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); + zone_ctor(kegs, zsize, &args, M_WAITOK); mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); bootmem = mem; @@ -1776,7 +1820,8 @@ uma_startup(void *mem, int npages) args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + - (sizeof(struct uma_cache) * (mp_maxid + 1)); + (sizeof(struct uma_cache) * (mp_maxid + 1)) + + (sizeof(struct uma_zone_domain) * vm_ndomains); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; @@ -1784,8 +1829,7 @@ uma_startup(void *mem, int npages) args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; - /* The initial zone has no Per cpu queues so it's smaller */ - zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); + zone_ctor(zones, zsize, &args, M_WAITOK); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", @@ -1837,7 +1881,7 @@ uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; - return (zone_alloc_item(kegs, &args, M_WAITOK)); + return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ @@ -1894,7 +1938,7 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, sx_slock(&uma_drain_lock); locked = true; } - res = zone_alloc_item(zones, &args, M_WAITOK); + res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); @@ -1929,7 +1973,7 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, locked = true; } /* XXX Attaches only one keg of potentially many. */ - res = zone_alloc_item(zones, &args, M_WAITOK); + res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); if (locked) sx_sunlock(&uma_drain_lock); return (res); @@ -1956,7 +2000,7 @@ uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, args.align = 0; args.flags = flags; - return (zone_alloc_item(zones, &args, M_WAITOK)); + return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } static void @@ -2061,11 +2105,11 @@ uma_zwait(uma_zone_t zone) void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { - void *item; - uma_cache_t cache; + uma_zone_domain_t zdom; uma_bucket_t bucket; - int lockfail; - int cpu; + uma_cache_t cache; + void *item; + int cpu, domain, lockfail; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); @@ -2162,6 +2206,11 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); + if (zone->uz_flags & UMA_ZONE_NUMA) + domain = PCPU_GET(domain); + else + domain = UMA_ANYDOMAIN; + /* Short-circuit for zones without buckets and low memory. 
*/ if (zone->uz_count == 0 || bucketdisable) goto zalloc_item; @@ -2202,7 +2251,11 @@ zalloc_start: /* * Check the zone's cache of buckets. */ - if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) { + if (domain == UMA_ANYDOMAIN) + zdom = &zone->uz_domain[0]; + else + zdom = &zone->uz_domain[domain]; + if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); @@ -2227,7 +2280,7 @@ zalloc_start: * works we'll restart the allocation from the beginning and it * will use the just filled bucket. */ - bucket = zone_alloc_bucket(zone, udata, flags); + bucket = zone_alloc_bucket(zone, udata, domain, flags); CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket != NULL) { @@ -2240,10 +2293,12 @@ zalloc_start: * initialized bucket to make this less likely or claim * the memory directly. */ - if (cache->uc_allocbucket == NULL) - cache->uc_allocbucket = bucket; + if (cache->uc_allocbucket != NULL || + (zone->uz_flags & UMA_ZONE_NUMA && + domain != PCPU_GET(domain))) + LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else - LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); + cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } @@ -2252,38 +2307,103 @@ zalloc_start: * We may not be able to get a bucket so return an actual item. */ zalloc_item: - item = zone_alloc_item(zone, udata, flags); + item = zone_alloc_item(zone, udata, domain, flags); return (item); } -static uma_slab_t -keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) +void * +uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) { + + /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ + random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); + + /* This is the fast path allocation */ + CTR5(KTR_UMA, + "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d", + curthread, zone->uz_name, zone, domain, flags); + + if (flags & M_WAITOK) { + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "uma_zalloc_domain: zone \"%s\"", zone->uz_name); + } + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("uma_zalloc_domain: called with spinlock or critical section held")); + + return (zone_alloc_item(zone, udata, domain, flags)); +} + +/* + * Find a slab with some space. Prefer slabs that are partially used over those + * that are totally full. This helps to reduce fragmentation. + * + * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check + * only 'domain'. 
+ */ +static uma_slab_t +keg_first_slab(uma_keg_t keg, int domain, int rr) +{ + uma_domain_t dom; uma_slab_t slab; - int reserve; + int start; + + KASSERT(domain >= 0 && domain < vm_ndomains, + ("keg_first_slab: domain %d out of range", domain)); + + slab = NULL; + start = domain; + do { + dom = &keg->uk_domain[domain]; + if (!LIST_EMPTY(&dom->ud_part_slab)) + return (LIST_FIRST(&dom->ud_part_slab)); + if (!LIST_EMPTY(&dom->ud_free_slab)) { + slab = LIST_FIRST(&dom->ud_free_slab); + LIST_REMOVE(slab, us_link); + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); + return (slab); + } + if (rr) + domain = (domain + 1) % vm_ndomains; + } while (domain != start); + + return (NULL); +} + +static uma_slab_t +keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags) +{ + uma_domain_t dom; + uma_slab_t slab; + int allocflags, domain, reserve, rr, start; mtx_assert(&keg->uk_lock, MA_OWNED); slab = NULL; reserve = 0; + allocflags = flags; if ((flags & M_USE_RESERVE) == 0) reserve = keg->uk_reserve; - for (;;) { - /* - * Find a slab with some space. Prefer slabs that are partially - * used over those that are totally full. This helps to reduce - * fragmentation. - */ - if (keg->uk_free > reserve) { - if (!LIST_EMPTY(&keg->uk_part_slab)) { - slab = LIST_FIRST(&keg->uk_part_slab); - } else { - slab = LIST_FIRST(&keg->uk_free_slab); - LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, - us_link); - } + /* + * Round-robin for non first-touch zones when there is more than one + * domain. + */ + if (vm_ndomains == 1) + rdomain = 0; + rr = rdomain == UMA_ANYDOMAIN; + if (rr) { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = start = keg->uk_cursor; + /* Only block on the second pass. */ + if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK) + allocflags = (allocflags & ~M_WAITOK) | M_NOWAIT; + } else + domain = start = rdomain; + +again: + do { + if (keg->uk_free > reserve && + (slab = keg_first_slab(keg, domain, rr)) != NULL) { MPASS(slab->us_keg == keg); return (slab); } @@ -2306,12 +2426,12 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) zone_maxaction(zone); } if (flags & M_NOWAIT) - break; + return (NULL); zone->uz_sleeps++; msleep(keg, &keg->uk_lock, PVM, "keglimit", 0); continue; } - slab = keg_alloc_slab(keg, zone, flags); + slab = keg_alloc_slab(keg, zone, domain, allocflags); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove @@ -2319,21 +2439,37 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags) */ if (slab) { MPASS(slab->us_keg == keg); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } - /* - * We might not have been able to get a slab but another cpu - * could have while we were unlocked. Check again before we - * fail. - */ - flags |= M_NOVM; + if (rr) { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } + } while (domain != start); + + /* Retry domain scan with blocking. */ + if (allocflags != flags) { + allocflags = flags; + goto again; } - return (slab); + + /* + * We might not have been able to get a slab but another cpu + * could have while we were unlocked. Check again before we + * fail. 
+ */ + if (keg->uk_free > reserve && + (slab = keg_first_slab(keg, domain, rr)) != NULL) { + MPASS(slab->us_keg == keg); + return (slab); + } + return (NULL); } static uma_slab_t -zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) +zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags) { uma_slab_t slab; @@ -2343,7 +2479,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) } for (;;) { - slab = keg_fetch_slab(keg, zone, flags); + slab = keg_fetch_slab(keg, zone, domain, flags); if (slab) return (slab); if (flags & (M_NOWAIT | M_NOVM)) @@ -2360,7 +2496,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags) * The last pointer is used to seed the search. It is not required. */ static uma_slab_t -zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) +zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags) { uma_klink_t klink; uma_slab_t slab; @@ -2380,7 +2516,7 @@ zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) * the search. */ if (last != NULL) { - slab = keg_fetch_slab(last, zone, flags); + slab = keg_fetch_slab(last, zone, domain, flags); if (slab) return (slab); KEG_UNLOCK(last); @@ -2401,7 +2537,7 @@ zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) keg = klink->kl_keg; KEG_LOCK(keg); if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) { - slab = keg_fetch_slab(keg, zone, flags); + slab = keg_fetch_slab(keg, zone, domain, flags); if (slab) return (slab); } @@ -2437,6 +2573,7 @@ zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags) static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { + uma_domain_t dom; void *item; uint8_t freei; @@ -2452,32 +2589,48 @@ slab_alloc_item(uma_keg_t keg, uma_slab_t slab) /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int -zone_import(uma_zone_t zone, void **bucket, int max, int flags) +zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags) { uma_slab_t slab; uma_keg_t keg; + int stripe; int i; slab = NULL; keg = NULL; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { - if ((slab = zone->uz_slab(zone, keg, flags)) == NULL) + if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL) break; keg = slab->us_keg; + stripe = howmany(max, vm_ndomains); while (slab->us_freecount && i < max) { bucket[i++] = slab_alloc_item(keg, slab); if (keg->uk_free <= keg->uk_reserve) break; +#if MAXMEMDOM > 1 + /* + * If the zone is striped we pick a new slab for every + * N allocations. Eliminating this conditional will + * instead pick a new domain for each bucket rather + * than stripe within each bucket. The current option + * produces more fragmentation and requires more cpu + * time but yields better distribution. + */ + if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 && + vm_ndomains > 1 && --stripe == 0) + break; +#endif } - /* Don't grab more than one slab at a time. */ + /* Don't block if we allocated any successfully. 
*/ flags &= ~M_WAITOK; flags |= M_NOWAIT; } @@ -2488,7 +2641,7 @@ zone_import(uma_zone_t zone, void **bucket, int max, int flags) } static uma_bucket_t -zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) +zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int max; @@ -2500,7 +2653,7 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) max = MIN(bucket->ub_entries, zone->uz_count); bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, - max, flags); + max, domain, flags); /* * Initialize the memory if necessary. @@ -2542,6 +2695,7 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. + * domain The domain to allocate from or UMA_ANYDOMAIN. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns @@ -2550,13 +2704,13 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int flags) */ static void * -zone_alloc_item(uma_zone_t zone, void *udata, int flags) +zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; item = NULL; - if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1) + if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); @@ -2602,8 +2756,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_bucket_t bucket; - int lockfail; - int cpu; + uma_zone_domain_t zdom; + int cpu, domain, lockfail; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); @@ -2717,6 +2871,12 @@ zfree_start: /* We are no longer associated with this CPU. */ critical_exit(); + if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) + domain = PCPU_GET(domain); + else + domain = 0; + zdom = &zone->uz_domain[0]; + /* Can we throw this on the zone full list? */ if (bucket != NULL) { CTR3(KTR_UMA, @@ -2725,7 +2885,7 @@ zfree_start: /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); - LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link); + LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); } /* @@ -2743,7 +2903,9 @@ zfree_start: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; - if (cache->uc_freebucket == NULL) { + if (cache->uc_freebucket == NULL && + ((zone->uz_flags & UMA_ZONE_NUMA) == 0 || + domain == PCPU_GET(domain))) { cache->uc_freebucket = bucket; goto zfree_start; } @@ -2765,21 +2927,43 @@ zfree_item: return; } +void +uma_zfree_domain(uma_zone_t zone, void *item, void *udata) +{ + + /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ + random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA); + + CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread, + zone->uz_name); + + KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), + ("uma_zfree_domain: called with spinlock or critical section held")); + + /* uma_zfree(..., NULL) does nothing, to match free(9). */ + if (item == NULL) + return; + zone_free_item(zone, item, udata, SKIP_NONE); +} + static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item) { + uma_domain_t dom; uint8_t freei; mtx_assert(&keg->uk_lock, MA_OWNED); MPASS(keg == slab->us_keg); + dom = &keg->uk_domain[slab->us_domain]; + /* Do we need to remove from any lists? 
*/ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); + LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ @@ -3102,24 +3286,28 @@ uma_zone_reserve_kva(uma_zone_t zone, int count) void uma_prealloc(uma_zone_t zone, int items) { - int slabs; + uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; + int domain, slabs; keg = zone_first_keg(zone); if (keg == NULL) return; KEG_LOCK(keg); slabs = items / keg->uk_ipers; + domain = 0; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { - slab = keg_alloc_slab(keg, zone, M_WAITOK); + slab = keg_alloc_slab(keg, zone, domain, M_WAITOK); if (slab == NULL) break; MPASS(slab->us_keg == keg); - LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); slabs--; + domain = (domain + 1) % vm_ndomains; } KEG_UNLOCK(keg); } @@ -3204,34 +3392,47 @@ uma_zone_exhausted_nolock(uma_zone_t zone) } void * -uma_large_malloc(vm_size_t size, int wait) +uma_large_malloc_domain(vm_size_t size, int domain, int wait) { - void *mem; + vm_offset_t addr; uma_slab_t slab; - uint8_t flags; - slab = zone_alloc_item(slabzone, NULL, wait); + slab = zone_alloc_item(slabzone, NULL, domain, wait); if (slab == NULL) return (NULL); - mem = page_alloc(NULL, size, &flags, wait); - if (mem) { - vsetslab((vm_offset_t)mem, slab); - slab->us_data = mem; - slab->us_flags = flags | UMA_SLAB_MALLOC; + if (domain == UMA_ANYDOMAIN) + addr = kmem_malloc(kernel_arena, size, wait); + else + addr = kmem_malloc_domain(domain, size, wait); + if (addr != 0) { + vsetslab(addr, slab); + slab->us_data = (void *)addr; + slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC; slab->us_size = size; + slab->us_domain = vm_phys_domidx(PHYS_TO_VM_PAGE( + pmap_kextract(addr))); uma_total_inc(size); } else { zone_free_item(slabzone, slab, NULL, SKIP_NONE); } - return (mem); + return ((void *)addr); +} + +void * +uma_large_malloc(vm_size_t size, int wait) +{ + + return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait); } void uma_large_free(uma_slab_t slab) { - page_free(slab->us_data, slab->us_size, slab->us_flags); + KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0, + ("uma_large_free: Memory not allocated with uma_large_malloc.")); + kmem_free(kernel_arena, (vm_offset_t)slab->us_data, slab->us_size); uma_total_dec(slab->us_size); zone_free_item(slabzone, slab, NULL, SKIP_NONE); } @@ -3302,7 +3503,9 @@ cache_print(uma_cache_t cache) static void uma_print_keg(uma_keg_t keg) { + uma_domain_t dom; uma_slab_t slab; + int i; printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d " "out %d free %d limit %d\n", @@ -3310,15 +3513,18 @@ uma_print_keg(uma_keg_t keg) keg->uk_ipers, keg->uk_ppera, (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers); - printf("Part slabs:\n"); - LIST_FOREACH(slab, &keg->uk_part_slab, us_link) - slab_print(slab); - printf("Free slabs:\n"); - LIST_FOREACH(slab, &keg->uk_free_slab, us_link) - slab_print(slab); - printf("Full slabs:\n"); - LIST_FOREACH(slab, &keg->uk_full_slab, us_link) - slab_print(slab); + for (i = 0; i < vm_ndomains; i++) { + dom = &keg->uk_domain[i]; + printf("Part slabs:\n"); + LIST_FOREACH(slab, &dom->ud_part_slab, us_link) 
+ slab_print(slab); + printf("Free slabs:\n"); + LIST_FOREACH(slab, &dom->ud_free_slab, us_link) + slab_print(slab); + printf("Full slabs:\n"); + LIST_FOREACH(slab, &dom->ud_full_slab, us_link) + slab_print(slab); + } } void @@ -3408,6 +3614,7 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) struct uma_type_header uth; struct uma_percpu_stat ups; uma_bucket_t bucket; + uma_zone_domain_t zdom; struct sbuf sbuf; uma_cache_t cache; uma_klink_t kl; @@ -3463,8 +3670,12 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; - LIST_FOREACH(bucket, &z->uz_buckets, ub_link) - uth.uth_zone_free += bucket->ub_cnt; + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + LIST_FOREACH(bucket, &zdom->uzd_buckets, + ub_link) + uth.uth_zone_free += bucket->ub_cnt; + } uth.uth_allocs = z->uz_allocs; uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; @@ -3630,11 +3841,12 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { - uint64_t allocs, frees, sleeps; uma_bucket_t bucket; uma_keg_t kz; uma_zone_t z; - int cachefree; + uma_zone_domain_t zdom; + uint64_t allocs, frees, sleeps; + int cachefree, i; db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket"); @@ -3651,8 +3863,12 @@ DB_SHOW_COMMAND(uma, db_show_uma) if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; - LIST_FOREACH(bucket, &z->uz_buckets, ub_link) - cachefree += bucket->ub_cnt; + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + LIST_FOREACH(bucket, &zdom->uzd_buckets, + ub_link) + cachefree += bucket->ub_cnt; + } db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, @@ -3665,17 +3881,21 @@ DB_SHOW_COMMAND(uma, db_show_uma) DB_SHOW_COMMAND(umacache, db_show_umacache) { - uint64_t allocs, frees; uma_bucket_t bucket; uma_zone_t z; - int cachefree; + uma_zone_domain_t zdom; + uint64_t allocs, frees; + int cachefree, i; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL); - LIST_FOREACH(bucket, &z->uz_buckets, ub_link) - cachefree += bucket->ub_cnt; + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link) + cachefree += bucket->ub_cnt; + } db_printf("%18s %8ju %8jd %8d %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h index a8aca454f066..5836900863e4 100644 --- a/sys/vm/uma_int.h +++ b/sys/vm/uma_int.h @@ -39,7 +39,22 @@ */ /* - * Here's a quick description of the relationship between the objects: + * The brief summary: Zones describe unique allocation types. Zones are + * organized into per-CPU caches which are filled by buckets. Buckets are + * organized according to memory domains. Buckets are filled from kegs which + * are also organized according to memory domains. Kegs describe a unique + * allocation type, backend memory provider, and layout. Kegs are associated + * with one or more zones and zones reference one or more kegs. Kegs provide + * slabs which are virtually contiguous collections of pages. Each slab is + * broken down into one or more items that will satisfy an individual allocation. 
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index a8aca454f066..5836900863e4 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -39,7 +39,22 @@
  */
 
 /*
- * Here's a quick description of the relationship between the objects:
+ * The brief summary: Zones describe unique allocation types. Zones are
+ * organized into per-CPU caches which are filled by buckets. Buckets are
+ * organized according to memory domains. Buckets are filled from kegs which
+ * are also organized according to memory domains. Kegs describe a unique
+ * allocation type, backend memory provider, and layout. Kegs are associated
+ * with one or more zones and zones reference one or more kegs. Kegs provide
+ * slabs which are virtually contiguous collections of pages. Each slab is
+ * broken down into one or more items that will satisfy an individual allocation.
+ *
+ * Allocation is satisfied in the following order:
+ * 1) Per-CPU cache
+ * 2) Per-domain cache of buckets
+ * 3) Slab from any of N kegs
+ * 4) Backend page provider
+ *
+ * More detail on individual objects is contained below:
  *
  * Kegs contain lists of slabs which are stored in either the full bin, empty
  * bin, or partially allocated bin, to reduce fragmentation. They also contain
@@ -47,6 +62,13 @@
  * and rsize is the result of that. The Keg also stores information for
  * managing a hash of page addresses that maps pages to uma_slab_t structures
  * for pages that don't have embedded uma_slab_t's.
+ *
+ * Keg slab lists are organized by memory domain to support NUMA allocation
+ * policies. By default allocations are spread across domains to reduce the
+ * potential for hotspots. Special keg creation flags may be specified to
+ * prefer local allocation. However there is no strict enforcement as frees
+ * may happen on any CPU and these are returned to the CPU-local cache
+ * regardless of the originating domain.
  *
  * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
  * be allocated off the page from a special slab zone. The free list within a
@@ -181,6 +203,17 @@ struct uma_cache {
 
 typedef struct uma_cache * uma_cache_t;
 
+/*
+ * Per-domain memory list. Embedded in the kegs.
+ */
+struct uma_domain {
+        LIST_HEAD(,uma_slab)    ud_part_slab;   /* partially allocated slabs */
+        LIST_HEAD(,uma_slab)    ud_free_slab;   /* empty slab list */
+        LIST_HEAD(,uma_slab)    ud_full_slab;   /* full slabs */
+};
+
+typedef struct uma_domain * uma_domain_t;
+
 /*
  * Keg management structure
  *
@@ -192,10 +225,8 @@ struct uma_keg {
         struct uma_hash uk_hash;
         LIST_HEAD(,uma_zone)    uk_zones;       /* Keg's zones */
 
-        LIST_HEAD(,uma_slab)    uk_part_slab;   /* partially allocated slabs */
-        LIST_HEAD(,uma_slab)    uk_free_slab;   /* empty slab list */
-        LIST_HEAD(,uma_slab)    uk_full_slab;   /* full slabs */
+        uint32_t        uk_cursor;      /* Domain alloc cursor. */
         uint32_t        uk_align;       /* Alignment mask */
         uint32_t        uk_pages;       /* Total page count */
         uint32_t        uk_free;        /* Count of items free in slabs */
@@ -221,6 +252,9 @@ struct uma_keg {
         /* Least used fields go to the last cache line. */
         const char      *uk_name;               /* Name of creating zone. */
         LIST_ENTRY(uma_keg)     uk_link;        /* List of all kegs */
+
+        /* Must be last, variable sized. */
+        struct uma_domain       uk_domain[];    /* Keg's slab lists. */
 };
 
 typedef struct uma_keg * uma_keg_t;
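
The keg side of the change hinges on uk_domain[] being a flexible array member at the end of struct uma_keg, so a keg and its per-domain slab lists live in one allocation sized by the number of domains. The sketch below shows that sizing and indexing idiom in isolation; struct keg and struct dom are simplified stand-ins, and the place where UMA actually performs this sizing (keg creation) is outside the hunks shown here.

/*
 * Sketch of the "trailing variable-sized array" layout that uk_domain[]
 * relies on. struct keg/struct dom are simplified stand-ins; in UMA the
 * real sizing happens where kegs are created, which is outside these hunks.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dom {
        int d_free;                     /* e.g. free items in this domain */
};

struct keg {
        int k_ipers;
        struct dom k_domain[];          /* Must be last, variable sized. */
};

int
main(void)
{
        struct keg *kg;
        int i, ndomains = 4;

        /* One allocation covers the fixed part plus ndomains trailing slots. */
        kg = malloc(sizeof(*kg) + ndomains * sizeof(struct dom));
        if (kg == NULL)
                return (1);
        memset(kg, 0, sizeof(*kg) + ndomains * sizeof(struct dom));
        for (i = 0; i < ndomains; i++)
                kg->k_domain[i].d_free = i * 10;
        for (i = 0; i < ndomains; i++)
                printf("domain %d: %d free\n", i, kg->k_domain[i].d_free);
        free(kg);
        return (0);
}
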
@@ -248,14 +282,18 @@ struct uma_slab {
 #endif
         uint16_t        us_freecount;   /* How many are free? */
         uint8_t         us_flags;       /* Page flags see uma.h */
-        uint8_t         us_pad;         /* Pad to 32bits, unused. */
+        uint8_t         us_domain;      /* Backing NUMA domain. */
 };
 
 #define us_link us_type._us_link
 #define us_size us_type._us_size
 
+#if MAXMEMDOM >= 255
+#error "Slab domain type insufficient"
+#endif
+
 typedef struct uma_slab * uma_slab_t;
 
-typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
+typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int, int);
 
 struct uma_klink {
         LIST_ENTRY(uma_klink) kl_link;
@@ -263,6 +301,12 @@ struct uma_klink {
 };
 typedef struct uma_klink *uma_klink_t;
 
+struct uma_zone_domain {
+        LIST_HEAD(,uma_bucket)  uzd_buckets;    /* full buckets */
+};
+
+typedef struct uma_zone_domain * uma_zone_domain_t;
+
 /*
  * Zone management structure
  *
@@ -275,7 +319,7 @@ struct uma_zone {
         const char      *uz_name;       /* Text name of the zone */
         LIST_ENTRY(uma_zone)    uz_link;        /* List of all zones in keg */
-        LIST_HEAD(,uma_bucket)  uz_buckets;     /* full buckets */
+        struct uma_zone_domain  *uz_domain;     /* per-domain buckets */
 
         LIST_HEAD(,uma_klink)   uz_kegs;        /* List of kegs. */
         struct uma_klink        uz_klink;       /* klink for first keg. */
@@ -309,7 +353,9 @@ struct uma_zone {
          * This HAS to be the last item because we adjust the zone size
          * based on NCPU and then allocate the space for the zones.
          */
-        struct uma_cache        uz_cpu[1];      /* Per cpu caches */
+        struct uma_cache        uz_cpu[];       /* Per cpu caches */
+
+        /* uz_domain follows here. */
 };
 
 /*
@@ -340,6 +386,7 @@ zone_first_keg(uma_zone_t zone)
 /* Internal prototypes */
 static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
 void *uma_large_malloc(vm_size_t size, int wait);
+void *uma_large_malloc_domain(vm_size_t size, int domain, int wait);
 void uma_large_free(uma_slab_t slab);
 
 /* Lock Macros */
@@ -422,8 +469,8 @@ vsetslab(vm_offset_t va, uma_slab_t slab)
  * if they can provide more efficient allocation functions. This is useful
  * for using direct mapped addresses.
  */
-void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
-    int wait);
+void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+    uint8_t *pflag, int wait);
 void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
 
 /* Set a global soft limit on UMA managed memory. */
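
The zone side uses the same trick twice: uz_cpu[] is now a flexible array of per-CPU caches, and the comment "uz_domain follows here." records that the per-domain bucket heads are laid out immediately after those caches in the same allocation, with uz_domain pointing at them. The userland sketch below models that layout; the types and the zone_create() helper are illustrative stand-ins, and the sizing arithmetic (header + ncpu caches + ndomains domain structures) is an assumption about how the zone allocation is laid out, not code from this patch.

/*
 * Userland model of the zone layout implied by "uz_domain follows here.":
 * one allocation holds the fixed header, then ncpu per-CPU caches, then
 * ndomains per-domain bucket heads, with z_domain pointing past the caches.
 * Types and the zone_create() helper are stand-ins, not the UMA code.
 */
#include <stdio.h>
#include <stdlib.h>

struct cache { int c_count; };
struct zdom { int zd_buckets; };

struct zone {
        struct zdom *z_domain;          /* points into the same allocation */
        struct cache z_cpu[];           /* per-CPU caches, variable sized */
};

static struct zone *
zone_create(int ncpu, int ndomains)
{
        struct zone *z;
        size_t size;

        size = sizeof(*z) + ncpu * sizeof(struct cache) +
            ndomains * sizeof(struct zdom);
        z = calloc(1, size);
        if (z == NULL)
                return (NULL);
        /* The per-domain array starts right after the last CPU cache. */
        z->z_domain = (struct zdom *)&z->z_cpu[ncpu];
        return (z);
}

int
main(void)
{
        struct zone *z = zone_create(8, 2);

        if (z == NULL)
                return (1);
        z->z_domain[1].zd_buckets = 3;
        printf("domain 1 buckets: %d\n", z->z_domain[1].zd_buckets);
        free(z);
        return (0);
}

Keeping the caches and the domain array in one allocation preserves the existing "zone size is computed from NCPU" scheme while only appending the per-domain data at the end.
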