From ab3059a8e7f382cff93dbe4d1b082adf62f4d849 Mon Sep 17 00:00:00 2001
From: Matt Macy
Date: Fri, 6 Jul 2018 02:06:03 +0000
Subject: [PATCH] Back pcpu zone with domain correct pages

- Change pcpu zone consumers to use a stride size of PAGE_SIZE.
  (defined as UMA_PCPU_ALLOC_SIZE to make future identification easier)

- Allocate pages from the correct domain for a given CPU.

- Don't initialize pc_domain to a non-zero value if NUMA is not defined.
  There are some misconceptions surrounding this field. It is the _VM_
  NUMA domain and should only ever correspond to valid domain values as
  understood by the VM.

The former slab size of sizeof(struct pcpu) was somewhat arbitrary.
The new value is PAGE_SIZE because that is the smallest granularity at
which the VM can allocate a slab for a given domain. If you have fewer
than PAGE_SIZE/8 counters on your system there will be some memory
wasted, but this is obviously something where you want the cache line
to come from the correct domain.

Reviewed by: jeff
Sponsored by: Limelight Networks
Differential Revision: https://reviews.freebsd.org/D15933
---
 sys/amd64/include/counter.h   |  4 +-
 sys/arm/include/counter.h     |  4 +-
 sys/arm64/include/counter.h   |  4 +-
 sys/i386/include/counter.h    |  8 +--
 sys/kern/subr_counter.c       | 13 ++++-
 sys/mips/include/counter.h    |  4 +-
 sys/powerpc/include/counter.h |  8 +--
 sys/riscv/include/counter.h   |  4 +-
 sys/sparc64/include/counter.h |  4 +-
 sys/sys/pcpu.h                |  6 ++-
 sys/vm/uma.h                  |  3 +-
 sys/vm/uma_core.c             | 97 +++++++++++++++++++++++++++++++++--
 sys/x86/acpica/srat.c         |  8 ++-
 13 files changed, 135 insertions(+), 32 deletions(-)

diff --git a/sys/amd64/include/counter.h b/sys/amd64/include/counter.h
index 3e628f3f7a57..2255a0575a0c 100644
--- a/sys/amd64/include/counter.h
+++ b/sys/amd64/include/counter.h
@@ -45,7 +45,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -65,7 +65,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
diff --git a/sys/arm/include/counter.h b/sys/arm/include/counter.h
index 985b30f7a86f..214247d7a40f 100644
--- a/sys/arm/include/counter.h
+++ b/sys/arm/include/counter.h
@@ -47,7 +47,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (atomic_load_64((uint64_t *)((char *)p + sizeof(struct pcpu) *
+	return (atomic_load_64((uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE *
 	    cpu)));
 }
 
@@ -68,7 +68,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	atomic_store_64((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	atomic_store_64((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid)), 0);
 }
 
diff --git a/sys/arm64/include/counter.h b/sys/arm64/include/counter.h
index 18b6606566d0..eb26fa952273 100644
--- a/sys/arm64/include/counter.h
+++ b/sys/arm64/include/counter.h
@@ -44,7 +44,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -64,7 +64,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
diff --git a/sys/i386/include/counter.h b/sys/i386/include/counter.h
index 85ee2f4a0206..7fd26d2a9600 100644
--- a/sys/i386/include/counter.h
+++ b/sys/i386/include/counter.h
@@ -104,13 +104,13 @@ counter_u64_fetch_inline(uint64_t *p)
 		critical_enter();
 		CPU_FOREACH(i) {
 			res += *(uint64_t *)((char *)p +
-			    sizeof(struct pcpu) * i);
+			    UMA_PCPU_ALLOC_SIZE * i);
 		}
 		critical_exit();
 	} else {
 		CPU_FOREACH(i)
 			res += counter_u64_read_one_8b((uint64_t *)((char *)p +
-			    sizeof(struct pcpu) * i));
+			    UMA_PCPU_ALLOC_SIZE * i));
 	}
 	return (res);
 }
@@ -137,7 +137,7 @@ counter_u64_zero_one_cpu(void *arg)
 {
 	uint64_t *p;
 
-	p = (uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid));
+	p = (uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE * PCPU_GET(cpuid));
 	counter_u64_zero_one_8b(p);
 }
 
@@ -149,7 +149,7 @@ counter_u64_zero_inline(counter_u64_t c)
 	if ((cpu_feature & CPUID_CX8) == 0) {
 		critical_enter();
 		CPU_FOREACH(i)
-			*(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0;
+			*(uint64_t *)((char *)c + UMA_PCPU_ALLOC_SIZE * i) = 0;
 		critical_exit();
 	} else {
 		smp_rendezvous(smp_no_rendezvous_barrier,
diff --git a/sys/kern/subr_counter.c b/sys/kern/subr_counter.c
index e93587c95e6b..30857a9989f4 100644
--- a/sys/kern/subr_counter.c
+++ b/sys/kern/subr_counter.c
@@ -42,7 +42,16 @@ __FBSDID("$FreeBSD$");
 
 #define IN_SUBR_COUNTER_C
 #include <sys/counter.h>
-
+
+static void
+counter_u64_zero_sync(counter_u64_t c)
+{
+	int cpu;
+
+	CPU_FOREACH(cpu)
+		*(uint64_t*)zpcpu_get_cpu(c, cpu) = 0;
+}
+
 void
 counter_u64_zero(counter_u64_t c)
 {
@@ -64,7 +73,7 @@ counter_u64_alloc(int flags)
 
 	r = uma_zalloc_pcpu(pcpu_zone_64, flags);
 	if (r != NULL)
-		counter_u64_zero(r);
+		counter_u64_zero_sync(r);
 	return (r);
 }
 
diff --git a/sys/mips/include/counter.h b/sys/mips/include/counter.h
index cc7363116d6e..0694a9c28cdb 100644
--- a/sys/mips/include/counter.h
+++ b/sys/mips/include/counter.h
@@ -47,7 +47,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -68,7 +68,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
diff --git a/sys/powerpc/include/counter.h b/sys/powerpc/include/counter.h
index a815a94c97c6..d32a396e4238 100644
--- a/sys/powerpc/include/counter.h
+++ b/sys/powerpc/include/counter.h
@@ -50,7 +50,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -70,7 +70,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
@@ -113,7 +113,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -134,7 +134,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
diff --git a/sys/riscv/include/counter.h b/sys/riscv/include/counter.h
index b25331cc23ca..7748d937e0f5 100644
--- a/sys/riscv/include/counter.h
+++ b/sys/riscv/include/counter.h
@@ -46,7 +46,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -67,7 +67,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
diff --git a/sys/sparc64/include/counter.h b/sys/sparc64/include/counter.h
index b36cc9896cbb..128d5c0fce7e 100644
--- a/sys/sparc64/include/counter.h
+++ b/sys/sparc64/include/counter.h
@@ -47,7 +47,7 @@ static inline uint64_t
 counter_u64_read_one(uint64_t *p, int cpu)
 {
 
-	return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+	return (*(uint64_t *)((char *)p + UMA_PCPU_ALLOC_SIZE * cpu));
 }
 
 static inline uint64_t
@@ -68,7 +68,7 @@ static void
 counter_u64_zero_one_cpu(void *arg)
 {
 
-	*((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+	*((uint64_t *)((char *)arg + UMA_PCPU_ALLOC_SIZE *
 	    PCPU_GET(cpuid))) = 0;
 }
 
diff --git a/sys/sys/pcpu.h b/sys/sys/pcpu.h
index 127c10d6e110..9ba2adfdbff8 100644
--- a/sys/sys/pcpu.h
+++ b/sys/sys/pcpu.h
@@ -207,19 +207,21 @@ extern struct pcpu *cpuid_to_pcpu[];
 #endif
 #define	curvidata	PCPU_GET(vidata)
 
+#define	UMA_PCPU_ALLOC_SIZE	PAGE_SIZE
+
 /* Accessor to elements allocated via UMA_ZONE_PCPU zone. */
 static inline void *
 zpcpu_get(void *base)
 {
 
-	return ((char *)(base) + sizeof(struct pcpu) * curcpu);
+	return ((char *)(base) + UMA_PCPU_ALLOC_SIZE * curcpu);
 }
 
 static inline void *
 zpcpu_get_cpu(void *base, int cpu)
 {
 
-	return ((char *)(base) + sizeof(struct pcpu) * cpu);
+	return ((char *)(base) + UMA_PCPU_ALLOC_SIZE * cpu);
 }
 
 /*
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index ed33ae030728..87ace146bad2 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -279,8 +279,7 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
 					 * mini-dumps.
 					 */
#define	UMA_ZONE_PCPU		0x8000	/*
-					 * Allocates mp_maxid + 1 slabs sized to
-					 * sizeof(struct pcpu).
+					 * Allocates mp_maxid + 1 slabs of PAGE_SIZE
 					 */
#define	UMA_ZONE_NUMA		0x10000	/*
 					 * NUMA aware Zone.  Implements a best
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index ff3436437f6c..73fab7c184dd 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -229,8 +229,10 @@ void uma_startup2(void);
 
 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 static void page_free(void *, vm_size_t, uint8_t);
+static void pcpu_page_free(void *, vm_size_t, uint8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -1172,6 +1174,58 @@ page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
 	return (p);
 }
 
+static void *
+pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+    int wait)
+{
+	struct pglist alloctail;
+	vm_offset_t addr, zkva;
+	int cpu, flags;
+	vm_page_t p, p_next;
+#ifdef NUMA
+	struct pcpu *pc;
+#endif
+
+	TAILQ_INIT(&alloctail);
+	MPASS(bytes == (mp_maxid+1)*PAGE_SIZE);
+	*pflag = UMA_SLAB_KERNEL;
+
+	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+	    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+	    VM_ALLOC_NOWAIT);
+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
+		if (CPU_ABSENT(cpu)) {
+			p = vm_page_alloc(NULL, 0, flags);
+		} else {
+#ifndef NUMA
+			p = vm_page_alloc(NULL, 0, flags);
+#else
+			pc = pcpu_find(cpu);
+			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
+			if (__predict_false(p == NULL))
+				p = vm_page_alloc(NULL, 0, flags);
+#endif
+		}
+		if (__predict_false(p == NULL))
+			goto fail;
+		TAILQ_INSERT_TAIL(&alloctail, p, listq);
+	}
+	if ((addr = kva_alloc(bytes)) == 0)
+		goto fail;
+	zkva = addr;
+	TAILQ_FOREACH(p, &alloctail, listq) {
+		pmap_qenter(zkva, &p, 1);
+		zkva += PAGE_SIZE;
+	}
+	return ((void*)addr);
+ fail:
+	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
+		vm_page_unwire(p, PQ_NONE);
+		vm_page_free(p);
+	}
+	return (NULL);
+}
+
 /*
  * Allocates a number of pages from within an object
  *
@@ -1257,6 +1311,37 @@ page_free(void *mem, vm_size_t size, uint8_t flags)
 	kmem_free(vmem, (vm_offset_t)mem, size);
 }
 
+/*
+ * Frees pcpu zone allocations
+ *
+ * Arguments:
+ *	mem   A pointer to the memory to be freed
+ *	size  The size of the memory being freed
+ *	flags The original p->us_flags field
+ *
+ * Returns:
+ *	Nothing
+ */
+static void
+pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
+{
+	vm_offset_t sva, curva;
+	vm_paddr_t paddr;
+	vm_page_t m;
+
+	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
+	sva = (vm_offset_t)mem;
+	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
+		paddr = pmap_kextract(curva);
+		m = PHYS_TO_VM_PAGE(paddr);
+		vm_page_unwire(m, PQ_NONE);
+		vm_page_free(m);
+	}
+	pmap_qremove(sva, size >> PAGE_SHIFT);
+	kva_free(sva, size);
+}
+
+
 /*
  * Zero fill initializer
  *
@@ -1290,9 +1375,8 @@ keg_small_init(uma_keg_t keg)
 	if (keg->uk_flags & UMA_ZONE_PCPU) {
 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
 
-		slabsize = sizeof(struct pcpu);
-		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
-		    PAGE_SIZE);
+		slabsize = UMA_PCPU_ALLOC_SIZE;
+		keg->uk_ppera = ncpus;
 	} else {
 		slabsize = UMA_SLAB_SIZE;
 		keg->uk_ppera = 1;
@@ -1311,7 +1395,7 @@
 	keg->uk_rsize = rsize;
 
 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
-	    keg->uk_rsize < sizeof(struct pcpu),
+	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
 	    ("%s: size %u too large", __func__, keg->uk_rsize));
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
@@ -1529,6 +1613,8 @@ keg_ctor(void *mem, int size, void *udata, int flags)
 	else if (keg->uk_ppera == 1)
 		keg->uk_allocf = uma_small_alloc;
 #endif
+	else if (keg->uk_flags & UMA_ZONE_PCPU)
+		keg->uk_allocf = pcpu_page_alloc;
 	else
 		keg->uk_allocf = page_alloc;
 #ifdef UMA_MD_SMALL_ALLOC
@@ -1536,6 +1622,9 @@
 		keg->uk_freef = uma_small_free;
 	else
 #endif
+	if (keg->uk_flags & UMA_ZONE_PCPU)
+		keg->uk_freef = pcpu_page_free;
+	else
 		keg->uk_freef = page_free;
 
 	/*
diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c
index d51189eb7174..0ea747ccc30b 100644
--- a/sys/x86/acpica/srat.c
+++ b/sys/x86/acpica/srat.c
@@ -532,11 +532,15 @@ srat_set_cpus(void *dummy)
 		if (!cpu->enabled)
 			panic("SRAT: CPU with APIC ID %u is not known",
 			    pc->pc_apic_id);
+#ifdef NUMA
 		pc->pc_domain = cpu->domain;
-		CPU_SET(i, &cpuset_domain[cpu->domain]);
+#else
+		pc->pc_domain = 0;
+#endif
+		CPU_SET(i, &cpuset_domain[pc->pc_domain]);
 		if (bootverbose)
 			printf("SRAT: CPU %u has memory domain %d\n", i,
-			    cpu->domain);
+			    pc->pc_domain);
 	}
 
 	/* Last usage of the cpus array, unmap it. */
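
For reference, the sketch below is a minimal userland model of the layout the patch introduces; it is not part of the committed change and not kernel code. NCPU, PCPU_ALLOC_SIZE, and slot_for_cpu() are hypothetical stand-ins for mp_maxid + 1, UMA_PCPU_ALLOC_SIZE (PAGE_SIZE), and the stride arithmetic now used by zpcpu_get_cpu() and the per-arch counter_u64_read_one() helpers.

/*
 * Userland sketch of the per-CPU layout after this change: each CPU's
 * 64-bit counter slot lives at base + cpu * UMA_PCPU_ALLOC_SIZE (one
 * page per CPU) rather than base + cpu * sizeof(struct pcpu).
 * NCPU and PCPU_ALLOC_SIZE are illustrative stand-ins, not kernel names.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	NCPU		4	/* stand-in for mp_maxid + 1 */
#define	PCPU_ALLOC_SIZE	4096	/* stand-in for UMA_PCPU_ALLOC_SIZE (PAGE_SIZE) */

static inline uint64_t *
slot_for_cpu(void *base, int cpu)
{
	/* Same stride arithmetic as zpcpu_get_cpu()/counter_u64_read_one(). */
	return ((uint64_t *)((char *)base + (size_t)PCPU_ALLOC_SIZE * cpu));
}

int
main(void)
{
	void *base;
	uint64_t sum;
	int cpu;

	/* One page-sized slot per "CPU", modelling the pcpu zone slab. */
	base = calloc(NCPU, PCPU_ALLOC_SIZE);
	if (base == NULL)
		return (1);

	for (cpu = 0; cpu < NCPU; cpu++)
		*slot_for_cpu(base, cpu) = cpu + 1;	/* per-CPU updates */

	/* A counter_u64_fetch()-style sum across all per-CPU slots. */
	sum = 0;
	for (cpu = 0; cpu < NCPU; cpu++)
		sum += *slot_for_cpu(base, cpu);
	printf("fetched %ju\n", (uintmax_t)sum);	/* prints "fetched 10" */

	free(base);
	return (0);
}

The tradeoff noted in the commit message follows directly from this layout: each per-CPU slot set occupies a full PAGE_SIZE stride, so up to PAGE_SIZE/8 64-bit counters fit before another allocation is needed, and any unused remainder of each page is the memory overhead paid for domain-correct placement.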