Add more fine-grained kernel options for NUMA support.

VM_NUMA_ALLOC is used to enable use of domain-aware memory allocation in
the virtual memory system.  DEVICE_NUMA is used to enable reporting of
device domain affinity via interfaces such as bus_get_domain().

MAXMEMDOM must still be set to a value greater than 1 for any NUMA support
to be effective.  Note that 'cpuset -gd' always works if MAXMEMDOM is
enabled and the system supports NUMA.

Reviewed by:	kib
Differential Revision:	https://reviews.freebsd.org/D5782
This commit is contained in:
John Baldwin 2016-04-09 13:58:04 +00:00
parent 0ff814e854
commit 62d70a8174
10 changed files with 99 additions and 43 deletions

View File

@ -229,7 +229,15 @@ options MAXCPU=32
# MAXMEMDOM defines the maximum number of memory domains that can boot in the # MAXMEMDOM defines the maximum number of memory domains that can boot in the
# system. A default value should already be defined by every architecture. # system. A default value should already be defined by every architecture.
options MAXMEMDOM=1 options MAXMEMDOM=2
# VM_NUMA_ALLOC enables use of memory domain-aware allocation in the VM
# system.
options VM_NUMA_ALLOC
# DEVICE_NUMA enables reporting of domain affinity of I/O devices via
# bus_get_domain(), etc.
options DEVICE_NUMA
# ADAPTIVE_MUTEXES changes the behavior of blocking mutexes to spin # ADAPTIVE_MUTEXES changes the behavior of blocking mutexes to spin
# if the thread that currently owns the mutex is executing on another # if the thread that currently owns the mutex is executing on another

View File

@ -90,6 +90,7 @@ COMPAT_LINUXKPI opt_compat.h
COMPILING_LINT opt_global.h COMPILING_LINT opt_global.h
CY_PCI_FASTINTR CY_PCI_FASTINTR
DEADLKRES opt_watchdog.h DEADLKRES opt_watchdog.h
DEVICE_NUMA
EXT_RESOURCES opt_global.h EXT_RESOURCES opt_global.h
DIRECTIO DIRECTIO
FILEMON opt_dontuse.h FILEMON opt_dontuse.h
@ -603,6 +604,7 @@ VM_KMEM_SIZE opt_vm.h
VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h
VM_KMEM_SIZE_MAX opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h
VM_NRESERVLEVEL opt_vm.h VM_NRESERVLEVEL opt_vm.h
VM_NUMA_ALLOC opt_vm.h
VM_LEVEL_0_ORDER opt_vm.h VM_LEVEL_0_ORDER opt_vm.h
NO_SWAPPING opt_vm.h NO_SWAPPING opt_vm.h
MALLOC_MAKE_FAILURES opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h

View File

@ -31,6 +31,8 @@
__FBSDID("$FreeBSD$"); __FBSDID("$FreeBSD$");
#include "opt_acpi.h" #include "opt_acpi.h"
#include "opt_device_numa.h"
#include <sys/param.h> #include <sys/param.h>
#include <sys/kernel.h> #include <sys/kernel.h>
#include <sys/proc.h> #include <sys/proc.h>
@ -1083,7 +1085,7 @@ acpi_hint_device_unit(device_t acdev, device_t child, const char *name,
int int
acpi_parse_pxm(device_t dev, int *domain) acpi_parse_pxm(device_t dev, int *domain)
{ {
#if MAXMEMDOM > 1 #ifdef DEVICE_NUMA
ACPI_HANDLE h; ACPI_HANDLE h;
int d, pxm; int d, pxm;

View File

@ -502,9 +502,7 @@ SYSCTL_DECL(_debug_acpi);
* *
* Returns the VM domain ID if found, or -1 if not found / invalid. * Returns the VM domain ID if found, or -1 if not found / invalid.
*/ */
#if MAXMEMDOM > 1
extern int acpi_map_pxm_to_vm_domainid(int pxm); extern int acpi_map_pxm_to_vm_domainid(int pxm);
#endif
extern int acpi_get_domain(device_t dev, device_t child, int *domain); extern int acpi_get_domain(device_t dev, device_t child, int *domain);
extern int acpi_parse_pxm(device_t dev, int *domain); extern int acpi_parse_pxm(device_t dev, int *domain);

View File

@ -831,7 +831,7 @@ struct cpuset *
cpuset_thread0(void) cpuset_thread0(void)
{ {
struct cpuset *set; struct cpuset *set;
int error; int error, i;
cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0); NULL, NULL, UMA_ALIGN_PTR, 0);
@ -863,9 +863,15 @@ cpuset_thread0(void)
*/ */
cpuset_unr = new_unrhdr(2, INT_MAX, NULL); cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
/* MD Code is responsible for initializing sets if vm_ndomains > 1. */ /*
if (vm_ndomains == 1) * If MD code has not initialized per-domain cpusets, place all
CPU_COPY(&all_cpus, &cpuset_domain[0]); * CPUs in domain 0.
*/
for (i = 0; i < MAXMEMDOM; i++)
if (!CPU_EMPTY(&cpuset_domain[i]))
goto domains_set;
CPU_COPY(&all_cpus, &cpuset_domain[0]);
domains_set:
return (set); return (set);
} }
@ -1118,7 +1124,7 @@ sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
error = intr_getaffinity(uap->id, mask); error = intr_getaffinity(uap->id, mask);
break; break;
case CPU_WHICH_DOMAIN: case CPU_WHICH_DOMAIN:
if (uap->id < 0 || uap->id >= vm_ndomains) if (uap->id < 0 || uap->id >= MAXMEMDOM)
error = ESRCH; error = ESRCH;
else else
CPU_COPY(&cpuset_domain[uap->id], mask); CPU_COPY(&cpuset_domain[uap->id], mask);

View File

@ -39,7 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h> #include <sys/kernel.h>
#include <sys/malloc.h> #include <sys/malloc.h>
#include <sys/mutex.h> #include <sys/mutex.h>
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
#include <sys/proc.h> #include <sys/proc.h>
#endif #endif
#include <sys/queue.h> #include <sys/queue.h>
@ -64,7 +64,7 @@ __FBSDID("$FreeBSD$");
static __inline int static __inline int
vm_domain_rr_selectdomain(int skip_domain) vm_domain_rr_selectdomain(int skip_domain)
{ {
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
struct thread *td; struct thread *td;
td = curthread; td = curthread;
@ -188,8 +188,13 @@ vm_domain_policy_validate(const struct vm_domain_policy *vp)
return (-1); return (-1);
case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIXED_DOMAIN:
case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
#ifdef VM_NUMA_ALLOC
if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains) if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains)
return (0); return (0);
#else
if (vp->p.domain == 0)
return (0);
#endif
return (-1); return (-1);
default: default:
return (-1); return (-1);
@ -221,6 +226,7 @@ vm_domain_iterator_set(struct vm_domain_iterator *vi,
vm_domain_policy_type_t vt, int domain) vm_domain_policy_type_t vt, int domain)
{ {
#ifdef VM_NUMA_ALLOC
switch (vt) { switch (vt) {
case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIXED_DOMAIN:
vi->policy = VM_POLICY_FIXED_DOMAIN; vi->policy = VM_POLICY_FIXED_DOMAIN;
@ -249,6 +255,10 @@ vm_domain_iterator_set(struct vm_domain_iterator *vi,
vi->n = vm_ndomains; vi->n = vm_ndomains;
break; break;
} }
#else
vi->domain = 0;
vi->n = 1;
#endif
return (0); return (0);
} }
@ -259,6 +269,8 @@ static inline void
_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, _vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
const struct vm_domain_policy *vt) const struct vm_domain_policy *vt)
{ {
#ifdef VM_NUMA_ALLOC
/* /*
* Initialise the iterator. * Initialise the iterator.
* *
@ -300,6 +312,10 @@ _vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
vi->n = vm_ndomains; vi->n = vm_ndomains;
break; break;
} }
#else
vi->domain = 0;
vi->n = 1;
#endif
} }
void void
@ -334,6 +350,7 @@ vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
if (vi->n <= 0) if (vi->n <= 0)
return (-1); return (-1);
#ifdef VM_NUMA_ALLOC
switch (vi->policy) { switch (vi->policy) {
case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIXED_DOMAIN:
case VM_POLICY_FIRST_TOUCH: case VM_POLICY_FIRST_TOUCH:
@ -358,6 +375,10 @@ vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
vi->n--; vi->n--;
break; break;
} }
#else
*domain = 0;
vi->n--;
#endif
return (0); return (0);
} }

View File

@ -1656,12 +1656,12 @@ static void
vm_pageout(void) vm_pageout(void)
{ {
int error; int error;
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
int i; int i;
#endif #endif
swap_pager_swap_init(); swap_pager_swap_init();
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
for (i = 1; i < vm_ndomains; i++) { for (i = 1; i < vm_ndomains; i++) {
error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
curproc, NULL, 0, 0, "dom%d", i); curproc, NULL, 0, 0, "dom%d", i);

View File

@ -48,9 +48,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h> #include <sys/kernel.h>
#include <sys/malloc.h> #include <sys/malloc.h>
#include <sys/mutex.h> #include <sys/mutex.h>
#if MAXMEMDOM > 1
#include <sys/proc.h> #include <sys/proc.h>
#endif
#include <sys/queue.h> #include <sys/queue.h>
#include <sys/rwlock.h> #include <sys/rwlock.h>
#include <sys/sbuf.h> #include <sys/sbuf.h>
@ -73,8 +71,10 @@ __FBSDID("$FreeBSD$");
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
"Too many physsegs."); "Too many physsegs.");
#ifdef VM_NUMA_ALLOC
struct mem_affinity *mem_affinity; struct mem_affinity *mem_affinity;
int *mem_locality; int *mem_locality;
#endif
int vm_ndomains = 1; int vm_ndomains = 1;
@ -144,7 +144,7 @@ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD, SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
@ -159,7 +159,7 @@ SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
static struct mtx vm_default_policy_mtx; static struct mtx vm_default_policy_mtx;
MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
MTX_DEF); MTX_DEF);
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
static struct vm_domain_policy vm_default_policy = static struct vm_domain_policy vm_default_policy =
VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
#else #else
@ -277,7 +277,7 @@ vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
static __inline int static __inline int
vm_rr_selectdomain(void) vm_rr_selectdomain(void)
{ {
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
struct thread *td; struct thread *td;
td = curthread; td = curthread;
@ -303,13 +303,13 @@ vm_rr_selectdomain(void)
static void static void
vm_policy_iterator_init(struct vm_domain_iterator *vi) vm_policy_iterator_init(struct vm_domain_iterator *vi)
{ {
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
struct vm_domain_policy lcl; struct vm_domain_policy lcl;
#endif #endif
vm_domain_iterator_init(vi); vm_domain_iterator_init(vi);
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
/* Copy out the thread policy */ /* Copy out the thread policy */
vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
if (lcl.p.policy != VM_POLICY_NONE) { if (lcl.p.policy != VM_POLICY_NONE) {
@ -433,7 +433,7 @@ int
vm_phys_mem_affinity(int f, int t) vm_phys_mem_affinity(int f, int t)
{ {
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
if (mem_locality == NULL) if (mem_locality == NULL)
return (-1); return (-1);
if (f >= vm_ndomains || t >= vm_ndomains) if (f >= vm_ndomains || t >= vm_ndomains)
@ -444,7 +444,7 @@ vm_phys_mem_affinity(int f, int t)
#endif #endif
} }
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
/* /*
* Outputs the VM locality table. * Outputs the VM locality table.
*/ */
@ -520,6 +520,7 @@ _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
static void static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{ {
#ifdef VM_NUMA_ALLOC
int i; int i;
if (mem_affinity == NULL) { if (mem_affinity == NULL) {
@ -544,6 +545,9 @@ vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
mem_affinity[i].domain); mem_affinity[i].domain);
start = mem_affinity[i].end; start = mem_affinity[i].end;
} }
#else
_vm_phys_create_seg(start, end, 0);
#endif
} }
/* /*

View File

@ -99,7 +99,7 @@ int vm_phys_mem_affinity(int f, int t);
static inline struct vm_domain * static inline struct vm_domain *
vm_phys_domain(vm_page_t m) vm_phys_domain(vm_page_t m)
{ {
#if MAXMEMDOM > 1 #ifdef VM_NUMA_ALLOC
int domn, segind; int domn, segind;
/* XXXKIB try to assert that the page is managed */ /* XXXKIB try to assert that the page is managed */

View File

@ -28,6 +28,8 @@
#include <sys/cdefs.h> #include <sys/cdefs.h>
__FBSDID("$FreeBSD$"); __FBSDID("$FreeBSD$");
#include "opt_vm.h"
#include <sys/param.h> #include <sys/param.h>
#include <sys/bus.h> #include <sys/bus.h>
#include <sys/kernel.h> #include <sys/kernel.h>
@ -62,7 +64,8 @@ int num_mem;
static ACPI_TABLE_SRAT *srat; static ACPI_TABLE_SRAT *srat;
static vm_paddr_t srat_physaddr; static vm_paddr_t srat_physaddr;
static int vm_domains[VM_PHYSSEG_MAX]; static int domain_pxm[MAXMEMDOM];
static int ndomain;
static ACPI_TABLE_SLIT *slit; static ACPI_TABLE_SLIT *slit;
static vm_paddr_t slit_physaddr; static vm_paddr_t slit_physaddr;
@ -145,8 +148,10 @@ parse_slit(void)
acpi_unmap_table(slit); acpi_unmap_table(slit);
slit = NULL; slit = NULL;
#ifdef VM_NUMA_ALLOC
/* Tell the VM about it! */ /* Tell the VM about it! */
mem_locality = vm_locality_table; mem_locality = vm_locality_table;
#endif
return (0); return (0);
} }
@ -340,48 +345,46 @@ renumber_domains(void)
int i, j, slot; int i, j, slot;
/* Enumerate all the domains. */ /* Enumerate all the domains. */
vm_ndomains = 0; ndomain = 0;
for (i = 0; i < num_mem; i++) { for (i = 0; i < num_mem; i++) {
/* See if this domain is already known. */ /* See if this domain is already known. */
for (j = 0; j < vm_ndomains; j++) { for (j = 0; j < ndomain; j++) {
if (vm_domains[j] >= mem_info[i].domain) if (domain_pxm[j] >= mem_info[i].domain)
break; break;
} }
if (j < vm_ndomains && vm_domains[j] == mem_info[i].domain) if (j < ndomain && domain_pxm[j] == mem_info[i].domain)
continue; continue;
/* Insert the new domain at slot 'j'. */ /* Insert the new domain at slot 'j'. */
slot = j; slot = j;
for (j = vm_ndomains; j > slot; j--) for (j = ndomain; j > slot; j--)
vm_domains[j] = vm_domains[j - 1]; domain_pxm[j] = domain_pxm[j - 1];
vm_domains[slot] = mem_info[i].domain; domain_pxm[slot] = mem_info[i].domain;
vm_ndomains++; ndomain++;
if (vm_ndomains > MAXMEMDOM) { if (ndomain > MAXMEMDOM) {
vm_ndomains = 1; ndomain = 1;
printf("SRAT: Too many memory domains\n"); printf("SRAT: Too many memory domains\n");
return (EFBIG); return (EFBIG);
} }
} }
/* Renumber each domain to its index in the sorted 'domains' list. */ /* Renumber each domain to its index in the sorted 'domain_pxm' list. */
for (i = 0; i < vm_ndomains; i++) { for (i = 0; i < ndomain; i++) {
/* /*
* If the domain is already the right value, no need * If the domain is already the right value, no need
* to renumber. * to renumber.
*/ */
if (vm_domains[i] == i) if (domain_pxm[i] == i)
continue; continue;
/* Walk the cpu[] and mem_info[] arrays to renumber. */ /* Walk the cpu[] and mem_info[] arrays to renumber. */
for (j = 0; j < num_mem; j++) for (j = 0; j < num_mem; j++)
if (mem_info[j].domain == vm_domains[i]) if (mem_info[j].domain == domain_pxm[i])
mem_info[j].domain = i; mem_info[j].domain = i;
for (j = 0; j <= MAX_APIC_ID; j++) for (j = 0; j <= MAX_APIC_ID; j++)
if (cpus[j].enabled && cpus[j].domain == vm_domains[i]) if (cpus[j].enabled && cpus[j].domain == domain_pxm[i])
cpus[j].domain = i; cpus[j].domain = i;
} }
KASSERT(vm_ndomains > 0,
("renumber_domains: invalid final vm_ndomains setup"));
return (0); return (0);
} }
@ -416,8 +419,11 @@ parse_srat(void)
return (-1); return (-1);
} }
#ifdef VM_NUMA_ALLOC
/* Point vm_phys at our memory affinity table. */ /* Point vm_phys at our memory affinity table. */
vm_ndomains = ndomain;
mem_affinity = mem_info; mem_affinity = mem_info;
#endif
return (0); return (0);
} }
@ -495,12 +501,21 @@ acpi_map_pxm_to_vm_domainid(int pxm)
{ {
int i; int i;
for (i = 0; i < vm_ndomains; i++) { for (i = 0; i < ndomain; i++) {
if (vm_domains[i] == pxm) if (domain_pxm[i] == pxm)
return (i); return (i);
} }
return (-1); return (-1);
} }
#else /* MAXMEMDOM == 1 */
/*
 * Stub for kernels built with MAXMEMDOM == 1 (the #else branch of the
 * MAXMEMDOM > 1 conditional).  The argument is ignored and the function
 * always returns -1, the value the header documents for "not found /
 * invalid", so callers can use this function unconditionally.
 */
int
acpi_map_pxm_to_vm_domainid(int pxm)
{
return (-1);
}
#endif /* MAXMEMDOM > 1 */ #endif /* MAXMEMDOM > 1 */