Bind TCP HPTS (pacer) threads to NUMA domains
Bind the TCP pacer threads to NUMA domains and build per-domain pacer-thread
lookup tables. These tables allow us to use the inpcb's NUMA domain
information to match an inpcb with a pacer thread on the same domain. The
motivation for this is to keep the TCP connection local to a NUMA domain as
much as possible.

Thanks to jhb for pre-reviewing an earlier version of the patch.

Reviewed by:	rrs
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D20134
commit fbc304aae0
parent 408cf30173
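The heart of the change is a small per-NUMA-domain lookup table: the pacer init code records which HPTS swi threads it managed to bind inside each domain, and hpts_cpuid() later hashes a connection's flowid over only the pacer CPUs in that connection's own domain. The sketch below is a minimal userspace model of that idea, not the kernel code: hpts_domain_info and hpts_domains mirror names from the diff, while MAX_DOMAINS, MAX_CPUS, record_binding(), pick_pacer_cpu(), and the example main() are hypothetical stand-ins for illustration only.

/*
 * Userspace model of the per-domain pacer lookup table built at init time
 * and consulted when choosing a pacer for a connection.  Hypothetical
 * stand-ins: MAX_DOMAINS (for MAXMEMDOM), MAX_CPUS (for MAXCPU).
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_DOMAINS     4
#define MAX_CPUS        64

struct hpts_domain_info {
        int count;              /* pacer threads bound in this domain */
        int cpu[MAX_CPUS];      /* cpu ids of those pacer threads */
};

static struct hpts_domain_info hpts_domains[MAX_DOMAINS];
static int ncpus = 8;

/* Record that the pacer thread for cpu 'i' was bound inside 'domain'. */
static void
record_binding(int domain, int i)
{
        struct hpts_domain_info *di = &hpts_domains[domain];

        di->cpu[di->count++] = i;
}

/*
 * Pick a pacer cpu for a connection: hash the flowid, but restrict the
 * hash to the connection's own NUMA domain when the table has entries,
 * falling back to a hash over all cpus otherwise.
 */
static int
pick_pacer_cpu(uint32_t flowid, int numa_domain)
{
        struct hpts_domain_info *di = &hpts_domains[numa_domain];

        if (di->count > 0)
                return (di->cpu[flowid % di->count]);
        return (flowid % ncpus);
}

int
main(void)
{
        int i;

        /* Pretend cpus 0-3 live in domain 0 and cpus 4-7 in domain 1. */
        for (i = 0; i < ncpus; i++)
                record_binding(i / 4, i);

        printf("flowid 0x1234 in domain 1 -> cpu %d\n",
            pick_pacer_cpu(0x1234, 1));
        return (0);
}

In the patch itself the mode is selected by the net.inet.tcp.bind_hptss tunable: 2 (the new default on non-RSS kernels) binds each pacer thread to its CPU's NUMA-domain cpuset via the new intr_event_bind_ithread_cpuset(), 1 keeps the old per-cpu intr_event_bind() behavior, and 0 leaves the threads unbound. The init code also falls back to 0 when there is only one domain or when some domain ends up with no bound pacer threads.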
@@ -380,6 +380,25 @@ intr_event_bind_ithread(struct intr_event *ie, int cpu)
 	return (_intr_event_bind(ie, cpu, false, true));
 }
 
+/*
+ * Bind an interrupt event's ithread to the specified cpuset.
+ */
+int
+intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs)
+{
+	lwpid_t id;
+
+	mtx_lock(&ie->ie_lock);
+	if (ie->ie_thread != NULL) {
+		id = ie->ie_thread->it_thread->td_tid;
+		mtx_unlock(&ie->ie_lock);
+		return (cpuset_setthread(id, cs));
+	} else {
+		mtx_unlock(&ie->ie_lock);
+	}
+	return (ENODEV);
+}
+
 static struct intr_event *
 intr_lookup(int irq)
 {

@@ -131,6 +131,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
+#include <vm/vm.h>
 
 #include <net/route.h>
 #include <net/vnet.h>

@@ -171,7 +172,7 @@ MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 #include <net/rss_config.h>
 static int tcp_bind_threads = 1;
 #else
-static int tcp_bind_threads = 0;
+static int tcp_bind_threads = 2;
 #endif
 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
 

@@ -207,6 +208,13 @@ static int32_t logging_on = 0;
 static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
 static int32_t tcp_hpts_precision = 120;
 
+struct hpts_domain_info {
+	int count;
+	int cpu[MAXCPU];
+};
+
+struct hpts_domain_info hpts_domains[MAXMEMDOM];
+
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
     &tcp_hpts_precision, 120,
     "Value for PRE() precision of callout");

@@ -1079,7 +1087,9 @@ hpts_random_cpu(struct inpcb *inp){
 static uint16_t
 hpts_cpuid(struct inpcb *inp){
 	u_int cpuid;
+#ifdef NUMA
+	struct hpts_domain_info *di;
+#endif
 
 	/*
 	 * If one has been set use it i.e. we want both in and out on the

@@ -1103,11 +1113,21 @@ hpts_cpuid(struct inpcb *inp){
 	 * unknown cpuids to curcpu. Not the best, but apparently better
 	 * than defaulting to swi 0.
 	 */
-	if (inp->inp_flowtype != M_HASHTYPE_NONE) {
+
+	if (inp->inp_flowtype == M_HASHTYPE_NONE)
+		return (hpts_random_cpu(inp));
+	/*
+	 * Hash to a thread based on the flowid. If we are using numa,
+	 * then restrict the hash to the numa domain where the inp lives.
+	 */
+#ifdef NUMA
+	if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
+		di = &hpts_domains[inp->inp_numa_domain];
+		cpuid = di->cpu[inp->inp_flowid % di->count];
+	} else
+#endif
 		cpuid = inp->inp_flowid % mp_ncpus;
-		return (cpuid);
-	}
-	cpuid = hpts_random_cpu(inp);
+
 	return (cpuid);
 #endif
 }

@@ -1781,8 +1801,11 @@ tcp_init_hptsi(void *st)
 	struct timeval tv;
 	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
+	struct pcpu *pc;
+	cpuset_t cs;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+	int count, domain;
 
 	tcp_pace.rp_proc = NULL;
 	tcp_pace.rp_num_hptss = ncpus;

@@ -1861,6 +1884,11 @@ tcp_init_hptsi(void *st)
 		}
 		callout_init(&hpts->co, 1);
 	}
+
+	/* Don't try to bind to NUMA domains if we don't have any */
+	if (vm_ndomains == 1 && tcp_bind_threads == 2)
+		tcp_bind_threads = 0;
+
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */

@@ -1875,9 +1903,20 @@ tcp_init_hptsi(void *st)
 			    hpts, i, error);
 		}
 		created++;
-		if (tcp_bind_threads) {
+		if (tcp_bind_threads == 1) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
+		} else if (tcp_bind_threads == 2) {
+			pc = pcpu_find(i);
+			domain = pc->pc_domain;
+			CPU_COPY(&cpuset_domain[domain], &cs);
+			if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
+			    == 0) {
+				bound++;
+				count = hpts_domains[domain].count;
+				hpts_domains[domain].cpu[count] = i;
+				hpts_domains[domain].count++;
+			}
 		}
 		tv.tv_sec = 0;
 		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;

@@ -1893,9 +1932,20 @@ tcp_init_hptsi(void *st)
 		    C_PREL(tcp_hpts_precision));
 		}
 	}
-	printf("TCP Hpts created %d swi interrupt thread and bound %d\n",
-	    created, bound);
-	return;
+	/*
+	 * If we somehow have an empty domain, fall back to choosing
+	 * among all htps threads.
+	 */
+	for (i = 0; i < vm_ndomains; i++) {
+		if (hpts_domains[i].count == 0) {
+			tcp_bind_threads = 0;
+			break;
+		}
+	}
+
+	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
+	    created, bound,
+	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
 }
 
 SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);

@@ -176,6 +176,8 @@ int intr_event_add_handler(struct intr_event *ie, const char *name,
 int	intr_event_bind(struct intr_event *ie, int cpu);
 int	intr_event_bind_irqonly(struct intr_event *ie, int cpu);
 int	intr_event_bind_ithread(struct intr_event *ie, int cpu);
+int	intr_event_bind_ithread_cpuset(struct intr_event *ie,
+	    cpuset_t *mask);
 int	intr_event_create(struct intr_event **event, void *source,
 	    int flags, int irq, void (*pre_ithread)(void *),
 	    void (*post_ithread)(void *), void (*post_filter)(void *),