x86: Add dynamic interrupt rebalancing

Add an option to dynamically rebalance interrupts across cores
(hw.intrbalance); off by default.

The goal is to minimize preemption.  By placing interrupt sources on
distinct CPUs, their ithreads are preferentially scheduled on distinct
CPUs, which reduces both overall preemption and latency.  In our workflow
it reduced "fighting" between two high-frequency interrupt sources; the
latency reduction was demonstrated with, e.g., SPEC2008.

Submitted by:	jeff@ (earlier version)
Reviewed by:	kib@
Sponsored by:	Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D10435
commit dc6a82801d (parent 96dd05dd7d)
Conrad Meyer, 2017-08-16 18:48:53 +00:00
3 changed files with 141 additions and 7 deletions
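Both knobs introduced by this change are ordinary sysctls, so they can be
exercised from userspace without a reboot. What follows is a minimal sketch,
not part of the commit: it assumes only the hw.intrbalance and hw.intrs names
declared in the diff below, plus the standard sysctlbyname(3) interface.

/* intrbalance.c: enable rebalancing, then dump the per-source report. */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int interval = 60;	/* seconds between rebalance passes */
	char *buf;
	size_t len;

	/* Enable the balancer; intr_balance() re-reads this each pass. */
	if (sysctlbyname("hw.intrbalance", NULL, NULL, &interval,
	    sizeof(interval)) != 0)
		err(1, "hw.intrbalance");

	/* Standard two-step fetch for a string sysctl: size, then data. */
	if (sysctlbyname("hw.intrs", NULL, &len, NULL, 0) != 0)
		err(1, "hw.intrs (size)");
	if ((buf = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("hw.intrs", buf, &len, NULL, 0) != 0)
		err(1, "hw.intrs");
	fputs(buf, stdout);	/* "interrupt:number @cpu: count" lines */
	free(buf);
	return (0);
}

The same can be done with `sysctl hw.intrbalance=60` from the shell. Even with
the knob at zero, the balance task re-enqueues itself once a minute (see
intr_balance() below), so enabling it takes effect within about 60 seconds.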


@@ -130,6 +130,7 @@ struct intsrc {
 	u_long *is_straycount;
 	u_int is_index;
 	u_int is_handlers;
+	u_int is_cpu;
 };
 
 struct trapframe;

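The new is_cpu field caches the CPU to which a source was last successfully
bound. As the hunks below show, it is updated only when pic_assign_cpu()
succeeds, and it is what the hw.intrs report and the balancer consult rather
than querying the PIC.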

@@ -45,10 +45,14 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
+#include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 #include <machine/clock.h>
 #include <machine/intr_machdep.h>
@@ -71,6 +75,12 @@ typedef void (*mask_fn)(void *);
 
 static int intrcnt_index;
 static struct intsrc *interrupt_sources[NUM_IO_INTS];
+static struct intsrc *interrupt_sorted[NUM_IO_INTS];
+CTASSERT(sizeof(interrupt_sources) == sizeof(interrupt_sorted));
+static int intrbalance;
+SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0,
+    "Interrupt auto-balance interval (seconds). Zero disables.");
+static struct timeout_task intrbalance_task;
 static struct sx intrsrc_lock;
 static struct mtx intrpic_lock;
 static struct mtx intrcnt_lock;
@@ -325,6 +335,8 @@ intr_assign_cpu(void *arg, int cpu)
 		isrc = arg;
 		sx_xlock(&intrsrc_lock);
 		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
+		if (error == 0)
+			isrc->is_cpu = cpu;
 		sx_xunlock(&intrsrc_lock);
 	} else
 		error = 0;
@@ -559,6 +571,7 @@ static void
 intr_shuffle_irqs(void *arg __unused)
 {
 	struct intsrc *isrc;
+	u_int cpu;
 	int i;
 
 	/* Don't bother on UP. */
@@ -578,13 +591,15 @@ intr_shuffle_irqs(void *arg __unused)
 			 * this is careful to only advance the
 			 * round-robin if the CPU assignment succeeds.
 			 */
-			if (isrc->is_event->ie_cpu != NOCPU)
-				(void)isrc->is_pic->pic_assign_cpu(isrc,
-				    cpu_apic_ids[isrc->is_event->ie_cpu]);
-			else if (isrc->is_pic->pic_assign_cpu(isrc,
-			    cpu_apic_ids[current_cpu]) == 0)
-				(void)intr_next_cpu();
+			cpu = isrc->is_event->ie_cpu;
+			if (cpu == NOCPU)
+				cpu = current_cpu;
+			if (isrc->is_pic->pic_assign_cpu(isrc,
+			    cpu_apic_ids[cpu]) == 0) {
+				isrc->is_cpu = cpu;
+				if (isrc->is_event->ie_cpu == NOCPU)
+					intr_next_cpu();
+			}
 		}
 	}
 	sx_xunlock(&intrsrc_lock);
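The rework above folds the two assignment paths into one: the target CPU is
computed first (the event's bound CPU if any, otherwise the round-robin
candidate), pic_assign_cpu() is attempted once, and only on success is is_cpu
recorded and, in the round-robin case, current_cpu advanced via
intr_next_cpu().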
@@ -592,6 +607,123 @@ intr_shuffle_irqs(void *arg __unused)
 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
     NULL);
 #endif
+
+/*
+ * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
+ */
+static int
+sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	struct intsrc *isrc;
+	int error;
+	int i;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	sx_slock(&intrsrc_lock);
+	for (i = 0; i < NUM_IO_INTS; i++) {
+		isrc = interrupt_sources[i];
+		if (isrc == NULL)
+			continue;
+		sbuf_printf(&sbuf, "%s:%d @%d: %ld\n",
+		    isrc->is_event->ie_fullname,
+		    isrc->is_index,
+		    isrc->is_cpu,
+		    *isrc->is_count);
+	}
+	sx_sunlock(&intrsrc_lock);
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW,
+    0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count");
+
+/*
+ * Compare two, possibly NULL, entries in the interrupt source array
+ * by load.
+ */
+static int
+intrcmp(const void *one, const void *two)
+{
+	const struct intsrc *i1, *i2;
+
+	i1 = *(const struct intsrc * const *)one;
+	i2 = *(const struct intsrc * const *)two;
+	if (i1 != NULL && i2 != NULL)
+		return (*i1->is_count - *i2->is_count);
+	if (i1 != NULL)
+		return (1);
+	if (i2 != NULL)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Balance IRQs across available CPUs according to load.
+ */
+static void
+intr_balance(void *dummy __unused, int pending __unused)
+{
+	struct intsrc *isrc;
+	int interval;
+	u_int cpu;
+	int i;
+
+	interval = intrbalance;
+	if (interval == 0)
+		goto out;
+
+	/*
+	 * Sort interrupts according to count.
+	 */
+	sx_xlock(&intrsrc_lock);
+	memcpy(interrupt_sorted, interrupt_sources, sizeof(interrupt_sorted));
+	qsort(interrupt_sorted, NUM_IO_INTS, sizeof(interrupt_sorted[0]),
+	    intrcmp);
+
+	/*
+	 * Restart the scan from the same location to avoid moving in the
+	 * common case.
+	 */
+	current_cpu = 0;
+
+	/*
+	 * Assign round-robin from most loaded to least.
+	 */
+	for (i = NUM_IO_INTS - 1; i >= 0; i--) {
+		isrc = interrupt_sorted[i];
+		if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
+			continue;
+		cpu = current_cpu;
+		intr_next_cpu();
+		if (isrc->is_cpu != cpu &&
+		    isrc->is_pic->pic_assign_cpu(isrc,
+		    cpu_apic_ids[cpu]) == 0)
+			isrc->is_cpu = cpu;
+	}
+	sx_xunlock(&intrsrc_lock);
+out:
+	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
+	    interval ? hz * interval : hz * 60);
+}
+
+static void
+intr_balance_init(void *dummy __unused)
+{
+
+	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
+	    NULL);
+	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
+}
+SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);
 #else
 /*
  * Always route interrupts to the current processor in the UP case.
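One sharp edge in the code above: intrcmp() returns the difference of two
u_long counters narrowed to int, so counts differing by more than INT_MAX can
truncate and even invert the sign. qsort() needs only the sign, so a
comparison-based variant avoids the problem. A hedged sketch, not part of the
commit (intrcmp_safe is a hypothetical name):

static int
intrcmp_safe(const void *one, const void *two)
{
	const struct intsrc *i1, *i2;

	i1 = *(const struct intsrc * const *)one;
	i2 = *(const struct intsrc * const *)two;
	/* NULL slots sort first, as in intrcmp(). */
	if (i1 == NULL || i2 == NULL)
		return ((i1 != NULL) - (i2 != NULL));
	/* Compare instead of subtracting; yields only -1, 0, or 1. */
	if (*i1->is_count < *i2->is_count)
		return (-1);
	return (*i1->is_count > *i2->is_count);
}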