From ac75ee9fa3027e2353cf38ae0bd3ed4ad69d1d63 Mon Sep 17 00:00:00 2001
From: Adrian Chadd <adrian@FreeBSD.org>
Date: Sat, 10 May 2014 00:53:36 +0000
Subject: [PATCH] Add in support to optionally pin the swi threads.

Under enough load, the swi's can actually be preempted and migrated
to other currently free cores.  When doing RSS experiments, this lead
to the per-CPU TCP timers not lining up any more with the RX CPU said
flows were ending up on, leading to increased lock contention.

Since there was a little pushback on flipping them on by default,
I've left the default at "don't pin."

The other less obvious problem here is that the default swi
is also the same as the destination swi for CPU #0.  So if one
pins the swi on CPU #0, there's no default floating swi.

A nice future project would be to create a separate swi for
the "default" floating swi, as well as per-CPU swis that are
(optionally) pinned.

Tested:

* parallel TCP tests (2 x 1g unfortunately for now);
  CPU: Intel(R) Xeon(R) CPU E5-2650

Note:

This is based on some initial investigation into RSS/TCP stack lock
contention on FreeBSD-HEAD whilst at Netflix in January 2014.
---
 sys/kern/kern_timeout.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index ad6db5bcc44c..ef20ed109c91 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -104,6 +104,14 @@ static int ncallout;
 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN, &ncallout, 0,
     "Number of entries in callwheel and size of timeout() preallocation");
 
+static int pin_default_swi = 0;
+static int pin_pcpu_swi = 0;
+
+SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN, &pin_default_swi,
+    0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
+SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN, &pin_pcpu_swi,
+    0, "Pin the per-CPU swis (except PCPU 0, which is also default");
+
 /*
  * TODO:
  *	allocate more timeout table slots when table overflows.
@@ -272,6 +280,12 @@ callout_callwheel_init(void *dummy)
 	callwheelsize = 1 << fls(ncallout);
 	callwheelmask = callwheelsize - 1;
 
+	/*
+	 * Fetch whether we're pinning the swi's or not.
+	 */
+	TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
+	TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
+
 	/*
 	 * Only cpu0 handles timeout(9) and receives a preallocation.
 	 *
@@ -355,6 +369,7 @@ start_softclock(void *dummy)
 	char name[MAXCOMLEN];
 #ifdef SMP
 	int cpu;
+	struct intr_event *ie;
 #endif
 
 	cc = CC_CPU(timeout_cpu);
@@ -362,6 +377,13 @@ start_softclock(void *dummy)
 	if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK,
 	    INTR_MPSAFE, &cc->cc_cookie))
 		panic("died while creating standard software ithreads");
+	if (pin_default_swi &&
+	    (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) {
+		printf("%s: timeout clock couldn't be pinned to cpu %d\n",
+		    __func__,
+		    timeout_cpu);
+	}
+
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		if (cpu == timeout_cpu)
@@ -370,9 +392,16 @@ start_softclock(void *dummy)
 		cc->cc_callout = NULL;	/* Only cpu0 handles timeout(9). */
 		callout_cpu_init(cc);
 		snprintf(name, sizeof(name), "clock (%d)", cpu);
-		if (swi_add(NULL, name, softclock, cc, SWI_CLOCK,
+		ie = NULL;
+		if (swi_add(&ie, name, softclock, cc, SWI_CLOCK,
 		    INTR_MPSAFE, &cc->cc_cookie))
 			panic("died while creating standard software ithreads");
+		if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) {
+			printf("%s: per-cpu clock couldn't be pinned to "
+			    "cpu %d\n",
+			    __func__,
+			    cpu);
+		}
 	}
 #endif
 }