Decrease latency by not wrapping the idle loop's potentially lengthy
search for a thread to steal inside a critical section.  Since this
allows the search to be preempted, restart the search if preemption
happens, because the results found earlier may no longer be valid.
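
In the code below, preemption during the search is detected with the
per-queue switch counters: they are sampled before the search starts
and compared again before committing to a steal.  A minimal
stand-alone sketch of that pattern (the counter and function names
are only illustrative; the real code sums tdq_switchcnt and
tdq_oldswitchcnt, see tdq_idled() in the diff):

    #include <stdatomic.h>

    /* Illustrative stand-in for this CPU's context-switch counter. */
    static _Atomic int cpu_switchcnt;

    static int
    steal_search(void)
    {
            int snap, victim;

    restart:
            snap = atomic_load(&cpu_switchcnt);
            victim = -1;
            /* ... potentially lengthy, preemptible search for a victim CPU ... */
            if (atomic_load(&cpu_switchcnt) != snap)
                    goto restart;   /* preempted: earlier results may be stale */
            return (victim);
    }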

Decrease the latency of starting a thread that may be assigned to
this CPU during the search by polling for incoming threads while the
search runs, and by switching to such a thread instead of continuing
the search.
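
Concretely, the search loop re-checks this CPU's own run-queue load
between probes of remote CPUs and stops as soon as a thread shows up.
A condensed, stand-alone sketch (tdq_load is the field actually used
in the diff; the type and loop here are simplified):

    struct cpu_queue {
            volatile int load;              /* stands in for tdq_load */
    };

    /*
     * Returns 0 when a thread arrived for this CPU (the caller switches
     * to it) and 1 when nothing was found and nothing arrived (go idle).
     */
    static int
    poll_and_search(struct cpu_queue *self, int ncandidates)
    {
            int i;

            for (i = 0; i < ncandidates; i++) {
                    if (self->load != 0)
                            return (0);     /* run the incoming thread instead */
                    /* ... probe candidate i for transferable threads ... */
            }
            return (1);
    }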

Test for stale search results and restart the search before going
through the expense of calling tdq_lock_pair().  Retry some of the
tests after grabbing the locks, since things may have changed while
waiting to acquire both of them.
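
The cheap test is made before taking the victim's lock and is then
repeated once both locks are held.  A condensed sketch of that
double-check, with the locking abstracted behind placeholder stubs
(lock_pair()/unlock_pair() stand in for tdq_lock_pair() and
tdq_unlock_pair(); the fields mirror tdq_load and tdq_transferable):

    struct q {
            volatile int load;
            volatile int transferable;
    };

    static int steal_thresh = 2;

    /* Placeholder stubs for the real pairwise queue locking. */
    static void lock_pair(struct q *a, struct q *b) { (void)a; (void)b; }
    static void unlock_pair(struct q *a, struct q *b) { (void)a; (void)b; }

    static int
    try_steal_from(struct q *self, struct q *victim)
    {
            /* Cheap unlocked test: skip the lock dance entirely if the
             * sched_highest() result is already visibly stale. */
            if (victim->load < steal_thresh || victim->transferable == 0)
                    return (0);
            lock_pair(self, victim);
            /* Things may have changed while waiting for both locks, so
             * repeat the test before actually moving a thread. */
            if (victim->load < steal_thresh || victim->transferable == 0) {
                    unlock_pair(self, victim);
                    return (0);
            }
            /* ... move a thread from victim to self ... */
            unlock_pair(self, victim);
            return (1);
    }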

Eliminate the special-case handling that used a steal threshold of 1
when stealing from an SMT peer.  A steal at that threshold can only
succeed if a thread has been assigned but our SMT peer has not yet
started executing it.  This is quite rare, and when it happens the
other SMT thread is generally waiting for the same tdq lock that we
hold, so the two SMT threads are basically racing to grab the same
spin lock.

Add the kern.sched.always_steal knob from a ULE patch by jeff@.
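
In sched_idletd() (see the diff below) the knob simply forces the
steal attempt even when the switch counters show no recent activity;
the decision has roughly this shape (the wrapper function is only
illustrative, the actual check is inline):

    /* kern.sched.always_steal: when non-zero, always run the stealer from
     * the idle thread instead of only after recent context switches. */
    static int always_steal = 0;

    static int
    should_try_steal(int switchcnt, int oldswitchcnt)
    {
            return (always_steal || switchcnt != oldswitchcnt);
    }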

Incorporate another idea from Jeff's ULE patch.  If sched_switch()
detects that the CPU is about to go idle, try to steal a thread
before switching to the idle thread.  Since the search for a thread
to steal has to be done inside a critical section in this context,
limit the impact on latency by adding the knob kern.sched.trysteal_limit
to bound the topological distance of the search, and don't restart
the search if we detect stale results.  If this search can't find a
stealable thread, the idle loop can do a more complete search.  Also
poll for threads being assigned to this CPU during the search and
switch to them instead of continuing the search.  This change is
responsible for the majority of the improvement in parallel
buildworld times.
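
The hook itself is small: when sched_switch() drops the last load off
the queue it calls the new tdq_trysteal(), whose outward walk of the
CPU topology gives up after kern.sched.trysteal_limit levels.  A
condensed, stand-alone sketch of that bounding (the group type and
helper names are illustrative; the full version with locking and the
load re-checks is in the diff below):

    struct cpu_grp {                        /* stand-in for struct cpu_group */
            struct cpu_grp *parent;
    };

    static int trysteal_limit = 2;          /* kern.sched.trysteal_limit */

    static void
    bounded_trysteal(struct cpu_grp *cg)
    {
            int level;

            /*
             * Walk outward through the topology, but stop after
             * trysteal_limit levels to bound the time spent in the
             * critical section; the idle loop can search further.
             */
            for (level = 1; cg != NULL && level <= trysteal_limit;
                cg = cg->parent, level++) {
                    /* ... look for a CPU in this group to steal from ... */
            }
    }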

In sched_balance_group() change the minimum threshold for stealing
a thread from 1 to 2.  Poaching a newly assigned thread from a CPU
that is waking up but hasn't yet switched from idle to that thread
is very rare and suffers the same lock race seen when stealing
threads in the idle loop.  Also use tdq_notify() to kick the
destination CPU instead of always sending an IPI.  Update a stale
comment; the number of transferable threads is not calculated.
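
Passing the moved thread to tdq_notify() is also why tdq_move() now
returns a struct thread pointer instead of a count.  A sketch of the
resulting caller shape in sched_balance_pair() (simplified names;
queue_move() and queue_notify() model tdq_move() and tdq_notify(),
which may or may not send an IPI, as the updated comment in the diff
notes):

    struct thread;                          /* opaque in this sketch */

    struct q {
            int load;
            int transferable;
    };

    /* Declarations only; these model tdq_move() and tdq_notify(). */
    struct thread *queue_move(struct q *from, struct q *to);
    void queue_notify(struct q *to, struct thread *td);

    static int
    balance_pair(struct q *high, struct q *low)
    {
            struct thread *td;

            td = NULL;
            /* Transfer a thread from high to low, then let the notify
             * routine decide whether the destination CPU needs an IPI. */
            if (high->transferable != 0 && high->load > low->load &&
                (td = queue_move(high, low)) != NULL)
                    queue_notify(low, td);
            return (td != NULL);
    }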

Reviewed by:	kib (earlier version)
Comments by:	avg, jeff, mav
MFC after:	1 month
Differential Revision:	https://reviews.freebsd.org/D12130
commit 97e9382d56 (parent dcd935dfd1)
Author:	Don Lewis
Date:	2018-02-23 00:12:51 +00:00
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=329844

@@ -240,9 +240,9 @@ struct tdq {
volatile int tdq_load; /* Aggregate load. */
volatile int tdq_cpu_idle; /* cpu_idle() is active. */
int tdq_sysload; /* For loadavg, !ITHD load. */
int tdq_transferable; /* Transferable thread count. */
short tdq_switchcnt; /* Switches this tick. */
short tdq_oldswitchcnt; /* Switches last tick. */
volatile int tdq_transferable; /* Transferable thread count. */
volatile short tdq_switchcnt; /* Switches this tick. */
volatile short tdq_oldswitchcnt; /* Switches last tick. */
u_char tdq_lowpri; /* Lowest priority thread. */
u_char tdq_ipipending; /* IPI pending. */
u_char tdq_idx; /* Current insert index. */
@@ -274,6 +274,8 @@ static int balance_interval = 128; /* Default set in sched_initticks(). */
static int affinity;
static int steal_idle = 1;
static int steal_thresh = 2;
static int always_steal = 0;
static int trysteal_limit = 2;
/*
* One thread queue per processor.
@@ -319,7 +321,7 @@ void tdq_print(int cpu);
static void runq_print(struct runq *rq);
static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
static int tdq_move(struct tdq *, struct tdq *);
static struct thread *tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct tdq *, struct thread *);
static struct thread *tdq_steal(struct tdq *, int);
@@ -841,7 +843,7 @@ sched_balance_group(struct cpu_group *cg)
CPU_FILL(&hmask);
for (;;) {
high = sched_highest(cg, hmask, 1);
high = sched_highest(cg, hmask, 2);
/* Stop if there is no more CPU with transferrable threads. */
if (high == -1)
break;
@@ -924,33 +926,32 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two)
static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
int moved;
struct thread *td;
int cpu;
tdq_lock_pair(high, low);
moved = 0;
td = NULL;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
* Transfer a thread from high to low.
*/
if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
(moved = tdq_move(high, low)) > 0) {
(td = tdq_move(high, low)) != NULL) {
/*
* In case the target isn't the current cpu IPI it to force a
* reschedule with the new workload.
* In case the target isn't the current cpu notify it of the
* new load, possibly sending an IPI to force it to reschedule.
*/
cpu = TDQ_ID(low);
if (cpu != PCPU_GET(cpuid))
ipi_cpu(cpu, IPI_PREEMPT);
tdq_notify(low, td);
}
tdq_unlock_pair(high, low);
return (moved);
return (td != NULL);
}
/*
* Move a thread from one thread queue to another.
*/
static int
static struct thread *
tdq_move(struct tdq *from, struct tdq *to)
{
struct td_sched *ts;
@@ -965,7 +966,7 @@ tdq_move(struct tdq *from, struct tdq *to)
cpu = TDQ_ID(to);
td = tdq_steal(tdq, cpu);
if (td == NULL)
return (0);
return (NULL);
ts = td_get_sched(td);
/*
* Although the run queue is locked the thread may be blocked. Lock
@@ -978,7 +979,7 @@ tdq_move(struct tdq *from, struct tdq *to)
ts->ts_cpu = cpu;
td->td_lock = TDQ_LOCKPTR(to);
tdq_add(to, td, SRQ_YIELDING);
return (1);
return (td);
}
/*
@@ -991,51 +992,80 @@ tdq_idled(struct tdq *tdq)
struct cpu_group *cg;
struct tdq *steal;
cpuset_t mask;
int thresh;
int cpu;
int cpu, switchcnt;
if (smp_started == 0 || steal_idle == 0)
if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL)
return (1);
CPU_FILL(&mask);
CPU_CLR(PCPU_GET(cpuid), &mask);
/* We don't want to be preempted while we're iterating. */
spinlock_enter();
for (cg = tdq->tdq_cg; cg != NULL; ) {
if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
thresh = steal_thresh;
else
thresh = 1;
cpu = sched_highest(cg, mask, thresh);
restart:
switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
for (cg = tdq->tdq_cg; ; ) {
cpu = sched_highest(cg, mask, steal_thresh);
/*
* We were assigned a thread but not preempted. Returning
* 0 here will cause our caller to switch to it.
*/
if (tdq->tdq_load)
return (0);
if (cpu == -1) {
cg = cg->cg_parent;
if (cg == NULL)
return (1);
continue;
}
steal = TDQ_CPU(cpu);
CPU_CLR(cpu, &mask);
/*
* The data returned by sched_highest() is stale and
* the chosen CPU no longer has an eligible thread.
*
* Testing this ahead of tdq_lock_pair() only catches
* this situation about 20% of the time on an 8 core
* 16 thread Ryzen 7, but it still helps performance.
*/
if (steal->tdq_load < steal_thresh ||
steal->tdq_transferable == 0)
goto restart;
tdq_lock_pair(tdq, steal);
if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
/*
* We were assigned a thread while waiting for the locks.
* Switch to it now instead of stealing a thread.
*/
if (tdq->tdq_load)
break;
/*
* The data returned by sched_highest() is stale and
* the chosen CPU no longer has an eligible thread, or
* we were preempted and the CPU loading info may be out
* of date. The latter is rare. In either case restart
* the search.
*/
if (steal->tdq_load < steal_thresh ||
steal->tdq_transferable == 0 ||
switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
tdq_unlock_pair(tdq, steal);
continue;
goto restart;
}
/*
* If a thread was added while interrupts were disabled don't
* steal one here. If we fail to acquire one due to affinity
* restrictions loop again with this cpu removed from the
* set.
* Steal the thread and switch to it.
*/
if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
tdq_unlock_pair(tdq, steal);
continue;
}
spinlock_exit();
TDQ_UNLOCK(steal);
mi_switch(SW_VOL | SWT_IDLE, NULL);
thread_unlock(curthread);
return (0);
if (tdq_move(steal, tdq) != NULL)
break;
/*
* We failed to acquire a thread even though it looked
* like one was available. This could be due to affinity
* restrictions or for other reasons. Loop again after
* removing this CPU from the set. The restart logic
* above does not restore this CPU to the set due to the
* likelihood of failing here again.
*/
CPU_CLR(cpu, &mask);
tdq_unlock_pair(tdq, steal);
}
spinlock_exit();
return (1);
TDQ_UNLOCK(steal);
mi_switch(SW_VOL | SWT_IDLE, NULL);
thread_unlock(curthread);
return (0);
}
/*
@@ -1834,6 +1864,88 @@ sched_lend_user_prio(struct thread *td, u_char prio)
td->td_flags |= TDF_NEEDRESCHED;
}
/*
* This tdq is about to idle. Try to steal a thread from another CPU before
* choosing the idle thread.
*/
static void
tdq_trysteal(struct tdq *tdq)
{
struct cpu_group *cg;
struct tdq *steal;
cpuset_t mask;
int cpu, i;
if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL)
return;
CPU_FILL(&mask);
CPU_CLR(PCPU_GET(cpuid), &mask);
/* We don't want to be preempted while we're iterating. */
spinlock_enter();
TDQ_UNLOCK(tdq);
for (i = 1, cg = tdq->tdq_cg; ; ) {
cpu = sched_highest(cg, mask, steal_thresh);
/*
* If a thread was added while interrupts were disabled don't
* steal one here.
*/
if (tdq->tdq_load > 0) {
TDQ_LOCK(tdq);
break;
}
if (cpu == -1) {
i++;
cg = cg->cg_parent;
if (cg == NULL || i > trysteal_limit) {
TDQ_LOCK(tdq);
break;
}
continue;
}
steal = TDQ_CPU(cpu);
/*
* The data returned by sched_highest() is stale and
* the chosen CPU no longer has an eligible thread.
*/
if (steal->tdq_load < steal_thresh ||
steal->tdq_transferable == 0)
continue;
tdq_lock_pair(tdq, steal);
/*
* If we get to this point, unconditionally exit the loop
* to bound the time spent in the critical section.
*
* If a thread was added while interrupts were disabled don't
* steal one here.
*/
if (tdq->tdq_load > 0) {
TDQ_UNLOCK(steal);
break;
}
/*
* The data returned by sched_highest() is stale and
* the chosen CPU no longer has an eligible thread.
*/
if (steal->tdq_load < steal_thresh ||
steal->tdq_transferable == 0) {
TDQ_UNLOCK(steal);
break;
}
/*
* If we fail to acquire one due to affinity restrictions,
* bail out and let the idle thread do a more complete search
* outside of a critical section.
*/
if (tdq_move(steal, tdq) == NULL) {
TDQ_UNLOCK(steal);
break;
}
TDQ_UNLOCK(steal);
break;
}
spinlock_exit();
}
/*
* Handle migration from sched_switch(). This happens only for
* cpu binding.
@@ -1946,6 +2058,8 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
TDQ_LOCK(tdq);
mtx = thread_lock_block(td);
tdq_load_rem(tdq, td);
if (tdq->tdq_load == 0)
tdq_trysteal(tdq);
}
#if (KTR_COMPILE & KTR_SCHED) != 0
@@ -2677,7 +2791,7 @@ sched_idletd(void *dummy)
}
switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
#ifdef SMP
if (switchcnt != oldswitchcnt) {
if (always_steal || switchcnt != oldswitchcnt) {
oldswitchcnt = switchcnt;
if (tdq_idled(tdq) == 0)
continue;
@@ -2714,6 +2828,15 @@ sched_idletd(void *dummy)
* to avoid race with tdq_notify.
*/
atomic_thread_fence_seq_cst();
/*
* Checking again after the fence picks up assigned
* threads often enough to make it worthwhile to do so in
* order to avoid calling cpu_idle().
*/
if (tdq->tdq_load != 0) {
tdq->tdq_cpu_idle = 0;
continue;
}
cpu_idle(switchcnt * 4 > sched_idlespinthresh);
tdq->tdq_cpu_idle = 0;
@@ -2948,6 +3071,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
"Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
"Minimum load on remote CPU before we'll steal");
SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit,
0, "Topological distance limit for stealing threads in sched_switch()");
SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0,
"Always run the stealer from the idle thread");
SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
"XML dump of detected CPU topology");