From 155b6ca12beb0400b88ba43ea5ee2a83cfd09f5a Mon Sep 17 00:00:00 2001
From: Jeff Roberson
Date: Fri, 5 Jan 2007 23:45:38 +0000
Subject: [PATCH] - Fix a comparison in sched_choose() that caused cpus to be
 constantly marked idle, thus breaking cpu load balancing.
 - Change sched_interact_update() to fix cases where the stored history has
 expanded significantly rather than handling them in the callers.  This
 fixes a case where sched_priority() could compute a bad value.
 - Add a sysctl to disable the global load balancer for experimentation.
---
 sys/kern/sched_ule.c | 60 ++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 38 insertions(+), 22 deletions(-)

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index ca65043238cd..e117782d3bd9 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -167,6 +167,7 @@ static int sched_interact = SCHED_INTERACT_THRESH;
 static int realstathz;
 static int tickincr;
 static int sched_slice;
+static int sched_rebalance = 1;
 
 /*
  * tdq - per processor runqs and statistics.
@@ -428,10 +429,12 @@ sched_smp_tick(void)
 	struct tdq *tdq;
 
 	tdq = TDQ_SELF();
-	if (ticks >= bal_tick)
-		sched_balance();
-	if (ticks >= gbal_tick && balance_groups)
-		sched_balance_groups();
+	if (sched_rebalance) {
+		if (ticks >= bal_tick)
+			sched_balance();
+		if (ticks >= gbal_tick && balance_groups)
+			sched_balance_groups();
+	}
 	/*
 	 * We could have been assigned a non real-time thread without an
 	 * IPI.
 	 */
@@ -688,6 +691,9 @@ tdq_notify(struct td_sched *ts, int cpu)
 		*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
 	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
 		(uintptr_t)ts->ts_assign, (uintptr_t)ts));
+	/* Only ipi for realtime/ithd priorities */
+	if (ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE)
+		return;
 	/*
 	 * Without sched_lock we could lose a race where we set NEEDRESCHED
 	 * on a thread that is switched out before the IPI is delivered.  This
@@ -696,8 +702,7 @@ tdq_notify(struct td_sched *ts, int cpu)
 	 */
 	pcpu = pcpu_find(cpu);
 	td = pcpu->pc_curthread;
-	if (ts->ts_thread->td_priority < td->td_priority ||
-	    td == pcpu->pc_idlethread) {
+	if (ts->ts_thread->td_priority < td->td_priority) {
 		td->td_flags |= TDF_NEEDRESCHED;
 		ipi_selected(1 << cpu, IPI_AST);
 	}
@@ -1074,29 +1079,44 @@ sched_priority(struct thread *td)
 /*
  * This routine enforces a maximum limit on the amount of scheduling history
  * kept.  It is called after either the slptime or runtime is adjusted.
- * This routine will not operate correctly when slp or run times have been
- * adjusted to more than double their maximum.
  */
 static void
 sched_interact_update(struct thread *td)
 {
+	struct td_sched *ts;
 	int sum;
 
-	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
+	ts = td->td_sched;
+	sum = ts->skg_runtime + ts->skg_slptime;
 	if (sum < SCHED_SLP_RUN_MAX)
 		return;
+	/*
+	 * This only happens from two places:
+	 * 1) We have added an unusual amount of run time from fork_exit.
+	 * 2) We have added an unusual amount of sleep time from sched_wakeup().
+	 */
+	if (sum > SCHED_SLP_RUN_MAX * 2) {
+		if (ts->skg_runtime > ts->skg_slptime) {
+			ts->skg_runtime = SCHED_SLP_RUN_MAX;
+			ts->skg_slptime = 1;
+		} else {
+			ts->skg_slptime = SCHED_SLP_RUN_MAX;
+			ts->skg_runtime = 1;
+		}
+		return;
+	}
 	/*
 	 * If we have exceeded by more than 1/5th then the algorithm below
 	 * will not bring us back into range.  Dividing by two here forces
 	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
 	 */
 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
-		td->td_sched->skg_runtime /= 2;
-		td->td_sched->skg_slptime /= 2;
+		ts->skg_runtime /= 2;
+		ts->skg_slptime /= 2;
 		return;
 	}
-	td->td_sched->skg_runtime = (td->td_sched->skg_runtime / 5) * 4;
-	td->td_sched->skg_slptime = (td->td_sched->skg_slptime / 5) * 4;
+	ts->skg_runtime = (ts->skg_runtime / 5) * 4;
+	ts->skg_slptime = (ts->skg_slptime / 5) * 4;
 }
 
 static void
@@ -1427,13 +1447,8 @@ sched_wakeup(struct thread *td)
 		int hzticks;
 
 		hzticks = (ticks - slptime) << SCHED_TICK_SHIFT;
-		if (hzticks >= SCHED_SLP_RUN_MAX) {
-			td->td_sched->skg_slptime = SCHED_SLP_RUN_MAX;
-			td->td_sched->skg_runtime = 1;
-		} else {
-			td->td_sched->skg_slptime += hzticks;
-			sched_interact_update(td);
-		}
+		td->td_sched->skg_slptime += hzticks;
+		sched_interact_update(td);
 		sched_pctcpu_update(td->td_sched);
 		sched_priority(td);
 	}
@@ -1695,7 +1710,7 @@ restart:
 	ts = tdq_choose(tdq);
 	if (ts) {
 #ifdef SMP
-		if (ts->ts_thread->td_priority <= PRI_MIN_IDLE)
+		if (ts->ts_thread->td_priority > PRI_MIN_IDLE)
 			if (tdq_idled(tdq) == 0)
 				goto restart;
 #endif
@@ -1767,7 +1782,7 @@ sched_add(struct thread *td, int flags)
 		 * for minimum latency.  Interrupt handlers may also have
 		 * to complete on the cpu that dispatched them.
 		 */
-		if (td->td_pinned == 0)
+		if (td->td_pinned == 0 && class == PRI_ITHD)
 			ts->ts_cpu = PCPU_GET(cpuid);
 	} else if (td->td_priority <= PRI_MAX_TIMESHARE)
 		ts->ts_runq = &tdq->tdq_timeshare;
@@ -1961,6 +1976,7 @@ SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &sched_rebalance, 0, "");
 
 /* ps compat */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
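
Note on the sched_interact_update() change: the program below is a standalone
userland reduction of the new history-limiting logic, for illustration only
and not part of the patch.  SLP_RUN_MAX, struct hist, and interact_update()
are stand-ins invented here; in the kernel the cap is SCHED_SLP_RUN_MAX and
the fields are skg_runtime/skg_slptime in struct td_sched.

#include <stdio.h>

#define	SLP_RUN_MAX	1000		/* stand-in for SCHED_SLP_RUN_MAX */

struct hist {
	int	runtime;		/* stand-in for skg_runtime */
	int	slptime;		/* stand-in for skg_slptime */
};

static void
interact_update(struct hist *h)
{
	int sum;

	sum = h->runtime + h->slptime;
	if (sum < SLP_RUN_MAX)
		return;
	/*
	 * History more than doubled, e.g. after the large sleep credit
	 * that sched_wakeup() now adds unconditionally: clamp the larger
	 * side to the cap and nearly zero the other.
	 */
	if (sum > SLP_RUN_MAX * 2) {
		if (h->runtime > h->slptime) {
			h->runtime = SLP_RUN_MAX;
			h->slptime = 1;
		} else {
			h->slptime = SLP_RUN_MAX;
			h->runtime = 1;
		}
		return;
	}
	/*
	 * Overshot by more than 1/5th: a single 4/5 decay could not bring
	 * the sum back under the cap, so halve both sides instead.
	 */
	if (sum > (SLP_RUN_MAX / 5) * 6) {
		h->runtime /= 2;
		h->slptime /= 2;
		return;
	}
	/* Mild overshoot: decay both sides to 4/5. */
	h->runtime = (h->runtime / 5) * 4;
	h->slptime = (h->slptime / 5) * 4;
}

int
main(void)
{
	/*
	 * A sleep burst far past the cap.  Before the patch only
	 * sched_wakeup() clamped this case, and other callers could hand
	 * sched_priority() a sum well above SCHED_SLP_RUN_MAX.
	 */
	struct hist h = { 40, 5000 };

	interact_update(&h);
	printf("runtime=%d slptime=%d\n", h.runtime, h.slptime);	/* 1, 1000 */
	return (0);
}

The new sysctl allows the long-term balancer to be toggled at runtime, e.g.
"sysctl kern.sched.balance=0" disables it while measuring.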