 - Refine the load balancer to improve buildkernel times on dual core
   machines.
 - Leave the long-term load balancer running by default once per second.
 - Enable stealing load from the idle thread only when the remote processor
   has more than two transferable tasks.  Setting this to one further
   improves buildworld.  Setting it higher improves mysql.  (A simplified
   sketch of this gating follows the log message below.)
 - Remove the bogus pick_zero option.  I had not intended to commit this.
 - Entirely disallow migration for threads with SRQ_YIELDING set.  This
   balances out the extra migration allowed for with the load balancers.
   It also makes pick_pri perform better as I had anticipated.

Tested by:	Dmitry Morozovsky <marck@rinet.ru>
Approved by:	re
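
The idle-steal change above boils down to: when a CPU goes idle, find the busiest remote CPU and pull work only if it has at least steal_thresh transferable threads. What follows is a minimal, hypothetical userland sketch of that decision only; the function name pick_steal_target and the bare array of counts are invented for illustration, while the real code (tdq_idled() in the diff below) checks both tdq_load and tdq_transferable and runs under the per-tdq locks.

#include <stdio.h>

/* Mirrors the kern.sched.steal_thresh default introduced in this commit. */
static int steal_thresh = 2;

/*
 * Given per-CPU counts of transferable threads, return the most loaded
 * remote CPU, but only if it has at least steal_thresh transferable
 * threads; otherwise return -1 and let the idle CPU halt.
 */
static int
pick_steal_target(const int *transferable, int ncpus, int self)
{
	int highload = 0;
	int highcpu = -1;
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++) {
		if (cpu == self)
			continue;
		if (transferable[cpu] > highload) {
			highload = transferable[cpu];
			highcpu = cpu;
		}
	}
	if (highload < steal_thresh)
		return (-1);
	return (highcpu);
}

int
main(void)
{
	int loads[4] = { 0, 1, 3, 0 };	/* CPU 2 has three transferable threads. */

	/* CPU 0 is idle; with steal_thresh == 2 it steals from CPU 2. */
	printf("steal from CPU %d\n", pick_steal_target(loads, 4, 0));
	return (0);
}

With steal_thresh lowered to one, an idle CPU would also grab a lone transferable thread, which is the setting the log notes helps buildworld; leaving it at two favors mysql-style loads.
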
Commit:	e2ebe96ef4 (parent 550dacee12)
Author:	jeff
Date:	2007-07-19 20:03:15 +00:00

@@ -177,8 +177,6 @@ static int tickincr;
static int sched_slice;
static int preempt_thresh = PRI_MIN_KERN;
-#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */
/*
* tdq - per processor runqs and statistics. All fields are protected by the
* tdq_lock. The load and lowpri may be accessed without to avoid excess
@@ -229,14 +227,14 @@ struct tdq_group {
/*
* Run-time tunables.
*/
-static int rebalance = 0;
-static int pick_pri = 0;
-static int pick_zero = 0;
+static int rebalance = 1;
+static int balance_secs = 1;
+static int pick_pri = 1;
static int affinity;
static int tryself = 1;
-static int tryselfidle = 1;
static int steal_htt = 0;
-static int steal_idle = 0;
+static int steal_idle = 1;
+static int steal_thresh = 2;
static int topology = 0;
/*
@@ -514,7 +512,7 @@ sched_balance(void *arg)
int cnt;
int i;
-callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+callout_reset(&balco, max(hz / 2, random() % (hz * balance_secs)),
sched_balance, NULL);
if (smp_started == 0 || rebalance == 0)
return;
@@ -547,7 +545,7 @@ sched_balance_groups(void *arg)
{
int i;
-callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+callout_reset(&gbalco, max(hz / 2, random() % (hz * balance_secs)),
sched_balance_groups, NULL);
if (smp_started == 0 || rebalance == 0)
return;
@@ -735,11 +733,11 @@ tdq_idled(struct tdq *tdq)
highload = load;
highcpu = cpu;
}
-if (highload < 2)
+if (highload < steal_thresh)
break;
steal = TDQ_CPU(highcpu);
TDQ_LOCK(steal);
-if (steal->tdq_transferable > 1 &&
+if (steal->tdq_transferable >= steal_thresh &&
(ts = tdq_steal(steal, 1)) != NULL)
goto steal;
TDQ_UNLOCK(steal);
@@ -864,11 +862,9 @@ runq_steal(struct runq *rq)
struct rqhead *rqh;
struct rqbits *rqb;
struct td_sched *ts;
-int first;
int word;
int bit;
-first = 0;
rqb = &rq->rq_status;
for (word = 0; word < RQB_LEN; word++) {
if (rqb->rqb_bits[word] == 0)
@@ -877,11 +873,9 @@ runq_steal(struct runq *rq)
if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
continue;
rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
-TAILQ_FOREACH(ts, rqh, ts_procq) {
-if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
+TAILQ_FOREACH(ts, rqh, ts_procq)
+if (THREAD_CAN_MIGRATE(ts->ts_thread))
return (ts);
-first = 1;
-}
}
}
return (NULL);
@@ -1037,6 +1031,14 @@ sched_pickcpu(struct td_sched *ts, int flags)
cpu = self = PCPU_GET(cpuid);
if (smp_started == 0)
return (self);
+/*
+* Don't migrate a running thread from sched_switch().
+*/
+if (flags & SRQ_OURSELF) {
+CTR1(KTR_ULE, "YIELDING %d",
+curthread->td_priority);
+return (self);
+}
pri = ts->ts_thread->td_priority;
cpu = ts->ts_cpu;
/*
@@ -1060,23 +1062,6 @@ sched_pickcpu(struct td_sched *ts, int flags)
tdq->tdq_lowpri);
return (ts->ts_cpu);
}
-/*
-* Try ourself first; If we're running something lower priority this
-* may have some locality with the waking thread and execute faster
-* here.
-*/
-if (tryself) {
-/*
-* If we're being awoken by an interrupt thread or the waker
-* is going right to sleep run here as well.
-*/
-if ((TDQ_SELF()->tdq_load <= 1) && (flags & (SRQ_YIELDING) ||
-curthread->td_pri_class == PRI_ITHD)) {
-CTR2(KTR_ULE, "tryself load %d flags %d",
-TDQ_SELF()->tdq_load, flags);
-return (self);
-}
-}
/*
* Look for an idle group.
*/
@@ -1084,18 +1069,15 @@ sched_pickcpu(struct td_sched *ts, int flags)
cpu = ffs(tdq_idle);
if (cpu)
return (--cpu);
-if (tryselfidle && pri < curthread->td_priority) {
-CTR1(KTR_ULE, "tryselfidle %d",
+/*
+* If there are no idle cores see if we can run the thread locally. This may
+* improve locality among sleepers and wakers when there is shared data.
+*/
+if (tryself && pri < curthread->td_priority) {
+CTR1(KTR_ULE, "tryself %d",
curthread->td_priority);
return (self);
}
-/*
-* XXX Under heavy load mysql performs way better if you
-* serialize the non-running threads on one cpu. This is
-* a horrible hack.
-*/
-if (pick_zero)
-return (0);
/*
* Now search for the cpu running the lowest priority thread with
* the least load.
@@ -2546,19 +2528,19 @@ SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
#ifdef SMP
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
"Pick the target cpu based on priority rather than load.");
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_zero, CTLFLAG_RW, &pick_zero, 0,
-"If there are no idle cpus pick cpu0");
SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
"Number of hz ticks to keep thread affinity for");
SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, tryselfidle, CTLFLAG_RW,
-&tryselfidle, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
"Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_secs, CTLFLAG_RW, &balance_secs, 0,
+"Average frequence in seconds to run the long-term balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
"Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
"Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+"Minimum load on remote cpu before we'll steal");
SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
"True when a topology has been specified by the MD code.");
#endif
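
Because the new knobs are exported as writable integers under kern.sched, they can be inspected or tuned at run time without a rebuild. Below is a small sketch using sysctlbyname(3); it assumes a FreeBSD machine running a SCHED_ULE kernel built from this revision, since the OIDs are not registered otherwise.

/*
 * Read the scheduler tunables added or retuned in this commit via
 * sysctlbyname(3).  Build with "cc -o uletun uletun.c" on FreeBSD.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
show(const char *name)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname(name, &val, &len, NULL, 0) == -1)
		perror(name);		/* OID missing on non-ULE kernels. */
	else
		printf("%s = %d\n", name, val);
}

int
main(void)
{
	show("kern.sched.balance");
	show("kern.sched.balance_secs");
	show("kern.sched.steal_idle");
	show("kern.sched.steal_thresh");
	return (0);
}

The shell equivalent is simply sysctl kern.sched.steal_thresh (or sysctl kern.sched.steal_thresh=1 to change it), since these entries are created with CTLFLAG_RW.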