 - Refine the load balancer to improve buildkernel times on dual core
   machines.
 - Leave the long-term load balancer running by default once per second.
 - Enable stealing load from the idle thread only when the remote processor
   has more than two transferable tasks.  Setting this to one further
   improves buildworld.  Setting it higher improves mysql.  (A simplified
   sketch of this gating follows the log message below.)
 - Remove the bogus pick_zero option.  I had not intended to commit this.
 - Entirely disallow migration for threads with SRQ_YIELDING set.  This
   balances out the extra migration allowed for with the load balancers.
   It also makes pick_pri perform better as I had anticipated.

Tested by:	Dmitry Morozovsky <marck@rinet.ru>
Approved by:	re
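
The idle-steal change above boils down to: when a CPU goes idle, find the busiest remote CPU and pull work only if it has at least steal_thresh transferable threads. What follows is a minimal, hypothetical userland sketch of that decision only; the function name pick_steal_target and the bare array of counts are invented for illustration, while the real code (tdq_idled() in the diff below) checks both tdq_load and tdq_transferable and runs under the per-tdq locks.

#include <stdio.h>

/* Mirrors the kern.sched.steal_thresh default introduced in this commit. */
static int steal_thresh = 2;

/*
 * Given per-CPU counts of transferable threads, return the most loaded
 * remote CPU, but only if it has at least steal_thresh transferable
 * threads; otherwise return -1 and let the idle CPU halt.
 */
static int
pick_steal_target(const int *transferable, int ncpus, int self)
{
	int highload = 0;
	int highcpu = -1;
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++) {
		if (cpu == self)
			continue;
		if (transferable[cpu] > highload) {
			highload = transferable[cpu];
			highcpu = cpu;
		}
	}
	if (highload < steal_thresh)
		return (-1);
	return (highcpu);
}

int
main(void)
{
	int loads[4] = { 0, 1, 3, 0 };	/* CPU 2 has three transferable threads. */

	/* CPU 0 is idle; with steal_thresh == 2 it steals from CPU 2. */
	printf("steal from CPU %d\n", pick_steal_target(loads, 4, 0));
	return (0);
}

With steal_thresh lowered to one, an idle CPU would also grab a lone transferable thread, which is the setting the log notes helps buildworld; leaving it at two favors mysql-style loads.
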
Commit:	e2ebe96ef4 (parent 550dacee12)
Author:	jeff
Date:	2007-07-19 20:03:15 +00:00

@@ -177,8 +177,6 @@ static int tickincr;
static int sched_slice;
static int preempt_thresh = PRI_MIN_KERN;
-#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */
/*
* tdq - per processor runqs and statistics. All fields are protected by the
* tdq_lock. The load and lowpri may be accessed without to avoid excess
@@ -229,14 +227,14 @@ struct tdq_group {
/*
* Run-time tunables.
*/
-static int rebalance = 0;
-static int pick_pri = 0;
-static int pick_zero = 0;
+static int rebalance = 1;
+static int balance_secs = 1;
+static int pick_pri = 1;
static int affinity;
static int tryself = 1;
-static int tryselfidle = 1;
static int steal_htt = 0;
-static int steal_idle = 0;
+static int steal_idle = 1;
+static int steal_thresh = 2;
static int topology = 0;
/*
@@ -514,7 +512,7 @@ sched_balance(void *arg)
int cnt;
int i;
-callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+callout_reset(&balco, max(hz / 2, random() % (hz * balance_secs)),
sched_balance, NULL);
if (smp_started == 0 || rebalance == 0)
return;
@@ -547,7 +545,7 @@ sched_balance_groups(void *arg)
{
int i;
-callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+callout_reset(&gbalco, max(hz / 2, random() % (hz * balance_secs)),
sched_balance_groups, NULL);
if (smp_started == 0 || rebalance == 0)
return;
@@ -735,11 +733,11 @@ tdq_idled(struct tdq *tdq)
highload = load;
highcpu = cpu;
}
-if (highload < 2)
+if (highload < steal_thresh)
break;
steal = TDQ_CPU(highcpu);
TDQ_LOCK(steal);
-if (steal->tdq_transferable > 1 &&
+if (steal->tdq_transferable >= steal_thresh &&
(ts = tdq_steal(steal, 1)) != NULL)
goto steal;
TDQ_UNLOCK(steal);
@@ -864,11 +862,9 @@ runq_steal(struct runq *rq)
struct rqhead *rqh;
struct rqbits *rqb;
struct td_sched *ts;
-int first;
int word;
int bit;
-first = 0;
rqb = &rq->rq_status;
for (word = 0; word < RQB_LEN; word++) {
if (rqb->rqb_bits[word] == 0)
@@ -877,11 +873,9 @@ runq_steal(struct runq *rq)
if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
continue;
rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
-TAILQ_FOREACH(ts, rqh, ts_procq) {
-if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
+TAILQ_FOREACH(ts, rqh, ts_procq)
+if (THREAD_CAN_MIGRATE(ts->ts_thread))
return (ts);
-first = 1;
-}
}
}
return (NULL);
@@ -1037,6 +1031,14 @@ sched_pickcpu(struct td_sched *ts, int flags)
cpu = self = PCPU_GET(cpuid);
if (smp_started == 0)
return (self);
+/*
+* Don't migrate a running thread from sched_switch().
+*/
+if (flags & SRQ_OURSELF) {
+CTR1(KTR_ULE, "YIELDING %d",
+curthread->td_priority);
+return (self);
+}
pri = ts->ts_thread->td_priority;
cpu = ts->ts_cpu;
/*
@@ -1060,23 +1062,6 @@ sched_pickcpu(struct td_sched *ts, int flags)
tdq->tdq_lowpri);
return (ts->ts_cpu);
}
-/*
-* Try ourself first; If we're running something lower priority this
-* may have some locality with the waking thread and execute faster
-* here.
-*/
-if (tryself) {
-/*
-* If we're being awoken by an interrupt thread or the waker
-* is going right to sleep run here as well.
-*/
-if ((TDQ_SELF()->tdq_load <= 1) && (flags & (SRQ_YIELDING) ||
-curthread->td_pri_class == PRI_ITHD)) {
-CTR2(KTR_ULE, "tryself load %d flags %d",
-TDQ_SELF()->tdq_load, flags);
-return (self);
-}
-}
/*
* Look for an idle group.
*/
@@ -1084,18 +1069,15 @@ sched_pickcpu(struct td_sched *ts, int flags)
cpu = ffs(tdq_idle);
if (cpu)
return (--cpu);
-if (tryselfidle && pri < curthread->td_priority) {
-CTR1(KTR_ULE, "tryselfidle %d",
+/*
+* If there are no idle cores see if we can run the thread locally. This may
+* improve locality among sleepers and wakers when there is shared data.
+*/
+if (tryself && pri < curthread->td_priority) {
+CTR1(KTR_ULE, "tryself %d",
curthread->td_priority);
return (self);
}
-/*
-* XXX Under heavy load mysql performs way better if you
-* serialize the non-running threads on one cpu. This is
-* a horrible hack.
-*/
-if (pick_zero)
-return (0);
/*
* Now search for the cpu running the lowest priority thread with
* the least load.
@@ -2546,19 +2528,19 @@ SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
#ifdef SMP
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
"Pick the target cpu based on priority rather than load.");
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_zero, CTLFLAG_RW, &pick_zero, 0,
-"If there are no idle cpus pick cpu0");
SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
"Number of hz ticks to keep thread affinity for");
SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, tryselfidle, CTLFLAG_RW,
-&tryselfidle, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
"Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_secs, CTLFLAG_RW, &balance_secs, 0,
+"Average frequence in seconds to run the long-term balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
"Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
"Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+"Minimum load on remote cpu before we'll steal");
SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
"True when a topology has been specified by the MD code.");
#endif
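
Because the new knobs are exported as writable integers under kern.sched, they can be inspected or tuned at run time without a rebuild. Below is a small sketch using sysctlbyname(3); it assumes a FreeBSD machine running a SCHED_ULE kernel built from this revision, since the OIDs are not registered otherwise.

/*
 * Read the scheduler tunables added or retuned in this commit via
 * sysctlbyname(3).  Build with "cc -o uletun uletun.c" on FreeBSD.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
show(const char *name)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname(name, &val, &len, NULL, 0) == -1)
		perror(name);		/* OID missing on non-ULE kernels. */
	else
		printf("%s = %d\n", name, val);
}

int
main(void)
{
	show("kern.sched.balance");
	show("kern.sched.balance_secs");
	show("kern.sched.steal_idle");
	show("kern.sched.steal_thresh");
	return (0);
}

The shell equivalent is simply sysctl kern.sched.steal_thresh (or sysctl kern.sched.steal_thresh=1 to change it), since these entries are created with CTLFLAG_RW.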