Rework CPU load balancing in SCHED_ULE:

 - In sched_pickcpu() be more careful about taking the previous CPU on SMT
systems. Do it only if all other logical CPUs of that physical core are
idle, to avoid extra resource sharing.
 - In sched_pickcpu() change the general CPU selection logic. First look
for an idle CPU sharing the last-level cache with the previously used one,
skipping SMT CPU groups. If none is found, search all CPUs for the least
loaded one where the thread can run now at its priority. If still none is
found, search just for the least loaded CPU.
 - Make cpu_search() compare the lowest/highest individual CPU load when
comparing CPU groups with equal total load. That allows it to differentiate
between 1+1 and 2+0 load distributions.
 - Make cpu_search() prefer the specified (previous) CPU or group when
loads are equal. This improves cache affinity on more complicated
topologies.
 - Randomize CPU selection when the above factors are equal. The previous
code tended to prefer CPUs with lower IDs, causing unneeded collisions
(see the sketch after this list).
 - Rework the periodic balancer in sched_balance_group(). With cpu_search()
now more intelligent, make the balancing process flat, removing the
recursion over the topology tree. That fixes the double-swap problem and
makes load distribution more even and predictable.
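
To make the tie-breaking concrete, here is a minimal user-space sketch (not
the kernel code itself) of the per-CPU scoring used by the reworked
cpu_search(): the raw run-queue load is scaled by 256, the preferred
(previous) CPU gets a small fixed bonus, and a 6-bit value from the same
69069 * x + 5 generator perturbs the score so ties no longer always go to
the lowest CPU ID. The score() helper, the cpu_load[] array and the prefer
argument are illustrative only and do not exist in the kernel source.

/*
 * Sketch of the CPU scoring in the reworked cpu_search().
 * Assumes 4 CPUs with made-up loads; the kernel keeps randomval
 * as a per-CPU DPCPU variable instead of a single static.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t randomval = 12345;

static int
score(int cpu, int load, int prefer)
{
	int s, rnd;

	s = load * 256;			/* scale so jitter cannot outweigh real load */
	if (cpu == prefer)
		s -= 64;		/* slight bias toward the previous CPU */
	randomval = randomval * 69069 + 5;	/* LCG constants from the patch */
	rnd = randomval >> 26;		/* keep only 6 bits: 0..63 */
	return (s - rnd);
}

int
main(void)
{
	int cpu_load[4] = { 2, 1, 1, 3 };	/* fake per-CPU loads */
	int cpu, best, bestscore, s;

	best = -1;
	bestscore = INT_MAX;
	for (cpu = 0; cpu < 4; cpu++) {
		s = score(cpu, cpu_load[cpu], 2 /* prefer previous CPU 2 */);
		if (s < bestscore) {
			bestscore = s;
			best = cpu;
		}
	}
	/* CPUs 1 and 2 tie on load; CPU 2 wins thanks to the -64 bias. */
	printf("picked CPU %d\n", best);
	return (0);
}

Because the jitter is at most 63 while one unit of load is worth 256, the
randomization can only reorder CPUs whose real loads are equal, and the
64-point preference bonus still wins any tie against the jitter.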

Altogether this gives a 10-15% performance improvement in many tests on
CPUs with SMT, such as Core i7, when the number of threads is less than
the number of logical CPUs. In some tests it also has a positive effect
on systems without SMT.

Reviewed by:	jeff
Tested by:	flo, hackers@
MFC after:	1 month
Sponsored by:	iXsystems, Inc.
Author:	Alexander Motin
Date:	2012-02-27 10:31:54 +0000
Commit:	36acfc6507
Parent:	648b0509e6

@ -258,7 +258,6 @@ struct cpu_group *cpu_top; /* CPU topology */
static int rebalance = 1;
static int balance_interval = 128; /* Default set in sched_initticks(). */
static int affinity;
static int steal_htt = 1;
static int steal_idle = 1;
static int steal_thresh = 2;
@ -268,6 +267,7 @@ static int steal_thresh = 2;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq *balance_tdq;
static int balance_ticks;
static DPCPU_DEFINE(uint32_t, randomval);
#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x) (&tdq_cpu[(x)])
@ -553,9 +553,11 @@ tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
#ifdef SMP
struct cpu_search {
cpuset_t cs_mask;
u_int cs_load;
u_int cs_cpu;
int cs_limit; /* Min priority for low min load for high. */
u_int cs_prefer;
int cs_pri; /* Min priority for low. */
int cs_limit; /* Max load for low, min load for high. */
int cs_cpu;
int cs_load;
};
#define CPU_SEARCH_LOWEST 0x1
@ -566,43 +568,13 @@ struct cpu_search {
for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \
if (CPU_ISSET(cpu, &mask))
static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high, const int match);
int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high);
/*
* This routine compares according to the match argument and should be
* reduced in actual instantiations via constant propagation and dead code
* elimination.
*/
static __inline int
cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
const int match)
{
struct tdq *tdq;
tdq = TDQ_CPU(cpu);
if (match & CPU_SEARCH_LOWEST)
if (CPU_ISSET(cpu, &low->cs_mask) &&
tdq->tdq_load < low->cs_load &&
tdq->tdq_lowpri > low->cs_limit) {
low->cs_cpu = cpu;
low->cs_load = tdq->tdq_load;
}
if (match & CPU_SEARCH_HIGHEST)
if (CPU_ISSET(cpu, &high->cs_mask) &&
tdq->tdq_load >= high->cs_limit &&
tdq->tdq_load > high->cs_load &&
tdq->tdq_transferable) {
high->cs_cpu = cpu;
high->cs_load = tdq->tdq_load;
}
return (tdq->tdq_load);
}
/*
* Search the tree of cpu_groups for the lowest or highest loaded cpu
* according to the match argument. This routine actually compares the
@ -615,33 +587,42 @@ cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
* also recursive to the depth of the tree.
*/
static __inline int
cpu_search(struct cpu_group *cg, struct cpu_search *low,
cpu_search(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high, const int match)
{
int total;
struct cpu_search lgroup;
struct cpu_search hgroup;
cpuset_t cpumask;
struct cpu_group *child;
struct tdq *tdq;
int cpu, i, hload, lload, load, total, rnd;
total = 0;
if (cg->cg_children) {
struct cpu_search lgroup;
struct cpu_search hgroup;
struct cpu_group *child;
u_int lload;
int hload;
int load;
int i;
lload = -1;
cpumask = cg->cg_mask;
if (match & CPU_SEARCH_LOWEST) {
lload = INT_MAX;
low->cs_load = INT_MAX;
lgroup = *low;
}
if (match & CPU_SEARCH_HIGHEST) {
hload = -1;
for (i = 0; i < cg->cg_children; i++) {
high->cs_load = -1;
hgroup = *high;
}
/* Iterate through the child CPU groups and then remaining CPUs. */
for (i = 0, cpu = 0; i <= cg->cg_children; ) {
if (i >= cg->cg_children) {
while (cpu <= mp_maxid && !CPU_ISSET(cpu, &cpumask))
cpu++;
if (cpu > mp_maxid)
break;
child = NULL;
} else
child = &cg->cg_child[i];
if (match & CPU_SEARCH_LOWEST) {
lgroup = *low;
lgroup.cs_load = -1;
}
if (match & CPU_SEARCH_HIGHEST) {
hgroup = *high;
lgroup.cs_load = 0;
}
if (child) { /* Handle child CPU group. */
CPU_NAND(&cpumask, &child->cg_mask);
switch (match) {
case CPU_SEARCH_LOWEST:
load = cpu_search_lowest(child, &lgroup);
@ -653,23 +634,52 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
load = cpu_search_both(child, &lgroup, &hgroup);
break;
}
total += load;
if (match & CPU_SEARCH_LOWEST)
if (load < lload || low->cs_cpu == -1) {
*low = lgroup;
lload = load;
} else { /* Handle child CPU. */
tdq = TDQ_CPU(cpu);
load = tdq->tdq_load * 256;
rnd = DPCPU_SET(randomval,
DPCPU_GET(randomval) * 69069 + 5) >> 26;
if (match & CPU_SEARCH_LOWEST) {
if (cpu == low->cs_prefer)
load -= 64;
/* If that CPU is allowed and get data. */
if (CPU_ISSET(cpu, &lgroup.cs_mask) &&
tdq->tdq_lowpri > lgroup.cs_pri &&
tdq->tdq_load <= lgroup.cs_limit) {
lgroup.cs_cpu = cpu;
lgroup.cs_load = load - rnd;
}
if (match & CPU_SEARCH_HIGHEST)
if (load > hload || high->cs_cpu == -1) {
hload = load;
*high = hgroup;
}
if (match & CPU_SEARCH_HIGHEST)
if (CPU_ISSET(cpu, &hgroup.cs_mask) &&
tdq->tdq_load >= hgroup.cs_limit &&
tdq->tdq_transferable) {
hgroup.cs_cpu = cpu;
hgroup.cs_load = load - rnd;
}
}
} else {
int cpu;
total += load;
CPUSET_FOREACH(cpu, cg->cg_mask)
total += cpu_compare(cpu, low, high, match);
/* We have info about child item. Compare it. */
if (match & CPU_SEARCH_LOWEST) {
if ((load < lload) ||
(load == lload && lgroup.cs_load < low->cs_load)) {
lload = load;
low->cs_cpu = lgroup.cs_cpu;
low->cs_load = lgroup.cs_load;
}
}
if (match & CPU_SEARCH_HIGHEST)
if ((load > hload) ||
(load == hload && hgroup.cs_load > high->cs_load)) {
hload = load;
high->cs_cpu = hgroup.cs_cpu;
high->cs_load = hgroup.cs_load;
}
if (child)
i++;
else
cpu++;
}
return (total);
}
@ -679,19 +689,19 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
* optimization.
*/
int
cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
{
return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
}
int
cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
{
return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
}
int
cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high)
{
return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
@ -703,14 +713,16 @@ cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
* acceptable.
*/
static inline int
sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
int prefer)
{
struct cpu_search low;
low.cs_cpu = -1;
low.cs_load = -1;
low.cs_prefer = prefer;
low.cs_mask = mask;
low.cs_limit = pri;
low.cs_pri = pri;
low.cs_limit = maxload;
cpu_search_lowest(cg, &low);
return low.cs_cpu;
}
@ -719,12 +731,11 @@ sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
* Find the cpu with the highest load via the highest loaded path.
*/
static inline int
sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
{
struct cpu_search high;
high.cs_cpu = -1;
high.cs_load = 0;
high.cs_mask = mask;
high.cs_limit = minload;
cpu_search_highest(cg, &high);
@ -735,17 +746,17 @@ sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
* Simultaneously find the highest and lowest loaded cpu reachable via
* cg.
*/
static inline void
sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
static inline void
sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
{
struct cpu_search high;
struct cpu_search low;
low.cs_cpu = -1;
low.cs_limit = -1;
low.cs_load = -1;
low.cs_prefer = -1;
low.cs_pri = -1;
low.cs_limit = INT_MAX;
low.cs_mask = mask;
high.cs_load = 0;
high.cs_cpu = -1;
high.cs_limit = -1;
high.cs_mask = mask;
@ -758,30 +769,45 @@ sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
static void
sched_balance_group(struct cpu_group *cg)
{
cpuset_t mask;
int high;
int low;
int i;
cpuset_t hmask, lmask;
int high, low, anylow;
CPU_FILL(&mask);
CPU_FILL(&hmask);
for (;;) {
sched_both(cg, mask, &low, &high);
if (low == high || low == -1 || high == -1)
high = sched_highest(cg, hmask, 1);
/* Stop if there is no more CPU with transferrable threads. */
if (high == -1)
break;
if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
CPU_CLR(high, &hmask);
CPU_COPY(&hmask, &lmask);
/* Stop if there is no more CPU left for low. */
if (CPU_EMPTY(&lmask))
break;
/*
* If we failed to move any threads determine which cpu
* to kick out of the set and try again.
*/
if (TDQ_CPU(high)->tdq_transferable == 0)
CPU_CLR(high, &mask);
else
CPU_CLR(low, &mask);
anylow = 1;
nextlow:
low = sched_lowest(cg, lmask, -1,
TDQ_CPU(high)->tdq_load - 1, high);
/* Stop if we looked well and found no less loaded CPU. */
if (anylow && low == -1)
break;
/* Go to next high if we found no less loaded CPU. */
if (low == -1)
continue;
/* Transfer thread from high to low. */
if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
/* CPU that got thread can no longer be a donor. */
CPU_CLR(low, &hmask);
} else {
/*
* If failed, then there is no threads on high
* that can run on this low. Drop low from low
* mask and look for different one.
*/
CPU_CLR(low, &lmask);
anylow = 0;
goto nextlow;
}
}
for (i = 0; i < cg->cg_children; i++)
sched_balance_group(&cg->cg_child[i]);
}
static void
@ -834,32 +860,17 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two)
static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
int transferable;
int high_load;
int low_load;
int moved;
int move;
int cpu;
int diff;
int i;
tdq_lock_pair(high, low);
transferable = high->tdq_transferable;
high_load = high->tdq_load;
low_load = low->tdq_load;
moved = 0;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
*/
if (transferable != 0) {
diff = high_load - low_load;
move = diff / 2;
if (diff & 0x1)
move++;
move = min(move, transferable);
for (i = 0; i < move; i++)
moved += tdq_move(high, low);
if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
(moved = tdq_move(high, low)) > 0) {
/*
* In case the target isn't the current cpu IPI it to force a
* reschedule with the new workload.
@ -1003,8 +1014,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
{
struct rqbits *rqb;
struct rqhead *rqh;
struct thread *td;
int first;
struct thread *td, *first;
int bit;
int pri;
int i;
@ -1012,7 +1022,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
rqb = &rq->rq_status;
bit = start & (RQB_BPW -1);
pri = 0;
first = 0;
first = NULL;
again:
for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
if (rqb->rqb_bits[i] == 0)
@ -1031,7 +1041,7 @@ again:
if (first && THREAD_CAN_MIGRATE(td) &&
THREAD_CAN_SCHED(td, cpu))
return (td);
first = 1;
first = td;
}
}
if (start != 0) {
@ -1039,6 +1049,9 @@ again:
goto again;
}
if (first && THREAD_CAN_MIGRATE(first) &&
THREAD_CAN_SCHED(first, cpu))
return (first);
return (NULL);
}
@ -1140,13 +1153,11 @@ SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
static int
sched_pickcpu(struct thread *td, int flags)
{
struct cpu_group *cg;
struct cpu_group *cg, *ccg;
struct td_sched *ts;
struct tdq *tdq;
cpuset_t mask;
int self;
int pri;
int cpu;
int cpu, pri, self;
self = PCPU_GET(cpuid);
ts = td->td_sched;
@ -1161,45 +1172,70 @@ sched_pickcpu(struct thread *td, int flags)
* Prefer to run interrupt threads on the processors that generate
* the interrupt.
*/
pri = td->td_priority;
if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
curthread->td_intr_nesting_level && ts->ts_cpu != self) {
SCHED_STAT_INC(pickcpu_intrbind);
ts->ts_cpu = self;
}
/*
* If the thread can run on the last cpu and the affinity has not
* expired or it is idle run it there.
*/
pri = td->td_priority;
tdq = TDQ_CPU(ts->ts_cpu);
if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
SCHED_STAT_INC(pickcpu_idle_affinity);
return (ts->ts_cpu);
}
if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
if (TDQ_CPU(self)->tdq_lowpri > pri) {
SCHED_STAT_INC(pickcpu_affinity);
return (ts->ts_cpu);
}
}
/*
* Search for the highest level in the tree that still has affinity.
* If the thread can run on the last cpu and the affinity has not
* expired or it is idle run it there.
*/
cg = NULL;
for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
if (SCHED_AFFINITY(ts, cg->cg_level))
break;
tdq = TDQ_CPU(ts->ts_cpu);
cg = tdq->tdq_cg;
if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
tdq->tdq_lowpri >= PRI_MIN_IDLE &&
SCHED_AFFINITY(ts, CG_SHARE_L2)) {
if (cg->cg_flags & CG_FLAG_THREAD) {
CPUSET_FOREACH(cpu, cg->cg_mask) {
if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
break;
}
} else
cpu = INT_MAX;
if (cpu > mp_maxid) {
SCHED_STAT_INC(pickcpu_idle_affinity);
return (ts->ts_cpu);
}
}
/*
* Search for the last level cache CPU group in the tree.
* Skip caches with expired affinity time and SMT groups.
* Affinity to higher level caches will be handled less aggressively.
*/
for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
if (cg->cg_flags & CG_FLAG_THREAD)
continue;
if (!SCHED_AFFINITY(ts, cg->cg_level))
continue;
ccg = cg;
}
if (ccg != NULL)
cg = ccg;
cpu = -1;
/* Search the group for the less loaded idle CPU we can run now. */
mask = td->td_cpuset->cs_mask;
if (cg)
cpu = sched_lowest(cg, mask, pri);
if (cg != NULL && cg != cpu_top &&
CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
INT_MAX, ts->ts_cpu);
/* Search globally for the less loaded CPU we can run now. */
if (cpu == -1)
cpu = sched_lowest(cpu_top, mask, -1);
cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
/* Search globally for the less loaded CPU. */
if (cpu == -1)
cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
/*
* Compare the lowest loaded cpu to current cpu.
*/
if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
SCHED_STAT_INC(pickcpu_local);
cpu = self;
} else
@ -2750,8 +2786,6 @@ SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
&balance_interval, 0,
"Average frequency in stathz ticks to run the long-term balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
"Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
"Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,