Major revamp of ULE's cpu load balancing:

 - Switch back to direct modification of remote CPU run queues.  The
   indirect assignment mechanism this replaces added a lot of complexity
   for questionable gain.  It's easy enough to reimplement if it's shown
   to help on huge machines.
 - Re-implement the old tdq_transfer() call as tdq_pickidle().  Change
   sched_add() so we have selectable cpu choosers and simplify the logic
   a bit here.
 - Implement tdq_pickpri() as the new default cpu chooser.  This algorithm
   is similar to Solaris's in that it tries to always run the threads with
   the best priorities.  It is actually slightly more complex than
   Solaris's algorithm because we also tend to favor the local cpu over
   other cpus, which gives up a little latency but also potentially enables
   cache sharing between the waking thread and the woken thread.  A
   simplified sketch of the chooser follows this list.
 - Add a bunch of tunables that can be used to measure the effects of
   different load balancing strategies.  Most of these will go away once
   the algorithm is more definite.
 - Add a new mechanism to steal threads from busy cpus when we idle.  This
   is enabled with kern.sched.steal_busy and kern.sched.busy_thresh.  The
   threshold is the required length of a tdq's run queue before another
   cpu will be able to steal runnable threads.  This prevents most queue
   imbalances that contribute to long latencies.  A sketch of the busy
   threshold bookkeeping also follows this list.
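
Below is a simplified, stand-alone sketch of the tdq_pickpri() policy
described above.  It is not the kernel code: NCPU, IDLE_PRI, the per-cpu
arrays and the pick_pri() name are invented for illustration, and the
tryself/tryselfidle, idle-group and locking details of the real
implementation are left out.

/*
 * Model of the priority-based cpu chooser.  Lower numeric priority means
 * a more important thread, as in FreeBSD.
 */
#include <stdio.h>

#define	NCPU		4
#define	IDLE_PRI	224	/* stand-in for the kernel's PRI_MIN_IDLE */

static int cur_pri[NCPU];	/* priority of the thread running on each cpu */
static int cur_load[NCPU];	/* run queue length of each cpu */

static int
pick_pri(int self, int lastcpu, int affinity, int mypri)
{
	int cpu, lowcpu, lowpri, lowload;

	/* If the last cpu is effectively idle, just send the thread back. */
	if (cur_pri[lastcpu] > IDLE_PRI)
		return (lastcpu);
	/* With cache affinity, stay put if we would preempt what runs there. */
	if (affinity && cur_pri[lastcpu] > mypri)
		return (lastcpu);
	/*
	 * Otherwise pick the cpu running the least important thread,
	 * breaking ties in favor of the shorter run queue.
	 */
	lowcpu = self;
	lowpri = -1;
	lowload = 0;
	for (cpu = 0; cpu < NCPU; cpu++) {
		if (cur_pri[cpu] < lowpri)
			continue;
		if (cur_pri[cpu] == lowpri && cur_load[cpu] > lowload)
			continue;
		lowcpu = cpu;
		lowpri = cur_pri[cpu];
		lowload = cur_load[cpu];
	}
	return (lowcpu);
}

int
main(void)
{
	cur_pri[0] = 100; cur_pri[1] = 140; cur_pri[2] = 140; cur_pri[3] = 120;
	cur_load[0] = 2;  cur_load[1] = 3;  cur_load[2] = 1;  cur_load[3] = 2;
	/* A pri-120 thread waking on cpu 0 that last ran on cpu 3: cpu 2 wins. */
	printf("chose cpu %d\n", pick_pri(0, 3, 0, 120));
	return (0);
}

In the real tdq_pickpri() the same loop walks pcpu_find(cpu)->pc_curthread
and tdq_load, and it is preceded by the tryself and idle-group checks that
the new tunables control.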
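
The busy-steal threshold from the last item can be pictured the same way.
Again this is only a sketch: busy_mask and transferable[] stand in for the
new tdq_busy and tdq_transferable state, which the kernel maintains from
tdq_runq_add()/tdq_runq_rem() using atomic_set_int()/atomic_clear_int().

/*
 * Model of kern.sched.steal_busy / kern.sched.busy_thresh: a cpu
 * advertises itself in a global bitmask once its run queue holds
 * busy_thresh transferable threads, and an idling cpu scans that mask
 * for a victim before going to sleep.
 */
#include <strings.h>			/* ffs() */

#define	NCPU	4

static unsigned int busy_mask;		/* bit n set: cpu n is above threshold */
static int transferable[NCPU];		/* stealable threads queued on each cpu */
static int busy_thresh = 4;		/* default for kern.sched.busy_thresh */

static void
queue_length_changed(int cpu)
{
	/* Keep the busy bit in sync as transferable threads come and go. */
	if (transferable[cpu] >= busy_thresh)
		busy_mask |= 1u << cpu;
	else
		busy_mask &= ~(1u << cpu);
}

static int
pick_steal_victim(int self)
{
	int cpu;

	/* First cpu whose queue crossed the threshold, other than ourselves. */
	cpu = ffs(busy_mask & ~(1u << self));
	return (cpu != 0 ? cpu - 1 : -1);	/* -1: nothing worth stealing */
}

Once a queue grows past the threshold, idling cpus pull work from it, which
is what prevents the queue imbalances that lead to long latencies.
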
Jeff Roberson 2007-01-19 21:56:08 +00:00
parent c3f5198b21
commit 7b8bfa0de9

@@ -80,17 +80,17 @@ struct td_sched {
int ts_ltick; /* Last tick that we were running on */ int ts_ltick; /* Last tick that we were running on */
int ts_ftick; /* First tick that we were running on */ int ts_ftick; /* First tick that we were running on */
int ts_ticks; /* Tick count */ int ts_ticks; /* Tick count */
#ifdef SMP
int ts_rltick; /* Real last tick, for affinity. */
#endif
/* originally from kg_sched */ /* originally from kg_sched */
int skg_slptime; /* Number of ticks we vol. slept */ int skg_slptime; /* Number of ticks we vol. slept */
int skg_runtime; /* Number of ticks we were running */ int skg_runtime; /* Number of ticks we were running */
}; };
#define ts_assign ts_procq.tqe_next
/* flags kept in ts_flags */ /* flags kept in ts_flags */
#define TSF_ASSIGNED 0x0001 /* Thread is being migrated. */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */
#define TSF_BOUND 0x0002 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
#define TSF_XFERABLE 0x0004 /* Thread was added as transferable. */
#define TSF_REMOVED 0x0008 /* Thread was removed while ASSIGNED */
#define TSF_DIDRUN 0x2000 /* Thread actually ran. */ #define TSF_DIDRUN 0x2000 /* Thread actually ran. */
static struct td_sched td_sched0; static struct td_sched td_sched0;
@@ -163,7 +163,6 @@ static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz; static int realstathz;
static int tickincr; static int tickincr;
static int sched_slice; static int sched_slice;
static int sched_rebalance = 1;
/* /*
* tdq - per processor runqs and statistics. * tdq - per processor runqs and statistics.
@@ -175,16 +174,18 @@ struct tdq {
int tdq_idx; /* Current insert index. */ int tdq_idx; /* Current insert index. */
int tdq_ridx; /* Current removal index. */ int tdq_ridx; /* Current removal index. */
int tdq_load; /* Aggregate load. */ int tdq_load; /* Aggregate load. */
int tdq_flags; /* Thread queue flags */
#ifdef SMP #ifdef SMP
int tdq_transferable; int tdq_transferable;
LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
struct tdq_group *tdq_group; /* Our processor group. */ struct tdq_group *tdq_group; /* Our processor group. */
volatile struct td_sched *tdq_assigned; /* assigned by another CPU. */
#else #else
int tdq_sysload; /* For loadavg, !ITHD load. */ int tdq_sysload; /* For loadavg, !ITHD load. */
#endif #endif
}; };
#define TDQF_BUSY 0x0001 /* Queue is marked as busy */
#ifdef SMP #ifdef SMP
/* /*
* tdq groups are groups of processors which can cheaply share threads. When * tdq groups are groups of processors which can cheaply share threads. When
@@ -203,13 +204,30 @@ struct tdq_group {
int tdg_transferable; /* Transferable load of this group. */ int tdg_transferable; /* Transferable load of this group. */
LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */
}; };
#endif
#define SCHED_AFFINITY_DEFAULT (hz / 100)
#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)
/*
* Run-time tunables.
*/
static int rebalance = 1;
static int pick_pri = 1;
static int affinity;
static int tryself = 1;
static int tryselfidle = 1;
static int ipi_ast = 0;
static int ipi_preempt = 1;
static int ipi_thresh = PRI_MIN_KERN;
static int steal_htt = 1;
static int steal_busy = 1;
static int busy_thresh = 4;
/* /*
* One thread queue per processor. * One thread queue per processor.
*/ */
#ifdef SMP static volatile cpumask_t tdq_idle;
static cpumask_t tdq_idle; static volatile cpumask_t tdq_busy;
static int tdg_maxid; static int tdg_maxid;
static struct tdq tdq_cpu[MAXCPU]; static struct tdq tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU]; static struct tdq_group tdq_groups[MAXCPU];
@@ -248,21 +266,20 @@ static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu); void tdq_print(int cpu);
static void runq_print(struct runq *rq); static void runq_print(struct runq *rq);
#ifdef SMP #ifdef SMP
static int tdq_transfer(struct tdq *, struct td_sched *, int); static int tdq_pickidle(struct tdq *, struct td_sched *);
static int tdq_pickpri(struct tdq *, struct td_sched *, int);
static struct td_sched *runq_steal(struct runq *); static struct td_sched *runq_steal(struct runq *);
static void sched_balance(void); static void sched_balance(void);
static void sched_balance_groups(void); static void sched_balance_groups(void);
static void sched_balance_group(struct tdq_group *); static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *); static void sched_balance_pair(struct tdq *, struct tdq *);
static void sched_smp_tick(void); static void sched_smp_tick(struct thread *);
static void tdq_move(struct tdq *, int); static void tdq_move(struct tdq *, int);
static int tdq_idled(struct tdq *); static int tdq_idled(struct tdq *);
static void tdq_notify(struct td_sched *, int); static void tdq_notify(struct td_sched *);
static void tdq_assign(struct tdq *);
static struct td_sched *tdq_steal(struct tdq *, int); static struct td_sched *tdq_steal(struct tdq *, int);
#define THREAD_CAN_MIGRATE(td) \ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD)
#endif #endif
static void sched_setup(void *dummy); static void sched_setup(void *dummy);
@@ -337,6 +354,11 @@ tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
tdq->tdq_transferable++; tdq->tdq_transferable++;
tdq->tdq_group->tdg_transferable++; tdq->tdq_group->tdg_transferable++;
ts->ts_flags |= TSF_XFERABLE; ts->ts_flags |= TSF_XFERABLE;
if (tdq->tdq_transferable >= busy_thresh &&
(tdq->tdq_flags & TDQF_BUSY) == 0) {
tdq->tdq_flags |= TDQF_BUSY;
atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
}
} }
#endif #endif
if (ts->ts_runq == &tdq->tdq_timeshare) { if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -376,6 +398,11 @@ tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
tdq->tdq_transferable--; tdq->tdq_transferable--;
tdq->tdq_group->tdg_transferable--; tdq->tdq_group->tdg_transferable--;
ts->ts_flags &= ~TSF_XFERABLE; ts->ts_flags &= ~TSF_XFERABLE;
if (tdq->tdq_transferable < busy_thresh &&
(tdq->tdq_flags & TDQF_BUSY)) {
atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
tdq->tdq_flags &= ~TDQF_BUSY;
}
} }
#endif #endif
if (ts->ts_runq == &tdq->tdq_timeshare) { if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -402,7 +429,8 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts)
class = PRI_BASE(ts->ts_thread->td_pri_class); class = PRI_BASE(ts->ts_thread->td_pri_class);
tdq->tdq_load++; tdq->tdq_load++;
CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) if (class != PRI_ITHD &&
(ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP #ifdef SMP
tdq->tdq_group->tdg_load++; tdq->tdq_group->tdg_load++;
#else #else
@@ -416,7 +444,8 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
int class; int class;
mtx_assert(&sched_lock, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class); class = PRI_BASE(ts->ts_thread->td_pri_class);
if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) if (class != PRI_ITHD &&
(ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP #ifdef SMP
tdq->tdq_group->tdg_load--; tdq->tdq_group->tdg_load--;
#else #else
@@ -429,23 +458,18 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
#ifdef SMP #ifdef SMP
static void static void
sched_smp_tick(void) sched_smp_tick(struct thread *td)
{ {
struct tdq *tdq; struct tdq *tdq;
tdq = TDQ_SELF(); tdq = TDQ_SELF();
if (sched_rebalance) { if (rebalance) {
if (ticks >= bal_tick) if (ticks >= bal_tick)
sched_balance(); sched_balance();
if (ticks >= gbal_tick && balance_groups) if (ticks >= gbal_tick && balance_groups)
sched_balance_groups(); sched_balance_groups();
} }
/* td->td_sched->ts_rltick = ticks;
* We could have been assigned a non real-time thread without an
* IPI.
*/
if (tdq->tdq_assigned)
tdq_assign(tdq); /* Potentially sets NEEDRESCHED */
} }
/* /*
@@ -599,10 +623,11 @@ tdq_move(struct tdq *from, int cpu)
} }
if (tdq == to) if (tdq == to)
return; return;
ts->ts_state = TSS_THREAD; sched_rem(ts->ts_thread);
tdq_runq_rem(tdq, ts); ts->ts_cpu = cpu;
tdq_load_rem(tdq, ts); sched_pin_td(ts->ts_thread);
tdq_notify(ts, cpu); sched_add(ts->ts_thread, SRQ_YIELDING);
sched_unpin_td(ts->ts_thread);
} }
static int static int
@@ -617,21 +642,34 @@ tdq_idled(struct tdq *tdq)
* If we're in a cpu group, try and steal threads from another cpu in * If we're in a cpu group, try and steal threads from another cpu in
* the group before idling. * the group before idling.
*/ */
if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) { if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
if (steal == tdq || steal->tdq_transferable == 0) if (steal == tdq || steal->tdq_transferable == 0)
continue; continue;
ts = tdq_steal(steal, 0); ts = tdq_steal(steal, 0);
if (ts)
goto steal;
}
}
if (steal_busy) {
while (tdq_busy) {
int cpu;
cpu = ffs(tdq_busy);
if (cpu == 0)
break;
cpu--;
steal = TDQ_CPU(cpu);
if (steal->tdq_transferable == 0)
continue;
ts = tdq_steal(steal, 1);
if (ts == NULL) if (ts == NULL)
continue; continue;
ts->ts_state = TSS_THREAD; CTR5(KTR_SCHED,
tdq_runq_rem(steal, ts); "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
tdq_load_rem(steal, ts); ts->ts_thread, ts->ts_thread->td_proc->p_comm,
ts->ts_cpu = PCPU_GET(cpuid); ts->ts_thread->td_priority, cpu, tdq_busy);
sched_pin_td(ts->ts_thread); goto steal;
sched_add(ts->ts_thread, SRQ_YIELDING);
sched_unpin_td(ts->ts_thread);
return (0);
} }
} }
/* /*
@@ -640,79 +678,51 @@ tdq_idled(struct tdq *tdq)
* back and forth between two idle cores on seperate physical CPUs. * back and forth between two idle cores on seperate physical CPUs.
*/ */
tdg->tdg_idlemask |= PCPU_GET(cpumask); tdg->tdg_idlemask |= PCPU_GET(cpumask);
if (tdg->tdg_idlemask != tdg->tdg_cpumask) if (tdg->tdg_idlemask == tdg->tdg_cpumask)
return (1); atomic_set_int(&tdq_idle, tdg->tdg_mask);
atomic_set_int(&tdq_idle, tdg->tdg_mask);
return (1); return (1);
steal:
sched_rem(ts->ts_thread);
ts->ts_cpu = PCPU_GET(cpuid);
sched_pin_td(ts->ts_thread);
sched_add(ts->ts_thread, SRQ_YIELDING);
sched_unpin_td(ts->ts_thread);
return (0);
} }
static void static void
tdq_assign(struct tdq *tdq) tdq_notify(struct td_sched *ts)
{ {
struct td_sched *nts;
struct td_sched *ts;
do {
*(volatile struct td_sched **)&ts = tdq->tdq_assigned;
} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
(uintptr_t)ts, (uintptr_t)NULL));
for (; ts != NULL; ts = nts) {
nts = ts->ts_assign;
tdq->tdq_group->tdg_load--;
tdq->tdq_load--;
ts->ts_flags &= ~TSF_ASSIGNED;
if (ts->ts_flags & TSF_REMOVED) {
ts->ts_flags &= ~TSF_REMOVED;
continue;
}
sched_pin_td(ts->ts_thread);
sched_add(ts->ts_thread, SRQ_YIELDING);
sched_unpin_td(ts->ts_thread);
}
}
static void
tdq_notify(struct td_sched *ts, int cpu)
{
struct tdq *tdq;
struct thread *td; struct thread *td;
struct pcpu *pcpu; struct pcpu *pcpu;
int class;
int prio; int prio;
int cpu;
tdq = TDQ_CPU(cpu);
class = PRI_BASE(ts->ts_thread->td_pri_class);
if ((class != PRI_IDLE && class != PRI_ITHD)
&& (tdq_idle & tdq->tdq_group->tdg_mask))
atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
tdq->tdq_group->tdg_load++;
tdq->tdq_load++;
ts->ts_cpu = cpu;
ts->ts_flags |= TSF_ASSIGNED;
prio = ts->ts_thread->td_priority; prio = ts->ts_thread->td_priority;
cpu = ts->ts_cpu;
/*
* Place a thread on another cpu's queue and force a resched.
*/
do {
*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
(uintptr_t)ts->ts_assign, (uintptr_t)ts));
/* Only ipi for realtime/ithd priorities */
if (ts->ts_thread->td_priority > PRI_MIN_KERN)
return;
/*
* Without sched_lock we could lose a race where we set NEEDRESCHED
* on a thread that is switched out before the IPI is delivered. This
* would lead us to miss the resched. This will be a problem once
* sched_lock is pushed down.
*/
pcpu = pcpu_find(cpu); pcpu = pcpu_find(cpu);
td = pcpu->pc_curthread; td = pcpu->pc_curthread;
if (ts->ts_thread->td_priority < td->td_priority) { /*
* IPI if we exceed the threshold or if the target cpu is running an
* idle thread.
*/
if (prio > ipi_thresh && td->td_priority < PRI_MIN_IDLE)
return;
/*
* IPI only if our priority is better than the running thread and
* the running thread is not the per cpu idle thread. The
* idlethread finds new work via sched_runnable().
*/
if (td == pcpu->pc_idlethread)
return;
if (prio > td->td_priority)
return;
if (ipi_ast) {
td->td_flags |= TDF_NEEDRESCHED; td->td_flags |= TDF_NEEDRESCHED;
ipi_selected(1 << cpu, IPI_AST); ipi_selected(1 << cpu, IPI_AST);
} } else if (ipi_preempt)
ipi_selected(1 << cpu, IPI_PREEMPT);
} }
static struct td_sched * static struct td_sched *
@@ -762,95 +772,134 @@ tdq_steal(struct tdq *tdq, int stealidle)
} }
int int
tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class) tdq_pickidle(struct tdq *tdq, struct td_sched *ts)
{ {
struct tdq_group *ntdg;
struct tdq_group *tdg; struct tdq_group *tdg;
struct tdq *old; int self;
int cpu; int cpu;
int idx;
self = PCPU_GET(cpuid);
if (smp_started == 0) if (smp_started == 0)
return (0); return (self);
cpu = 0;
/* /*
* If our load exceeds a certain threshold we should attempt to * If the current CPU has idled, just run it here.
* reassign this thread. The first candidate is the cpu that
* originally ran the thread. If it is idle, assign it there,
* otherwise, pick an idle cpu.
*
* The threshold at which we start to reassign has a large impact
* on the overall performance of the system. Tuned too high and
* some CPUs may idle. Too low and there will be excess migration
* and context switches.
*/ */
old = TDQ_CPU(ts->ts_cpu); if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0)
ntdg = old->tdq_group; return (self);
tdg = tdq->tdq_group;
if (tdq_idle) {
if (tdq_idle & ntdg->tdg_mask) {
cpu = ffs(ntdg->tdg_idlemask);
if (cpu) {
CTR2(KTR_SCHED,
"tdq_transfer: %p found old cpu %X "
"in idlemask.", ts, cpu);
goto migrate;
}
}
/*
* Multiple cpus could find this bit simultaneously
* but the race shouldn't be terrible.
*/
cpu = ffs(tdq_idle);
if (cpu) {
CTR2(KTR_SCHED, "tdq_transfer: %p found %X "
"in idlemask.", ts, cpu);
goto migrate;
}
}
idx = 0;
#if 0
if (old->tdq_load < tdq->tdq_load) {
cpu = ts->ts_cpu + 1;
CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X "
"load less than ours.", ts, cpu);
goto migrate;
}
/* /*
* No new CPU was found, look for one with less load. * Try the last group we ran on.
*/ */
for (idx = 0; idx <= tdg_maxid; idx++) { tdg = TDQ_CPU(ts->ts_cpu)->tdq_group;
ntdg = TDQ_GROUP(idx); cpu = ffs(tdg->tdg_idlemask);
if (ntdg->tdg_load /*+ (ntdg->tdg_cpus * 2)*/ < tdg->tdg_load) { if (cpu)
cpu = ffs(ntdg->tdg_cpumask); return (cpu - 1);
CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less "
"than ours.", ts, cpu);
goto migrate;
}
}
#endif
/* /*
* If another cpu in this group has idled, assign a thread over * Search for an idle group.
* to them after checking to see if there are idled groups.
*/ */
if (tdg->tdg_idlemask) { cpu = ffs(tdq_idle);
cpu = ffs(tdg->tdg_idlemask); if (cpu)
if (cpu) { return (cpu - 1);
CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in "
"group.", ts, cpu);
goto migrate;
}
}
return (0);
migrate:
/* /*
* Now that we've found an idle CPU, migrate the thread. * XXX If there are no idle groups, check for an idle core.
*/ */
cpu--; /*
ts->ts_runq = NULL; * No idle CPUs?
tdq_notify(ts, cpu); */
return (self);
}
return (1); static int
tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags)
{
struct pcpu *pcpu;
int lowpri;
int lowcpu;
int lowload;
int load;
int self;
int pri;
int cpu;
self = PCPU_GET(cpuid);
if (smp_started == 0)
return (self);
pri = ts->ts_thread->td_priority;
/*
* Regardless of affinity, if the last cpu is idle send it there.
*/
pcpu = pcpu_find(ts->ts_cpu);
if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) {
CTR5(KTR_SCHED,
"ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
ts->ts_cpu, ts->ts_rltick, ticks, pri,
pcpu->pc_curthread->td_priority);
return (ts->ts_cpu);
}
/*
* If we have affinity, try to place it on the cpu we last ran on.
*/
if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) {
CTR5(KTR_SCHED,
"affinity for %d, ltick %d ticks %d pri %d curthread %d",
ts->ts_cpu, ts->ts_rltick, ticks, pri,
pcpu->pc_curthread->td_priority);
return (ts->ts_cpu);
}
/*
* Try ourself first; If we're running something lower priority this
* may have some locality with the waking thread and execute faster
* here.
*/
if (tryself) {
/*
* If we're being awoken by an interrupt thread or the waker
* is going right to sleep run here as well.
*/
if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING ||
curthread->td_pri_class == PRI_ITHD)) {
CTR2(KTR_SCHED, "tryself load %d flags %d",
TDQ_SELF()->tdq_load, flags);
return (self);
}
}
/*
* Look for an idle group.
*/
CTR1(KTR_SCHED, "tdq_idle %X", tdq_idle);
cpu = ffs(tdq_idle);
if (cpu)
return (cpu - 1);
if (tryselfidle && pri < curthread->td_priority) {
CTR1(KTR_SCHED, "tryself %d",
curthread->td_priority);
return (self);
}
/*
* Now search for the cpu running the lowest priority thread with
* the least load.
*/
lowload = 0;
lowpri = lowcpu = 0;
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
pcpu = pcpu_find(cpu);
pri = pcpu->pc_curthread->td_priority;
CTR4(KTR_SCHED,
"cpu %d pri %d lowcpu %d lowpri %d",
cpu, pri, lowcpu, lowpri);
if (pri < lowpri)
continue;
load = TDQ_CPU(cpu)->tdq_load;
if (lowpri && lowpri == pri && load > lowload)
continue;
lowpri = pri;
lowcpu = cpu;
lowload = load;
}
return (lowcpu);
} }
#endif /* SMP */ #endif /* SMP */
@@ -926,7 +975,6 @@ sched_setup(void *dummy)
struct tdq *tdq; struct tdq *tdq;
tdq = &tdq_cpu[i]; tdq = &tdq_cpu[i];
tdq->tdq_assigned = NULL;
tdq_setup(&tdq_cpu[i]); tdq_setup(&tdq_cpu[i]);
} }
if (smp_topology == NULL) { if (smp_topology == NULL) {
@@ -1023,6 +1071,9 @@ sched_initticks(void *dummy)
*/ */
if (tickincr == 0) if (tickincr == 0)
tickincr = 1; tickincr = 1;
#ifdef SMP
affinity = SCHED_AFFINITY_DEFAULT;
#endif
mtx_unlock_spin(&sched_lock); mtx_unlock_spin(&sched_lock);
} }
@@ -1231,16 +1282,10 @@ sched_thread_priority(struct thread *td, u_char prio)
* propagation, we may have to move ourselves to a new * propagation, we may have to move ourselves to a new
* queue. This could be optimized to not re-add in some * queue. This could be optimized to not re-add in some
* cases. * cases.
*
* Hold this td_sched on this cpu so that sched_prio() doesn't
* cause excessive migration. We only want migration to
* happen as the result of a wakeup.
*/ */
sched_pin_td(td);
sched_rem(td); sched_rem(td);
td->td_priority = prio; td->td_priority = prio;
sched_add(td, SRQ_BORROWING); sched_add(td, SRQ_BORROWING);
sched_unpin_td(td);
} else } else
td->td_priority = prio; td->td_priority = prio;
} }
@@ -1356,9 +1401,11 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
{ {
struct tdq *tdq; struct tdq *tdq;
struct td_sched *ts; struct td_sched *ts;
int preempt;
mtx_assert(&sched_lock, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED);
preempt = flags & SW_PREEMPT;
tdq = TDQ_SELF(); tdq = TDQ_SELF();
ts = td->td_sched; ts = td->td_sched;
td->td_lastcpu = td->td_oncpu; td->td_lastcpu = td->td_oncpu;
@@ -1371,19 +1418,20 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
*/ */
if (td == PCPU_GET(idlethread)) { if (td == PCPU_GET(idlethread)) {
TD_SET_CAN_RUN(td); TD_SET_CAN_RUN(td);
} else if ((ts->ts_flags & TSF_ASSIGNED) == 0) { } else {
/* We are ending our run so make our slot available again */
tdq_load_rem(tdq, ts); tdq_load_rem(tdq, ts);
if (TD_IS_RUNNING(td)) { if (TD_IS_RUNNING(td)) {
/* /*
* Don't allow the thread to migrate * Don't allow the thread to migrate
* from a preemption. * from a preemption.
*/ */
sched_pin_td(td); if (preempt)
setrunqueue(td, (flags & SW_PREEMPT) ? sched_pin_td(td);
setrunqueue(td, preempt ?
SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
SRQ_OURSELF|SRQ_YIELDING); SRQ_OURSELF|SRQ_YIELDING);
sched_unpin_td(td); if (preempt)
sched_unpin_td(td);
} }
} }
if (newtd != NULL) { if (newtd != NULL) {
@@ -1614,7 +1662,7 @@ sched_clock(struct thread *td)
mtx_assert(&sched_lock, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP #ifdef SMP
sched_smp_tick(); sched_smp_tick(td);
#endif #endif
tdq = TDQ_SELF(); tdq = TDQ_SELF();
/* /*
@@ -1656,9 +1704,6 @@ sched_clock(struct thread *td)
* We're out of time, recompute priorities and requeue. * We're out of time, recompute priorities and requeue.
*/ */
sched_priority(td); sched_priority(td);
tdq_load_rem(tdq, ts);
ts->ts_slice = sched_slice;
tdq_load_add(tdq, ts);
td->td_flags |= TDF_NEEDRESCHED; td->td_flags |= TDF_NEEDRESCHED;
} }
@@ -1672,11 +1717,8 @@ sched_runnable(void)
tdq = TDQ_SELF(); tdq = TDQ_SELF();
#ifdef SMP #ifdef SMP
if (tdq->tdq_assigned) { if (tdq_busy)
mtx_lock_spin(&sched_lock); goto out;
tdq_assign(tdq);
mtx_unlock_spin(&sched_lock);
}
#endif #endif
if ((curthread->td_flags & TDF_IDLETD) != 0) { if ((curthread->td_flags & TDF_IDLETD) != 0) {
if (tdq->tdq_load > 0) if (tdq->tdq_load > 0)
@@ -1699,8 +1741,6 @@ sched_choose(void)
tdq = TDQ_SELF(); tdq = TDQ_SELF();
#ifdef SMP #ifdef SMP
restart: restart:
if (tdq->tdq_assigned)
tdq_assign(tdq);
#endif #endif
ts = tdq_choose(tdq); ts = tdq_choose(tdq);
if (ts) { if (ts) {
@@ -1726,8 +1766,11 @@ sched_add(struct thread *td, int flags)
struct tdq *tdq; struct tdq *tdq;
struct td_sched *ts; struct td_sched *ts;
int preemptive; int preemptive;
int canmigrate;
int class; int class;
#ifdef SMP
int cpuid;
int cpumask;
#endif
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread, td, td->td_proc->p_comm, td->td_priority, curthread,
@@ -1737,15 +1780,6 @@ sched_add(struct thread *td, int flags)
ts = td->td_sched; ts = td->td_sched;
class = PRI_BASE(td->td_pri_class); class = PRI_BASE(td->td_pri_class);
preemptive = !(flags & SRQ_YIELDING); preemptive = !(flags & SRQ_YIELDING);
canmigrate = 1;
#ifdef SMP
if (ts->ts_flags & TSF_ASSIGNED) {
if (ts->ts_flags & TSF_REMOVED)
ts->ts_flags &= ~TSF_REMOVED;
return;
}
canmigrate = THREAD_CAN_MIGRATE(td);
#endif
KASSERT(ts->ts_state != TSS_ONRUNQ, KASSERT(ts->ts_state != TSS_ONRUNQ,
("sched_add: thread %p (%s) already in run queue", td, ("sched_add: thread %p (%s) already in run queue", td,
td->td_proc->p_comm)); td->td_proc->p_comm));
@@ -1754,42 +1788,38 @@ sched_add(struct thread *td, int flags)
KASSERT(ts->ts_runq == NULL, KASSERT(ts->ts_runq == NULL,
("sched_add: thread %p is still assigned to a run queue", td)); ("sched_add: thread %p is still assigned to a run queue", td));
/* /*
* Set the slice and pick the run queue. * Recalculate the priority before we select the target cpu or
* run-queue.
*/ */
if (ts->ts_slice == 0)
ts->ts_slice = sched_slice;
if (class == PRI_TIMESHARE) if (class == PRI_TIMESHARE)
sched_priority(td); sched_priority(td);
if (td->td_priority <= PRI_MAX_REALTIME) {
ts->ts_runq = &tdq->tdq_realtime;
/*
* If the thread is not artificially pinned and it's in
* the realtime queue we directly dispatch it on this cpu
* for minimum latency. Interrupt handlers may also have
* to complete on the cpu that dispatched them.
*/
if (td->td_pinned == 0 && class == PRI_ITHD)
ts->ts_cpu = PCPU_GET(cpuid);
} else if (td->td_priority <= PRI_MAX_TIMESHARE)
ts->ts_runq = &tdq->tdq_timeshare;
else
ts->ts_runq = &tdq->tdq_idle;
#ifdef SMP #ifdef SMP
cpuid = PCPU_GET(cpuid);
/* /*
* If this thread is pinned or bound, notify the target cpu. * Pick the destination cpu and if it isn't ours transfer to the
* target cpu.
*/ */
if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid) ) { if (THREAD_CAN_MIGRATE(td)) {
ts->ts_runq = NULL; if (td->td_priority <= PRI_MAX_ITHD) {
tdq_notify(ts, ts->ts_cpu); CTR2(KTR_SCHED, "ithd %d < %d", td->td_priority, PRI_MAX_ITHD);
return; ts->ts_cpu = cpuid;
} }
if (pick_pri)
ts->ts_cpu = tdq_pickpri(tdq, ts, flags);
else
ts->ts_cpu = tdq_pickidle(tdq, ts);
} else
CTR1(KTR_SCHED, "pinned %d", td->td_pinned);
if (ts->ts_cpu != cpuid)
preemptive = 0;
tdq = TDQ_CPU(ts->ts_cpu);
cpumask = 1 << ts->ts_cpu;
/* /*
* If we had been idle, clear our bit in the group and potentially * If we had been idle, clear our bit in the group and potentially
* the global bitmap. If not, see if we should transfer this thread. * the global bitmap.
*/ */
if ((class != PRI_IDLE && class != PRI_ITHD) && if ((class != PRI_IDLE && class != PRI_ITHD) &&
(tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) { (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
/* /*
* Check to see if our group is unidling, and if so, remove it * Check to see if our group is unidling, and if so, remove it
* from the global idle mask. * from the global idle mask.
@@ -1800,20 +1830,34 @@ sched_add(struct thread *td, int flags)
/* /*
* Now remove ourselves from the group specific idle mask. * Now remove ourselves from the group specific idle mask.
*/ */
tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask); tdq->tdq_group->tdg_idlemask &= ~cpumask;
} else if (canmigrate && tdq->tdq_load > 1) }
if (tdq_transfer(tdq, ts, class))
return;
ts->ts_cpu = PCPU_GET(cpuid);
#endif #endif
if (td->td_priority < curthread->td_priority) /*
curthread->td_flags |= TDF_NEEDRESCHED; * Set the slice and pick the run queue.
*/
if (ts->ts_slice == 0)
ts->ts_slice = sched_slice;
if (td->td_priority <= PRI_MAX_REALTIME)
ts->ts_runq = &tdq->tdq_realtime;
else if (td->td_priority <= PRI_MAX_TIMESHARE)
ts->ts_runq = &tdq->tdq_timeshare;
else
ts->ts_runq = &tdq->tdq_idle;
if (preemptive && maybe_preempt(td)) if (preemptive && maybe_preempt(td))
return; return;
ts->ts_state = TSS_ONRUNQ; ts->ts_state = TSS_ONRUNQ;
tdq_runq_add(tdq, ts, flags); tdq_runq_add(tdq, ts, flags);
tdq_load_add(tdq, ts); tdq_load_add(tdq, ts);
#ifdef SMP
if (ts->ts_cpu != cpuid) {
tdq_notify(ts);
return;
}
#endif
if (td->td_priority < curthread->td_priority)
curthread->td_flags |= TDF_NEEDRESCHED;
} }
void void
@@ -1827,10 +1871,6 @@ sched_rem(struct thread *td)
curthread->td_proc->p_comm); curthread->td_proc->p_comm);
mtx_assert(&sched_lock, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED);
ts = td->td_sched; ts = td->td_sched;
if (ts->ts_flags & TSF_ASSIGNED) {
ts->ts_flags |= TSF_REMOVED;
return;
}
KASSERT((ts->ts_state == TSS_ONRUNQ), KASSERT((ts->ts_state == TSS_ONRUNQ),
("sched_rem: thread not on run queue")); ("sched_rem: thread not on run queue"));
@@ -1881,8 +1921,6 @@ sched_bind(struct thread *td, int cpu)
return; return;
/* sched_rem without the runq_remove */ /* sched_rem without the runq_remove */
ts->ts_state = TSS_THREAD; ts->ts_state = TSS_THREAD;
tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts);
tdq_notify(ts, cpu);
/* When we return from mi_switch we'll be on the correct cpu. */ /* When we return from mi_switch we'll be on the correct cpu. */
mi_switch(SW_VOL, NULL); mi_switch(SW_VOL, NULL);
sched_pin(); sched_pin();
@@ -1962,7 +2000,22 @@ SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &sched_rebalance, 0, ""); #ifdef SMP
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW,
&affinity, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW,
&tryself, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW,
&tryselfidle, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
#endif
/* ps compat */ /* ps compat */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */