From 119fa6328d23c9b528d0ea704f4a460c117ed15d Mon Sep 17 00:00:00 2001 From: jeff Date: Tue, 17 Jul 2007 22:53:23 +0000 Subject: [PATCH] ULE 3.0: Fine grain scheduler locking and affinity improvements. This has been in development for over 6 months as SCHED_SMP. - Implement one spin lock per thread-queue. Threads assigned to a run-queue point to this lock via td_lock. - Improve the facility for assigning threads to CPUs now that sched_lock contention no longer dominates scheduling decisions on larger SMP machines. - Re-write idle time stealing in an attempt to make it less damaging to general performance. This is still disabled by default. See kern.sched.steal_idle. - Call the long-term load balancer from a callout rather than sched_clock() so there are no locks held. This is disabled by default. See kern.sched.balance. - Parameterize many scheduling decisions via sysctls. Try to document these via sysctl descriptions. - General structural and naming cleanups. - Document each function with comments. Tested by: current@ amd64, x86, UP, SMP. Approved by: re --- sys/kern/sched_ule.c | 1466 ++++++++++++++++++++++++++---------------- 1 file changed, 917 insertions(+), 549 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 9b1fff6829cb..a90e65798125 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -24,6 +24,17 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* + * This file implements the ULE scheduler. ULE supports independent CPU + * run queues and fine grain locking. It has superior interactive + * performance under load even on uni-processor systems. + * + * etymology: + * ULE is the last three letters in schedule. It owes it's name to a + * generic user created for a scheduling system by Paul Mikesell at + * Isilon Systems and a general lack of creativity on the part of the author. + */ + #include __FBSDID("$FreeBSD$"); @@ -64,26 +75,23 @@ __FBSDID("$FreeBSD$"); #error "SCHED_ULE requires options PREEMPTION" #endif -/* - * TODO: - * Pick idle from affinity group or self group first. - * Implement pick_score. - */ - -#define KTR_ULE 0x0 /* Enable for pickpri debugging. */ +#define KTR_ULE 0 /* - * Thread scheduler specific section. + * Thread scheduler specific section. All fields are protected + * by the thread lock. */ struct td_sched { - TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */ - int ts_flags; /* (j) TSF_* flags. */ - struct thread *ts_thread; /* (*) Active associated thread. */ - u_char ts_rqindex; /* (j) Run queue index. */ - int ts_slptime; - int ts_slice; - struct runq *ts_runq; + TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */ + struct thread *ts_thread; /* Active associated thread. */ + struct runq *ts_runq; /* Run-queue we're queued on. */ + short ts_flags; /* TSF_* flags. */ + u_char ts_rqindex; /* Run queue index. */ u_char ts_cpu; /* CPU that we have affinity for. */ + int ts_slptick; /* Tick when we went to sleep. */ + int ts_slice; /* Ticks of slice remaining. */ + u_int ts_slptime; /* Number of ticks we vol. slept */ + u_int ts_runtime; /* Number of ticks we were running */ /* The following variables are only used for pctcpu calculation */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ @@ -91,10 +99,6 @@ struct td_sched { #ifdef SMP int ts_rltick; /* Real last tick, for affinity. */ #endif - - /* originally from kg_sched */ - u_int skg_slptime; /* Number of ticks we vol. 
slept */ - u_int skg_runtime; /* Number of ticks we were running */ }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ @@ -165,33 +169,40 @@ static struct td_sched td_sched0; * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. + * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int sched_interact = SCHED_INTERACT_THRESH; static int realstathz; static int tickincr; static int sched_slice; +static int preempt_thresh = PRI_MIN_KERN; + +#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */ /* - * tdq - per processor runqs and statistics. + * tdq - per processor runqs and statistics. All fields are protected by the + * tdq_lock. The load and lowpri may be accessed without to avoid excess + * locking in sched_pickcpu(); */ struct tdq { - struct runq tdq_idle; /* Queue of IDLE threads. */ - struct runq tdq_timeshare; /* timeshare run queue. */ + struct mtx tdq_lock; /* Protects all fields below. */ struct runq tdq_realtime; /* real-time run queue. */ + struct runq tdq_timeshare; /* timeshare run queue. */ + struct runq tdq_idle; /* Queue of IDLE threads. */ + int tdq_load; /* Aggregate load. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ - short tdq_flags; /* Thread queue flags */ - int tdq_load; /* Aggregate load. */ #ifdef SMP - int tdq_transferable; + u_char tdq_lowpri; /* Lowest priority thread. */ + int tdq_transferable; /* Transferable thread count. */ LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ struct tdq_group *tdq_group; /* Our processor group. */ #else int tdq_sysload; /* For loadavg, !ITHD load. */ #endif -}; + char tdq_name[16]; /* lock name. */ +} __aligned(64); -#define TDQF_BUSY 0x0001 /* Queue is marked as busy */ #ifdef SMP /* @@ -210,9 +221,9 @@ struct tdq_group { int tdg_load; /* Total load of this group. */ int tdg_transferable; /* Transferable load of this group. */ LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ -}; +} __aligned(64); -#define SCHED_AFFINITY_DEFAULT (hz / 100) +#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300)) #define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) /* @@ -220,28 +231,23 @@ struct tdq_group { */ static int rebalance = 0; static int pick_pri = 0; +static int pick_zero = 0; static int affinity; static int tryself = 1; static int tryselfidle = 1; -static int ipi_ast = 0; -static int ipi_preempt = 1; -static int ipi_thresh = PRI_MIN_KERN; -static int steal_htt = 1; -static int steal_busy = 1; -static int busy_thresh = 4; +static int steal_htt = 0; +static int steal_idle = 0; static int topology = 0; /* * One thread queue per processor. 
*/ static volatile cpumask_t tdq_idle; -static volatile cpumask_t tdq_busy; static int tdg_maxid; static struct tdq tdq_cpu[MAXCPU]; static struct tdq_group tdq_groups[MAXCPU]; -static int bal_tick; -static int gbal_tick; -static int balance_groups; +static struct callout balco; +static struct callout gbalco; #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) @@ -255,14 +261,18 @@ static struct tdq tdq_cpu; #define TDQ_CPU(x) (&tdq_cpu) #endif +#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) +#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) +#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) +#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) + static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *); -static inline void sched_pin_td(struct thread *td); -static inline void sched_unpin_td(struct thread *td); /* Operations on per processor queues */ static struct td_sched * tdq_choose(struct tdq *); @@ -273,19 +283,21 @@ static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); void tdq_print(int cpu); static void runq_print(struct runq *rq); +static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP -static int tdq_pickidle(struct tdq *, struct td_sched *); -static int tdq_pickpri(struct tdq *, struct td_sched *, int); -static struct td_sched *runq_steal(struct runq *); -static void sched_balance(void); -static void sched_balance_groups(void); -static void sched_balance_group(struct tdq_group *); -static void sched_balance_pair(struct tdq *, struct tdq *); -static void sched_smp_tick(struct thread *); -static void tdq_move(struct tdq *, int); +static void tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct td_sched *); static struct td_sched *tdq_steal(struct tdq *, int); +static struct td_sched *runq_steal(struct runq *); +static int sched_pickcpu(struct td_sched *, int); +static void sched_balance(void *); +static void sched_balance_groups(void *); +static void sched_balance_group(struct tdq_group *); +static void sched_balance_pair(struct tdq *, struct tdq *); +static inline struct tdq *sched_setcpu(struct td_sched *, int, int); +static inline struct mtx *thread_block_switch(struct thread *); +static inline void thread_unblock_switch(struct thread *, struct mtx *); #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #endif @@ -296,18 +308,9 @@ SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) -static inline void -sched_pin_td(struct thread *td) -{ - td->td_pinned++; -} - -static inline void -sched_unpin_td(struct thread *td) -{ - td->td_pinned--; -} - +/* + * Print the threads waiting on a run-queue. + */ static void runq_print(struct runq *rq) { @@ -332,6 +335,9 @@ runq_print(struct runq *rq) } } +/* + * Print the status of a per-cpu thread queue. Should be a ddb show cmd. 
+ */ void tdq_print(int cpu) { @@ -340,8 +346,10 @@ tdq_print(int cpu) tdq = TDQ_CPU(cpu); printf("tdq:\n"); + printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock name %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); - printf("\ttimeshare idx: %d\n", tdq->tdq_idx); + printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); @@ -351,22 +359,26 @@ tdq_print(int cpu) runq_print(&tdq->tdq_idle); #ifdef SMP printf("\tload transferable: %d\n", tdq->tdq_transferable); + printf("\tlowest priority: %d\n", tdq->tdq_lowpri); #endif } +#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) +/* + * Add a thread to the actual run-queue. Keeps transferable counts up to + * date with what is actually on the run-queue. Selects the correct + * queue position for timeshare threads. + */ static __inline void tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) { + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); #ifdef SMP if (THREAD_CAN_MIGRATE(ts->ts_thread)) { tdq->tdq_transferable++; tdq->tdq_group->tdg_transferable++; ts->ts_flags |= TSF_XFERABLE; - if (tdq->tdq_transferable >= busy_thresh && - (tdq->tdq_flags & TDQF_BUSY) == 0) { - tdq->tdq_flags |= TDQF_BUSY; - atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq)); - } } #endif if (ts->ts_runq == &tdq->tdq_timeshare) { @@ -379,7 +391,6 @@ tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ -#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) if ((flags & SRQ_BORROWING) == 0) { pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; pri = (pri + tdq->tdq_idx) % RQ_NQS; @@ -398,19 +409,22 @@ tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) runq_add(ts->ts_runq, ts, flags); } +/* + * Remove a thread from a run-queue. This typically happens when a thread + * is selected to run. Running threads are not on the queue and the + * transferable count does not reflect them. + */ static __inline void tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) { + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + KASSERT(ts->ts_runq != NULL, + ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread)); #ifdef SMP if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; tdq->tdq_group->tdg_transferable--; ts->ts_flags &= ~TSF_XFERABLE; - if (tdq->tdq_transferable < busy_thresh && - (tdq->tdq_flags & TDQF_BUSY)) { - atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq)); - tdq->tdq_flags &= ~TDQF_BUSY; - } } #endif if (ts->ts_runq == &tdq->tdq_timeshare) { @@ -429,11 +443,17 @@ tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) runq_remove(ts->ts_runq, ts); } +/* + * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load + * for this thread to the referenced thread queue. + */ static void tdq_load_add(struct tdq *tdq, struct td_sched *ts) { int class; - mtx_assert(&sched_lock, MA_OWNED); + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); tdq->tdq_load++; CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); @@ -446,11 +466,17 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts) #endif } +/* + * Remove the load from a thread that is transitioning to a sleep state or + * exiting. 
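+ * Both the thread lock and the tdq lock must be held; the function asserts this.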
+ */ static void tdq_load_rem(struct tdq *tdq, struct td_sched *ts) { int class; - mtx_assert(&sched_lock, MA_OWNED); + + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) @@ -459,27 +485,14 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts) #else tdq->tdq_sysload--; #endif + KASSERT(tdq->tdq_load != 0, + ("tdq_load_rem: Removing with 0 load on queue %d", (int)TDQ_ID(tdq))); tdq->tdq_load--; CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); ts->ts_runq = NULL; } #ifdef SMP -static void -sched_smp_tick(struct thread *td) -{ - struct tdq *tdq; - - tdq = TDQ_SELF(); - if (rebalance) { - if (ticks >= bal_tick) - sched_balance(); - if (ticks >= gbal_tick && balance_groups) - sched_balance_groups(); - } - td->td_sched->ts_rltick = ticks; -} - /* * sched_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load @@ -489,15 +502,11 @@ sched_smp_tick(struct thread *td) * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses - * algorithm simplicity and more gradual effects on load in larger systems. - * - * It could be improved by considering the priorities and slices assigned to - * each task prior to balancing them. There are many pathological cases with - * any approach and so the semi random algorithm below may work as well as any. + * simplicity and more gradual effects on load in larger systems. * */ static void -sched_balance(void) +sched_balance(void *arg) { struct tdq_group *high; struct tdq_group *low; @@ -505,8 +514,9 @@ sched_balance(void) int cnt; int i; - bal_tick = ticks + (random() % (hz * 2)); - if (smp_started == 0) + callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)), + sched_balance, NULL); + if (smp_started == 0 || rebalance == 0) return; low = high = NULL; i = random() % (tdg_maxid + 1); @@ -529,18 +539,25 @@ sched_balance(void) LIST_FIRST(&low->tdg_members)); } +/* + * Balance load between CPUs in a group. Will only migrate within the group. + */ static void -sched_balance_groups(void) +sched_balance_groups(void *arg) { int i; - gbal_tick = ticks + (random() % (hz * 2)); - mtx_assert(&sched_lock, MA_OWNED); - if (smp_started) - for (i = 0; i <= tdg_maxid; i++) - sched_balance_group(TDQ_GROUP(i)); + callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)), + sched_balance_groups, NULL); + if (smp_started == 0 || rebalance == 0) + return; + for (i = 0; i <= tdg_maxid; i++) + sched_balance_group(TDQ_GROUP(i)); } +/* + * Finds the greatest imbalance between two tdqs in a group. + */ static void sched_balance_group(struct tdq_group *tdg) { @@ -564,6 +581,24 @@ sched_balance_group(struct tdq_group *tdg) sched_balance_pair(high, low); } +/* + * Lock two thread queues using their address to maintain lock order. + */ +static void +tdq_lock_pair(struct tdq *one, struct tdq *two) +{ + if (one < two) { + TDQ_LOCK(one); + TDQ_LOCK_FLAGS(two, MTX_DUPOK); + } else { + TDQ_LOCK(two); + TDQ_LOCK_FLAGS(one, MTX_DUPOK); + } +} + +/* + * Transfer load between two imbalanced thread queues. 
+ */ static void sched_balance_pair(struct tdq *high, struct tdq *low) { @@ -574,6 +609,7 @@ sched_balance_pair(struct tdq *high, struct tdq *low) int diff; int i; + tdq_lock_pair(high, low); /* * If we're transfering within a group we have to use this specific * tdq's transferable count, otherwise we can steal from other members @@ -588,31 +624,37 @@ sched_balance_pair(struct tdq *high, struct tdq *low) high_load = high->tdq_group->tdg_load; low_load = low->tdq_group->tdg_load; } - if (transferable == 0) - return; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). */ - diff = high_load - low_load; - move = diff / 2; - if (diff & 0x1) - move++; - move = min(move, transferable); - for (i = 0; i < move; i++) - tdq_move(high, TDQ_ID(low)); + if (transferable != 0) { + diff = high_load - low_load; + move = diff / 2; + if (diff & 0x1) + move++; + move = min(move, transferable); + for (i = 0; i < move; i++) + tdq_move(high, low); + } + TDQ_UNLOCK(high); + TDQ_UNLOCK(low); return; } +/* + * Move a thread from one thread queue to another. + */ static void -tdq_move(struct tdq *from, int cpu) +tdq_move(struct tdq *from, struct tdq *to) { - struct tdq *tdq; - struct tdq *to; struct td_sched *ts; + struct thread *td; + struct tdq *tdq; + int cpu; tdq = from; - to = TDQ_CPU(cpu); + cpu = TDQ_ID(to); ts = tdq_steal(tdq, 1); if (ts == NULL) { struct tdq_group *tdg; @@ -625,26 +667,42 @@ tdq_move(struct tdq *from, int cpu) break; } if (ts == NULL) - panic("tdq_move: No threads available with a " - "transferable count of %d\n", - tdg->tdg_transferable); + return; } if (tdq == to) return; - sched_rem(ts->ts_thread); + td = ts->ts_thread; + /* + * Although the run queue is locked the thread may be blocked. Lock + * it to clear this. + */ + thread_lock(td); + /* Drop recursive lock on from. */ + TDQ_UNLOCK(from); + sched_rem(td); ts->ts_cpu = cpu; - sched_pin_td(ts->ts_thread); - sched_add(ts->ts_thread, SRQ_YIELDING); - sched_unpin_td(ts->ts_thread); + td->td_lock = TDQ_LOCKPTR(to); + tdq_add(to, td, SRQ_YIELDING); } +/* + * This tdq has idled. Try to steal a thread from another cpu and switch + * to it. 
+ */ static int tdq_idled(struct tdq *tdq) { struct tdq_group *tdg; struct tdq *steal; struct td_sched *ts; + struct thread *td; + int highload; + int highcpu; + int load; + int cpu; + /* We don't want to be preempted while we're iterating over tdqs */ + spinlock_enter(); tdg = tdq->tdq_group; /* * If we're in a cpu group, try and steal threads from another cpu in @@ -654,51 +712,59 @@ tdq_idled(struct tdq *tdq) LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { if (steal == tdq || steal->tdq_transferable == 0) continue; + TDQ_LOCK(steal); ts = tdq_steal(steal, 0); if (ts) goto steal; + TDQ_UNLOCK(steal); } } - if (steal_busy) { - while (tdq_busy) { - int cpu; - - cpu = ffs(tdq_busy); - if (cpu == 0) - break; - cpu--; + for (;;) { + if (steal_idle == 0) + break; + highcpu = 0; + highload = 0; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; steal = TDQ_CPU(cpu); - if (steal->tdq_transferable == 0) + load = TDQ_CPU(cpu)->tdq_transferable; + if (load < highload) continue; - ts = tdq_steal(steal, 1); - if (ts == NULL) - continue; - CTR5(KTR_ULE, - "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X", - ts->ts_thread, ts->ts_thread->td_proc->p_comm, - ts->ts_thread->td_priority, cpu, tdq_busy); - goto steal; + highload = load; + highcpu = cpu; } + if (highload < 2) + break; + steal = TDQ_CPU(highcpu); + TDQ_LOCK(steal); + if (steal->tdq_transferable > 1 && + (ts = tdq_steal(steal, 1)) != NULL) + goto steal; + TDQ_UNLOCK(steal); + break; } - /* - * We only set the idled bit when all of the cpus in the group are - * idle. Otherwise we could get into a situation where a thread bounces - * back and forth between two idle cores on seperate physical CPUs. - */ - tdg->tdg_idlemask |= PCPU_GET(cpumask); - if (tdg->tdg_idlemask == tdg->tdg_cpumask) - atomic_set_int(&tdq_idle, tdg->tdg_mask); + spinlock_exit(); return (1); steal: - sched_rem(ts->ts_thread); - ts->ts_cpu = PCPU_GET(cpuid); - sched_pin_td(ts->ts_thread); - sched_add(ts->ts_thread, SRQ_YIELDING); - sched_unpin_td(ts->ts_thread); + td = ts->ts_thread; + thread_lock(td); + spinlock_exit(); + MPASS(td->td_lock == TDQ_LOCKPTR(steal)); + TDQ_UNLOCK(steal); + sched_rem(td); + sched_setcpu(ts, PCPU_GET(cpuid), SRQ_YIELDING); + tdq_add(tdq, td, SRQ_YIELDING); + MPASS(td->td_lock == curthread->td_lock); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); return (0); } +/* + * Notify a remote cpu of new work. Sends an IPI if criteria are met. + */ static void tdq_notify(struct td_sched *ts) { @@ -734,29 +800,74 @@ tdq_notify(struct td_sched *ts) /* * Otherwise only IPI if we exceed the threshold. */ - if (pri > ipi_thresh) + if (pri > preempt_thresh) return; sendipi: ctd->td_flags |= TDF_NEEDRESCHED; - if (cpri < PRI_MIN_IDLE) { - if (ipi_ast) - ipi_selected(1 << cpu, IPI_AST); - else if (ipi_preempt) - ipi_selected(1 << cpu, IPI_PREEMPT); - } else - ipi_selected(1 << cpu, IPI_PREEMPT); + ipi_selected(1 << cpu, IPI_PREEMPT); } +/* + * Steals load from a timeshare queue. Honors the rotating queue head + * index. 
+ */ +static struct td_sched * +runq_steal_from(struct runq *rq, u_char start) +{ + struct td_sched *ts; + struct rqbits *rqb; + struct rqhead *rqh; + int first; + int bit; + int pri; + int i; + + rqb = &rq->rq_status; + bit = start & (RQB_BPW -1); + pri = 0; + first = 0; +again: + for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { + if (rqb->rqb_bits[i] == 0) + continue; + if (bit != 0) { + for (pri = bit; pri < RQB_BPW; pri++) + if (rqb->rqb_bits[i] & (1ul << pri)) + break; + if (pri >= RQB_BPW) + continue; + } else + pri = RQB_FFS(rqb->rqb_bits[i]); + pri += (i << RQB_L2BPW); + rqh = &rq->rq_queues[pri]; + TAILQ_FOREACH(ts, rqh, ts_procq) { + if (first && THREAD_CAN_MIGRATE(ts->ts_thread)) + return (ts); + first = 1; + } + } + if (start != 0) { + start = 0; + goto again; + } + + return (NULL); +} + +/* + * Steals load from a standard linear queue. + */ static struct td_sched * runq_steal(struct runq *rq) { struct rqhead *rqh; struct rqbits *rqb; struct td_sched *ts; + int first; int word; int bit; - mtx_assert(&sched_lock, MA_OWNED); + first = 0; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) @@ -766,106 +877,186 @@ runq_steal(struct runq *rq) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(ts, rqh, ts_procq) { - if (THREAD_CAN_MIGRATE(ts->ts_thread)) + if (first && THREAD_CAN_MIGRATE(ts->ts_thread)) return (ts); + first = 1; } } } return (NULL); } +/* + * Attempt to steal a thread in priority order from a thread queue. + */ static struct td_sched * tdq_steal(struct tdq *tdq, int stealidle) { struct td_sched *ts; - /* - * Steal from next first to try to get a non-interactive task that - * may not have run for a while. - * XXX Need to effect steal order for timeshare threads. - */ + TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL) return (ts); - if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL) + if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL) return (ts); if (stealidle) return (runq_steal(&tdq->tdq_idle)); return (NULL); } -int -tdq_pickidle(struct tdq *tdq, struct td_sched *ts) +/* + * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the + * current lock and returns with the assigned queue locked. If this is + * via sched_switch() we leave the thread in a blocked state as an + * optimization. + */ +static inline struct tdq * +sched_setcpu(struct td_sched *ts, int cpu, int flags) { - struct tdq_group *tdg; - int self; - int cpu; + struct thread *td; + struct tdq *tdq; - self = PCPU_GET(cpuid); - if (smp_started == 0) - return (self); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + + tdq = TDQ_CPU(cpu); + td = ts->ts_thread; + ts->ts_cpu = cpu; + if (td->td_lock == TDQ_LOCKPTR(tdq)) + return (tdq); +#ifdef notyet /* - * If the current CPU has idled, just run it here. + * If the thread isn't running it's lockptr is a + * turnstile or a sleepqueue. We can just lock_set without + * blocking. */ - if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) - return (self); + if (TD_CAN_RUN(td)) { + TDQ_LOCK(tdq); + thread_lock_set(td, TDQ_LOCKPTR(tdq)); + return (tdq); + } +#endif /* - * Try the last group we ran on. + * The hard case, migration, we need to block the thread first to + * prevent order reversals with other cpus locks. */ - tdg = TDQ_CPU(ts->ts_cpu)->tdq_group; - cpu = ffs(tdg->tdg_idlemask); - if (cpu) - return (cpu - 1); - /* - * Search for an idle group. 
- */ - cpu = ffs(tdq_idle); - if (cpu) - return (cpu - 1); - /* - * XXX If there are no idle groups, check for an idle core. - */ - /* - * No idle CPUs? - */ - return (self); + thread_lock_block(td); + TDQ_LOCK(tdq); + /* Return to sched_switch() with the lock still blocked */ + if ((flags & SRQ_OURSELF) == 0) + thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); + return (tdq); } +/* + * Find the thread queue running the lowest priority thread. + */ static int -tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags) +tdq_lowestpri(void) { - struct pcpu *pcpu; + struct tdq *tdq; int lowpri; int lowcpu; int lowload; int load; + int cpu; + int pri; + + lowload = 0; + lowpri = lowcpu = 0; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; + tdq = TDQ_CPU(cpu); + pri = tdq->tdq_lowpri; + load = TDQ_CPU(cpu)->tdq_load; + CTR4(KTR_ULE, + "cpu %d pri %d lowcpu %d lowpri %d", + cpu, pri, lowcpu, lowpri); + if (pri < lowpri) + continue; + if (lowpri && lowpri == pri && load > lowload) + continue; + lowpri = pri; + lowcpu = cpu; + lowload = load; + } + + return (lowcpu); +} + +/* + * Find the thread queue with the least load. + */ +static int +tdq_lowestload(void) +{ + struct tdq *tdq; + int lowload; + int lowpri; + int lowcpu; + int load; + int cpu; + int pri; + + lowcpu = 0; + lowload = TDQ_CPU(0)->tdq_load; + lowpri = TDQ_CPU(0)->tdq_lowpri; + for (cpu = 1; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; + tdq = TDQ_CPU(cpu); + load = tdq->tdq_load; + pri = tdq->tdq_lowpri; + CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d", + cpu, load, lowcpu, lowload); + if (load > lowload) + continue; + if (load == lowload && pri < lowpri) + continue; + lowcpu = cpu; + lowload = load; + lowpri = pri; + } + + return (lowcpu); +} + +/* + * Pick the destination cpu for sched_add(). Respects affinity and makes + * a determination based on load or priority of available processors. + */ +static int +sched_pickcpu(struct td_sched *ts, int flags) +{ + struct tdq *tdq; int self; int pri; int cpu; - self = PCPU_GET(cpuid); + cpu = self = PCPU_GET(cpuid); if (smp_started == 0) return (self); - pri = ts->ts_thread->td_priority; + cpu = ts->ts_cpu; /* * Regardless of affinity, if the last cpu is idle send it there. */ - pcpu = pcpu_find(ts->ts_cpu); - if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) { + tdq = TDQ_CPU(cpu); + if (tdq->tdq_lowpri > PRI_MIN_IDLE) { CTR5(KTR_ULE, "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d", ts->ts_cpu, ts->ts_rltick, ticks, pri, - pcpu->pc_curthread->td_priority); + tdq->tdq_lowpri); return (ts->ts_cpu); } /* * If we have affinity, try to place it on the cpu we last ran on. */ - if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) { + if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) { CTR5(KTR_ULE, "affinity for %d, ltick %d ticks %d pri %d curthread %d", ts->ts_cpu, ts->ts_rltick, ticks, pri, - pcpu->pc_curthread->td_priority); + tdq->tdq_lowpri); return (ts->ts_cpu); } /* @@ -878,7 +1069,7 @@ tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags) * If we're being awoken by an interrupt thread or the waker * is going right to sleep run here as well. 
*/ - if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING || + if ((TDQ_SELF()->tdq_load <= 1) && (flags & (SRQ_YIELDING) || curthread->td_pri_class == PRI_ITHD)) { CTR2(KTR_ULE, "tryself load %d flags %d", TDQ_SELF()->tdq_load, flags); @@ -891,37 +1082,28 @@ tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags) CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); cpu = ffs(tdq_idle); if (cpu) - return (cpu - 1); + return (--cpu); if (tryselfidle && pri < curthread->td_priority) { - CTR1(KTR_ULE, "tryself %d", + CTR1(KTR_ULE, "tryselfidle %d", curthread->td_priority); return (self); } + /* + * XXX Under heavy load mysql performs way better if you + * serialize the non-running threads on one cpu. This is + * a horrible hack. + */ + if (pick_zero) + return (0); /* * Now search for the cpu running the lowest priority thread with * the least load. */ - lowload = 0; - lowpri = lowcpu = 0; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - pcpu = pcpu_find(cpu); - pri = pcpu->pc_curthread->td_priority; - CTR4(KTR_ULE, - "cpu %d pri %d lowcpu %d lowpri %d", - cpu, pri, lowcpu, lowpri); - if (pri < lowpri) - continue; - load = TDQ_CPU(cpu)->tdq_load; - if (lowpri && lowpri == pri && load > lowload) - continue; - lowpri = pri; - lowcpu = cpu; - lowload = load; - } - - return (lowcpu); + if (pick_pri) + cpu = tdq_lowestpri(); + else + cpu = tdq_lowestload(); + return (cpu); } #endif /* SMP */ @@ -929,14 +1111,12 @@ tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags) /* * Pick the highest priority task we have and return it. */ - static struct td_sched * tdq_choose(struct tdq *tdq) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); - + TDQ_LOCK_ASSERT(tdq, MA_OWNED); ts = runq_choose(&tdq->tdq_realtime); if (ts != NULL) return (ts); @@ -959,44 +1139,45 @@ tdq_choose(struct tdq *tdq) return (NULL); } +/* + * Initialize a thread queue. + */ static void tdq_setup(struct tdq *tdq) { + + snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), + "sched lock %d", (int)TDQ_ID(tdq)); + mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", + MTX_SPIN | MTX_RECURSE); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_load = 0; } +/* + * Setup the thread queues and initialize the topology based on MD + * information. + */ static void sched_setup(void *dummy) { + struct tdq *tdq; #ifdef SMP + int balance_groups; int i; -#endif - /* - * To avoid divide-by-zero, we set realstathz a dummy value - * in case which sched_clock() called before sched_initticks(). - */ - realstathz = hz; - sched_slice = (realstathz/10); /* ~100ms */ - tickincr = 1 << SCHED_TICK_SHIFT; - -#ifdef SMP balance_groups = 0; /* * Initialize the tdqs. */ for (i = 0; i < MAXCPU; i++) { - struct tdq *tdq; - tdq = &tdq_cpu[i]; tdq_setup(&tdq_cpu[i]); } if (smp_topology == NULL) { struct tdq_group *tdg; - struct tdq *tdq; int cpus; for (cpus = 0, i = 0; i < MAXCPU; i++) { @@ -1056,25 +1237,41 @@ sched_setup(void *dummy) tdg_maxid = smp_topology->ct_count - 1; } /* - * Stagger the group and global load balancer so they do not - * interfere with each other. + * Initialize long-term cpu balancing algorithm. 
*/ - bal_tick = ticks + hz; + callout_init(&balco, CALLOUT_MPSAFE); + callout_init(&gbalco, CALLOUT_MPSAFE); + sched_balance(NULL); if (balance_groups) - gbal_tick = ticks + (hz / 2); + sched_balance_groups(NULL); + #else tdq_setup(TDQ_SELF()); #endif - mtx_lock_spin(&sched_lock); - tdq_load_add(TDQ_SELF(), &td_sched0); - mtx_unlock_spin(&sched_lock); + /* + * To avoid divide-by-zero, we set realstathz a dummy value + * in case which sched_clock() called before sched_initticks(). + */ + realstathz = hz; + sched_slice = (realstathz/10); /* ~100ms */ + tickincr = 1 << SCHED_TICK_SHIFT; + + /* Add thread0's load since it's running. */ + tdq = TDQ_SELF(); + TDQ_LOCK(tdq); + tdq_load_add(tdq, &td_sched0); + TDQ_UNLOCK(tdq); } +/* + * This routine determines the tickincr after stathz and hz are setup. + */ /* ARGSUSED */ static void sched_initticks(void *dummy) { - mtx_lock_spin(&sched_lock); + int incr; + realstathz = stathz ? stathz : hz; sched_slice = (realstathz/10); /* ~100ms */ @@ -1082,20 +1279,63 @@ sched_initticks(void *dummy) * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ - tickincr = (hz << SCHED_TICK_SHIFT) / realstathz; + incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ - if (tickincr == 0) - tickincr = 1; + if (incr == 0) + incr = 1; + tickincr = incr; #ifdef SMP affinity = SCHED_AFFINITY_DEFAULT; #endif - mtx_unlock_spin(&sched_lock); } +/* + * This is the core of the interactivity algorithm. Determines a score based + * on past behavior. It is the ratio of sleep time to run time scaled to + * a [0, 100] integer. This is the voluntary sleep time of a process, which + * differs from the cpu usage because it does not account for time spent + * waiting on a run-queue. Would be prettier if we had floating point. + */ +static int +sched_interact_score(struct thread *td) +{ + struct td_sched *ts; + int div; + + ts = td->td_sched; + /* + * The score is only needed if this is likely to be an interactive + * task. Don't go through the expense of computing it if there's + * no chance. + */ + if (sched_interact <= SCHED_INTERACT_HALF && + ts->ts_runtime >= ts->ts_slptime) + return (SCHED_INTERACT_HALF); + + if (ts->ts_runtime > ts->ts_slptime) { + div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); + return (SCHED_INTERACT_HALF + + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); + } + if (ts->ts_slptime > ts->ts_runtime) { + div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); + return (ts->ts_runtime / div); + } + /* runtime == slptime */ + if (ts->ts_runtime) + return (SCHED_INTERACT_HALF); + + /* + * This can happen if slptime and runtime are 0. + */ + return (0); + +} + /* * Scale the scheduling priority according to the "interactivity" of this * process. @@ -1113,7 +1353,7 @@ sched_priority(struct thread *td) * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * - * Scores greater than this are placed on the normal realtime queue + * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. 
*/ @@ -1130,22 +1370,12 @@ sched_priority(struct thread *td) if (td->td_sched->ts_ticks) pri += SCHED_PRI_TICKS(td->td_sched); pri += SCHED_PRI_NICE(td->td_proc->p_nice); - if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) { - static int once = 1; - if (once) { - printf("sched_priority: invalid priority %d", - pri); - printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n", - td->td_proc->p_nice, - td->td_sched->ts_ticks, - td->td_sched->ts_ftick, - td->td_sched->ts_ltick, - SCHED_PRI_TICKS(td->td_sched)); - once = 0; - } - pri = min(max(pri, PRI_MIN_TIMESHARE), - PRI_MAX_TIMESHARE); - } + KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE, + ("sched_priority: invalid priority %d: nice %d, " + "ticks %d ftick %d ltick %d tick pri %d", + pri, td->td_proc->p_nice, td->td_sched->ts_ticks, + td->td_sched->ts_ftick, td->td_sched->ts_ltick, + SCHED_PRI_TICKS(td->td_sched))); } sched_user_prio(td, pri); @@ -1154,7 +1384,8 @@ sched_priority(struct thread *td) /* * This routine enforces a maximum limit on the amount of scheduling history - * kept. It is called after either the slptime or runtime is adjusted. + * kept. It is called after either the slptime or runtime is adjusted. This + * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) @@ -1163,7 +1394,7 @@ sched_interact_update(struct thread *td) u_int sum; ts = td->td_sched; - sum = ts->skg_runtime + ts->skg_slptime; + sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* @@ -1172,12 +1403,12 @@ sched_interact_update(struct thread *td) * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { - if (ts->skg_runtime > ts->skg_slptime) { - ts->skg_runtime = SCHED_SLP_RUN_MAX; - ts->skg_slptime = 1; + if (ts->ts_runtime > ts->ts_slptime) { + ts->ts_runtime = SCHED_SLP_RUN_MAX; + ts->ts_slptime = 1; } else { - ts->skg_slptime = SCHED_SLP_RUN_MAX; - ts->skg_runtime = 1; + ts->ts_slptime = SCHED_SLP_RUN_MAX; + ts->ts_runtime = 1; } return; } @@ -1187,55 +1418,36 @@ sched_interact_update(struct thread *td) * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { - ts->skg_runtime /= 2; - ts->skg_slptime /= 2; + ts->ts_runtime /= 2; + ts->ts_slptime /= 2; return; } - ts->skg_runtime = (ts->skg_runtime / 5) * 4; - ts->skg_slptime = (ts->skg_slptime / 5) * 4; + ts->ts_runtime = (ts->ts_runtime / 5) * 4; + ts->ts_slptime = (ts->ts_slptime / 5) * 4; } +/* + * Scale back the interactivity history when a child thread is created. The + * history is inherited from the parent but the thread may behave totally + * differently. For example, a shell spawning a compiler process. We want + * to learn that the compiler is behaving badly very quickly. 
+ */ static void sched_interact_fork(struct thread *td) { int ratio; int sum; - sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime; + sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; - td->td_sched->skg_runtime /= ratio; - td->td_sched->skg_slptime /= ratio; + td->td_sched->ts_runtime /= ratio; + td->td_sched->ts_slptime /= ratio; } } -static int -sched_interact_score(struct thread *td) -{ - int div; - - if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) { - div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF); - return (SCHED_INTERACT_HALF + - (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div))); - } - if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) { - div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF); - return (td->td_sched->skg_runtime / div); - } - /* runtime == slptime */ - if (td->td_sched->skg_runtime) - return (SCHED_INTERACT_HALF); - - /* - * This can happen if slptime and runtime are 0. - */ - return (0); - -} - /* - * Called from proc0_init() to bootstrap the scheduler. + * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) @@ -1246,7 +1458,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; - thread0.td_lock = &sched_lock; + thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); td_sched0.ts_ltick = ticks; td_sched0.ts_ftick = ticks; td_sched0.ts_thread = &thread0; @@ -1265,6 +1477,12 @@ sched_rr_interval(void) return (hz/(realstathz/sched_slice)); } +/* + * Update the percent cpu tracking information when it is requested or + * the total history exceeds the maximum. We keep a sliding history of + * tick counts that slowly decays. This is less precise than the 4BSD + * mechanism since it happens with less regular and frequent events. + */ static void sched_pctcpu_update(struct td_sched *ts) { @@ -1286,6 +1504,11 @@ sched_pctcpu_update(struct td_sched *ts) ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG; } +/* + * Adjust the priority of a thread. Move it to the appropriate run-queue + * if necessary. This is the back-end for several priority related + * functions. + */ static void sched_thread_priority(struct thread *td, u_char prio) { @@ -1306,12 +1529,19 @@ sched_thread_priority(struct thread *td, u_char prio) * queue. This could be optimized to not re-add in some * cases. */ - MPASS(td->td_lock == &sched_lock); sched_rem(td); td->td_priority = prio; - sched_add(td, SRQ_BORROWING|SRQ_OURSELF); - } else + sched_add(td, SRQ_BORROWING); + } else { +#ifdef SMP + struct tdq *tdq; + + tdq = TDQ_CPU(ts->ts_cpu); + if (prio < tdq->tdq_lowpri) + tdq->tdq_lowpri = prio; +#endif td->td_priority = prio; + } } /* @@ -1351,6 +1581,9 @@ sched_unlend_prio(struct thread *td, u_char prio) sched_lend_prio(td, prio); } +/* + * Standard entry for setting the priority to an absolute value. + */ void sched_prio(struct thread *td, u_char prio) { @@ -1378,6 +1611,9 @@ sched_prio(struct thread *td, u_char prio) turnstile_adjust(td, oldprio); } +/* + * Set the base user priority, does not effect current running priority. + */ void sched_user_prio(struct thread *td, u_char prio) { @@ -1420,79 +1656,147 @@ sched_unlend_user_prio(struct thread *td, u_char prio) sched_lend_user_prio(td, prio); } +/* + * Block a thread for switching. Similar to thread_block() but does not + * bump the spin count. 
+ */ +static inline struct mtx * +thread_block_switch(struct thread *td) +{ + struct mtx *lock; + + THREAD_LOCK_ASSERT(td, MA_OWNED); + lock = td->td_lock; + td->td_lock = &blocked_lock; + mtx_unlock_spin(lock); + + return (lock); +} + +/* + * Release a thread that was blocked with thread_block_switch(). + */ +static inline void +thread_unblock_switch(struct thread *td, struct mtx *mtx) +{ + atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, + (uintptr_t)mtx); +} + +/* + * Switch threads. This function has to handle threads coming in while + * blocked for some reason, running, or idle. It also must deal with + * migrating a thread from one queue to another as running threads may + * be assigned elsewhere via binding. + */ void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct tdq *tdq; struct td_sched *ts; - int preempt; + struct mtx *mtx; + int cpuid; THREAD_LOCK_ASSERT(td, MA_OWNED); - preempt = flags & SW_PREEMPT; - tdq = TDQ_SELF(); + cpuid = PCPU_GET(cpuid); + tdq = TDQ_CPU(cpuid); ts = td->td_sched; + mtx = TDQ_LOCKPTR(tdq); +#ifdef SMP + ts->ts_rltick = ticks; + if (newtd && newtd->td_priority < tdq->tdq_lowpri) + tdq->tdq_lowpri = newtd->td_priority; +#endif td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; td->td_owepreempt = 0; /* - * If the thread has been assigned it may be in the process of switching - * to the new cpu. This is the case in sched_bind(). + * The lock pointer in an idle thread should never change. Reset it + * to CAN_RUN as well. */ - /* - * Switch to the sched lock to fix things up and pick - * a new thread. - */ - if (td->td_lock != &sched_lock) { - mtx_lock_spin(&sched_lock); - thread_unlock(td); - } if (TD_IS_IDLETHREAD(td)) { - MPASS(td->td_lock == &sched_lock); + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { - /* - * Don't allow the thread to migrate - * from a preemption. - */ + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + /* Remove our load so the selection algorithm is not biased. */ tdq_load_rem(tdq, ts); - if (preempt) - sched_pin_td(td); - sched_add(td, preempt ? + sched_add(td, (flags & SW_PREEMPT) ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING); - if (preempt) - sched_unpin_td(td); - } else - tdq_load_rem(tdq, ts); - mtx_assert(&sched_lock, MA_OWNED); - if (newtd != NULL) { /* - * If we bring in a thread account for it as if it had been - * added to the run queue and then chosen. + * When migrating we return from sched_add with an extra + * spinlock nesting, the tdq locked, and a blocked thread. + * This is to optimize out an extra block/unblock cycle here. */ - TD_SET_RUNNING(newtd); - tdq_load_add(TDQ_SELF(), newtd->td_sched); - } else - newtd = choosethread(); + if (ts->ts_cpu != cpuid) { + mtx = TDQ_LOCKPTR(TDQ_CPU(ts->ts_cpu)); + mtx_unlock_spin(mtx); + TDQ_LOCK(tdq); + spinlock_exit(); + } + } else { + /* This thread must be going to sleep. */ + TDQ_LOCK(tdq); + mtx = thread_block_switch(td); + tdq_load_rem(tdq, ts); + } + /* + * We enter here with the thread blocked and assigned to the + * appropriate cpu run-queue or sleep-queue and with the current + * thread-queue locked. + */ + TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); + /* + * If KSE assigned a new thread just add it here and pick the best one. + */ + if (newtd != NULL) { + /* XXX This is bogus. What if the thread is locked elsewhere? 
*/ + td->td_lock = TDQ_LOCKPTR(tdq); + td->td_sched->ts_cpu = cpuid; + tdq_add(tdq, td, SRQ_YIELDING); + } + newtd = choosethread(); + /* + * Call the MD code to switch contexts if necessary. + */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - - cpu_switch(td, newtd, td->td_lock); + cpu_switch(td, newtd, mtx); + /* + * We may return from cpu_switch on a different cpu. However, + * we always return with td_lock pointing to the current cpu's + * run queue lock. + */ + cpuid = PCPU_GET(cpuid); + tdq = TDQ_CPU(cpuid); + TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td; #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif - } - sched_lock.mtx_lock = (uintptr_t)td; - td->td_oncpu = PCPU_GET(cpuid); - MPASS(td->td_lock == &sched_lock); + } else + thread_unblock_switch(td, mtx); + /* + * Assert that all went well and return. + */ +#ifdef SMP + /* We should always get here with the lowest priority td possible */ + tdq->tdq_lowpri = td->td_priority; +#endif + TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); + MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); + td->td_oncpu = cpuid; } +/* + * Adjust thread priorities as a result of a nice request. + */ void sched_nice(struct proc *p, int nice) { @@ -1510,20 +1814,27 @@ sched_nice(struct proc *p, int nice) } } +/* + * Record the sleep time for the interactivity scorer. + */ void sched_sleep(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); - td->td_sched->ts_slptime = ticks; + td->td_sched->ts_slptick = ticks; } +/* + * Schedule a thread to resume execution and record how long it voluntarily + * slept. We also update the pctcpu, interactivity, and priority. + */ void sched_wakeup(struct thread *td) { struct td_sched *ts; - int slptime; + int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; @@ -1531,13 +1842,13 @@ sched_wakeup(struct thread *td) * If we slept for more than a tick update our interactivity and * priority. */ - slptime = ts->ts_slptime; - ts->ts_slptime = 0; - if (slptime && slptime != ticks) { + slptick = ts->ts_slptick; + ts->ts_slptick = 0; + if (slptick && slptick != ticks) { u_int hzticks; - hzticks = (ticks - slptime) << SCHED_TICK_SHIFT; - ts->skg_slptime += hzticks; + hzticks = (ticks - slptick) << SCHED_TICK_SHIFT; + ts->ts_slptime += hzticks; sched_interact_update(td); sched_pctcpu_update(ts); sched_priority(td); @@ -1561,11 +1872,14 @@ sched_fork(struct thread *td, struct thread *child) */ sched_interact_fork(child); sched_priority(child); - td->td_sched->skg_runtime += tickincr; + td->td_sched->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } +/* + * Fork a new thread, may be within the same process. + */ void sched_fork_thread(struct thread *td, struct thread *child) { @@ -1577,7 +1891,7 @@ sched_fork_thread(struct thread *td, struct thread *child) */ THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); - child->td_lock = &sched_lock; + child->td_lock = TDQ_LOCKPTR(TDQ_SELF()); ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -1593,11 +1907,14 @@ sched_fork_thread(struct thread *td, struct thread *child) /* * And update interactivity score. */ - ts2->skg_slptime = ts->skg_slptime; - ts2->skg_runtime = ts->skg_runtime; + ts2->ts_slptime = ts->ts_slptime; + ts2->ts_runtime = ts->ts_runtime; ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */ } +/* + * Adjust the priority class of a thread. 
+ */ void sched_class(struct thread *td, int class) { @@ -1646,6 +1963,12 @@ sched_exit(struct proc *p, struct thread *child) sched_exit_thread(td, child); } +/* + * Penalize another thread for the time spent on this one. This helps to + * worsen the priority and interactivity of processes which schedule batch + * jobs such as make. This has little effect on the make process itself but + * causes new processes spawned by it to receive worse scores immediately. + */ void sched_exit_thread(struct thread *td, struct thread *child) { @@ -1653,9 +1976,6 @@ sched_exit_thread(struct thread *td, struct thread *child) CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); - thread_lock(child); - tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched); - thread_unlock(child); #ifdef KSE /* * KSE forks and exits so often that this penalty causes short-lived @@ -1671,12 +1991,16 @@ sched_exit_thread(struct thread *td, struct thread *child) * launch expensive things to mark their children as expensive. */ thread_lock(td); - td->td_sched->skg_runtime += child->td_sched->skg_runtime; + td->td_sched->ts_runtime += child->td_sched->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } +/* + * Fix priorities on return to user-space. Priorities may be elevated due + * to static priorities in msleep() or similar. + */ void sched_userret(struct thread *td) { @@ -1699,16 +2023,17 @@ sched_userret(struct thread *td) } } +/* + * Handle a stathz tick. This is really only relevant for timeshare + * threads. + */ void sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); -#ifdef SMP - sched_smp_tick(td); -#endif + THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); /* * Advance the insert index once for each tick to ensure that all @@ -1729,7 +2054,7 @@ sched_clock(struct thread *td) * We used a tick; charge it to the thread so that we can compute our * interactivity. */ - td->td_sched->skg_runtime += tickincr; + td->td_sched->ts_runtime += tickincr; sched_interact_update(td); /* * We used up one time slice. @@ -1743,6 +2068,31 @@ sched_clock(struct thread *td) td->td_flags |= TDF_NEEDRESCHED; } +/* + * Called once per hz tick. Used for cpu utilization information. This + * is easier than trying to scale based on stathz. + */ +void +sched_tick(void) +{ + struct td_sched *ts; + + ts = curthread->td_sched; + /* Adjust ticks for pctcpu */ + ts->ts_ticks += 1 << SCHED_TICK_SHIFT; + ts->ts_ltick = ticks; + /* + * Update if we've exceeded our desired tick threshhold by over one + * second. + */ + if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick) + sched_pctcpu_update(ts); +} + +/* + * Return whether the current CPU has runnable tasks. Used for in-kernel + * cooperative idle threads. + */ int sched_runnable(void) { @@ -1752,10 +2102,6 @@ sched_runnable(void) load = 1; tdq = TDQ_SELF(); -#ifdef SMP - if (tdq_busy) - goto out; -#endif if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; @@ -1767,36 +2113,48 @@ out: return (load); } +/* + * Choose the highest priority thread to run. The thread is removed from + * the run-queue while running however the load remains. For SMP we set + * the tdq in the global idle bitmask if it idles here. 
+ */ struct thread * sched_choose(void) { - struct tdq *tdq; - struct td_sched *ts; - - mtx_assert(&sched_lock, MA_OWNED); - tdq = TDQ_SELF(); #ifdef SMP -restart: + struct tdq_group *tdg; #endif + struct td_sched *ts; + struct tdq *tdq; + + tdq = TDQ_SELF(); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); ts = tdq_choose(tdq); if (ts) { -#ifdef SMP - if (ts->ts_thread->td_priority > PRI_MIN_IDLE) - if (tdq_idled(tdq) == 0) - goto restart; -#endif tdq_runq_rem(tdq, ts); return (ts->ts_thread); } #ifdef SMP - if (tdq_idled(tdq) == 0) - goto restart; + /* + * We only set the idled bit when all of the cpus in the group are + * idle. Otherwise we could get into a situation where a thread bounces + * back and forth between two idle cores on seperate physical CPUs. + */ + tdg = tdq->tdq_group; + tdg->tdg_idlemask |= PCPU_GET(cpumask); + if (tdg->tdg_idlemask == tdg->tdg_cpumask) + atomic_set_int(&tdq_idle, tdg->tdg_mask); + tdq->tdq_lowpri = PRI_MAX_IDLE; #endif return (PCPU_GET(idlethread)); } -static int -sched_preempt(struct thread *td) +/* + * Set owepreempt if necessary. Preemption never happens directly in ULE, + * we always request it once we exit a critical section. + */ +static inline void +sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; @@ -1805,108 +2163,57 @@ sched_preempt(struct thread *td) ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; + if (td->td_priority < ctd->td_priority) + curthread->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) - return (0); + return; /* * Always preempt IDLE threads. Otherwise only if the preempting * thread is an ithread. */ - if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) - return (0); - if (ctd->td_critnest > 1) { - CTR1(KTR_PROC, "sched_preempt: in critical section %d", - ctd->td_critnest); - ctd->td_owepreempt = 1; - return (0); - } - /* - * Thread is runnable but not yet put on system run queue. - */ - MPASS(TD_ON_RUNQ(td)); - TD_SET_RUNNING(td); - MPASS(ctd->td_lock == &sched_lock); - MPASS(td->td_lock == &sched_lock); - CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, - td->td_proc->p_pid, td->td_proc->p_comm); - /* - * We enter the switch with two runnable threads that both have - * the same lock. When we return td may be sleeping so we need - * to switch locks to make sure he's locked correctly. - */ - SCHED_STAT_INC(switch_preempt); - mi_switch(SW_INVOL|SW_PREEMPT, td); - spinlock_enter(); - thread_unlock(ctd); - thread_lock(td); - spinlock_exit(); - - return (1); + if (pri > preempt_thresh && cpri < PRI_MIN_IDLE) + return; + ctd->td_owepreempt = 1; + return; } +/* + * Add a thread to a thread queue. Initializes priority, slice, runq, and + * add it to the appropriate queue. This is the internal function called + * when the tdq is predetermined. 
+ */ void -sched_add(struct thread *td, int flags) +tdq_add(struct tdq *tdq, struct thread *td, int flags) { - struct tdq *tdq; struct td_sched *ts; - int preemptive; int class; #ifdef SMP - int cpuid; int cpumask; #endif - ts = td->td_sched; - THREAD_LOCK_ASSERT(td, MA_OWNED); - CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, curthread, - curthread->td_proc->p_comm); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); - /* - * Now that the thread is moving to the run-queue, set the lock - * to the scheduler's lock. - */ - if (td->td_lock != &sched_lock) { - mtx_lock_spin(&sched_lock); - thread_lock_set(td, &sched_lock); - } - mtx_assert(&sched_lock, MA_OWNED); - TD_SET_RUNQ(td); - tdq = TDQ_SELF(); + + ts = td->td_sched; class = PRI_BASE(td->td_pri_class); - preemptive = !(flags & SRQ_YIELDING); - /* - * Recalculate the priority before we select the target cpu or - * run-queue. - */ - if (class == PRI_TIMESHARE) - sched_priority(td); + TD_SET_RUNQ(td); if (ts->ts_slice == 0) ts->ts_slice = sched_slice; -#ifdef SMP - cpuid = PCPU_GET(cpuid); /* - * Pick the destination cpu and if it isn't ours transfer to the - * target cpu. + * Pick the run queue based on priority. */ - if (THREAD_CAN_MIGRATE(td)) { - if (td->td_priority <= PRI_MAX_ITHD) { - CTR2(KTR_ULE, "ithd %d < %d", - td->td_priority, PRI_MAX_ITHD); - ts->ts_cpu = cpuid; - } else if (pick_pri) - ts->ts_cpu = tdq_pickpri(tdq, ts, flags); - else - ts->ts_cpu = tdq_pickidle(tdq, ts); - } else - CTR1(KTR_ULE, "pinned %d", td->td_pinned); - if (ts->ts_cpu != cpuid) - preemptive = 0; - tdq = TDQ_CPU(ts->ts_cpu); + if (td->td_priority <= PRI_MAX_REALTIME) + ts->ts_runq = &tdq->tdq_realtime; + else if (td->td_priority <= PRI_MAX_TIMESHARE) + ts->ts_runq = &tdq->tdq_timeshare; + else + ts->ts_runq = &tdq->tdq_idle; +#ifdef SMP cpumask = 1 << ts->ts_cpu; /* * If we had been idle, clear our bit in the group and potentially @@ -1926,30 +2233,74 @@ sched_add(struct thread *td, int flags) */ tdq->tdq_group->tdg_idlemask &= ~cpumask; } + if (td->td_priority < tdq->tdq_lowpri) + tdq->tdq_lowpri = td->td_priority; #endif - /* - * Pick the run queue based on priority. - */ - if (td->td_priority <= PRI_MAX_REALTIME) - ts->ts_runq = &tdq->tdq_realtime; - else if (td->td_priority <= PRI_MAX_TIMESHARE) - ts->ts_runq = &tdq->tdq_timeshare; - else - ts->ts_runq = &tdq->tdq_idle; - if (preemptive && sched_preempt(td)) - return; tdq_runq_add(tdq, ts, flags); tdq_load_add(tdq, ts); +} + +/* + * Select the target thread queue and add a thread to it. Request + * preemption or IPI a remote processor if required. + */ +void +sched_add(struct thread *td, int flags) +{ + struct td_sched *ts; + struct tdq *tdq; #ifdef SMP - if (ts->ts_cpu != cpuid) { + int cpuid; + int cpu; +#endif + CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", + td, td->td_proc->p_comm, td->td_priority, curthread, + curthread->td_proc->p_comm); + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + /* + * Recalculate the priority before we select the target cpu or + * run-queue. + */ + if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) + sched_priority(td); +#ifdef SMP + cpuid = PCPU_GET(cpuid); + /* + * Pick the destination cpu and if it isn't ours transfer to the + * target cpu. 
+/*
+ * Select the target thread queue and add a thread to it.  Request
+ * preemption or IPI a remote processor if required.
+ */
+void
+sched_add(struct thread *td, int flags)
+{
+	struct td_sched *ts;
+	struct tdq *tdq;
 #ifdef SMP
-	if (ts->ts_cpu != cpuid) {
+	int cpuid;
+	int cpu;
+#endif
+	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
+	    td, td->td_proc->p_comm, td->td_priority, curthread,
+	    curthread->td_proc->p_comm);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
+	/*
+	 * Recalculate the priority before we select the target cpu or
+	 * run-queue.
+	 */
+	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+		sched_priority(td);
+#ifdef SMP
+	cpuid = PCPU_GET(cpuid);
+	/*
+	 * Pick the destination cpu and if it isn't ours transfer to the
+	 * target cpu.
+	 */
+	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td))
+		cpu = cpuid;
+	else if (!THREAD_CAN_MIGRATE(td))
+		cpu = ts->ts_cpu;
+	else
+		cpu = sched_pickcpu(ts, flags);
+	tdq = sched_setcpu(ts, cpu, flags);
+	tdq_add(tdq, td, flags);
+	if (cpu != cpuid) {
 		tdq_notify(ts);
 		return;
 	}
+#else
+	tdq = TDQ_SELF();
+	TDQ_LOCK(tdq);
+	/*
+	 * Now that the thread is moving to the run-queue, set the lock
+	 * to the scheduler's lock.
+	 */
+	thread_lock_set(td, TDQ_LOCKPTR(tdq));
+	tdq_add(tdq, td, flags);
 #endif
-	if (td->td_priority < curthread->td_priority)
-		curthread->td_flags |= TDF_NEEDRESCHED;
+	if (!(flags & SRQ_YIELDING))
+		sched_setpreempt(td);
 }
 
+/*
+ * Remove a thread from a run-queue without running it.  This is used
+ * when we're stealing a thread from a remote queue.  Otherwise all threads
+ * exit by calling sched_exit_thread() and sched_throw() themselves.
+ */
 void
 sched_rem(struct thread *td)
 {
@@ -1959,17 +2310,20 @@ sched_rem(struct thread *td)
 	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
 	    curthread->td_proc->p_comm);
-	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
+	tdq = TDQ_CPU(ts->ts_cpu);
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 	KASSERT(TD_ON_RUNQ(td),
 	    ("sched_rem: thread not on run queue"));
-
-	tdq = TDQ_CPU(ts->ts_cpu);
 	tdq_runq_rem(tdq, ts);
 	tdq_load_rem(tdq, ts);
 	TD_SET_CAN_RUN(td);
 }
 
+/*
+ * Fetch cpu utilization information.  Updates on demand.
+ */
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
@@ -1996,6 +2350,9 @@ sched_pctcpu(struct thread *td)
 	return (pctcpu);
 }
 
+/*
+ * Bind a thread to a target cpu.
+ */
 void
 sched_bind(struct thread *td, int cpu)
 {
@@ -2016,6 +2373,9 @@ sched_bind(struct thread *td, int cpu)
 #endif
 }
 
+/*
+ * Release a bound thread.
+ */
 void
 sched_unbind(struct thread *td)
 {
@@ -2038,6 +2398,9 @@
 	return (td->td_sched->ts_flags & TSF_BOUND);
 }
 
+/*
+ * Basic yield call.
+ */
 void
 sched_relinquish(struct thread *td)
 {
@@ -2049,6 +2412,9 @@
 	thread_unlock(td);
 }
 
+/*
+ * Return the total system load.
+ */
 int
 sched_load(void)
 {
@@ -2077,38 +2443,27 @@ sched_sizeof_thread(void)
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
 
-void
-sched_tick(void)
-{
-	struct td_sched *ts;
-
-	ts = curthread->td_sched;
-	/* Adjust ticks for pctcpu */
-	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
-	ts->ts_ltick = ticks;
-	/*
-	 * Update if we've exceeded our desired tick threshhold by over one
-	 * second.
-	 */
-	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
-		sched_pctcpu_update(ts);
-}
-
 /*
  * The actual idle process.
  */
 void
 sched_idletd(void *dummy)
 {
-	struct proc *p;
 	struct thread *td;
+	struct tdq *tdq;
 
 	td = curthread;
-	p = td->td_proc;
+	tdq = TDQ_SELF();
 	mtx_assert(&Giant, MA_NOTOWNED);
-	/* ULE Relies on preemption for idle interruption. */
-	for (;;)
+	/* ULE relies on preemption for idle interruption. */
+	for (;;) {
+#ifdef SMP
+		if (tdq_idled(tdq))
+			cpu_idle();
+#else
 		cpu_idle();
+#endif
+	}
 }
 
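Note for reviewers (illustration only, not part of the patch): the SMP branch of sched_add() above chooses a destination cpu before queueing a thread: interrupt-priority threads stay on the current cpu, pinned or bound threads keep ts_cpu, and everything else goes through sched_pickcpu().  The sketch below models that decision; the struct, the helper and the constant are invented for illustration.

/*
 * Illustration only: the destination-cpu policy in the SMP branch of
 * sched_add().  The kernel uses PRI_MAX_ITHD, THREAD_CAN_MIGRATE() and
 * sched_pickcpu(); the names below are stand-ins.
 */
#include <stdio.h>

#define	SKETCH_MAX_ITHD	31		/* assumed interrupt-priority ceiling */

struct sketch_thread {
	int	pri;			/* td_priority */
	int	pinned;			/* non-zero when bound or pinned */
	int	last_cpu;		/* ts_cpu, the cpu we have affinity for */
};

static int
pick_cpu(const struct sketch_thread *td, int curcpu, int least_loaded)
{

	if (td->pri <= SKETCH_MAX_ITHD && !td->pinned)
		return (curcpu);	/* run ithreads where the interrupt fired */
	if (td->pinned)
		return (td->last_cpu);	/* pinned/bound threads may not move */
	return (least_loaded);		/* otherwise let the balancer choose */
}

int
main(void)
{
	struct sketch_thread ithd = { 16, 0, 3 };
	struct sketch_thread bound = { 180, 1, 3 };
	struct sketch_thread normal = { 180, 0, 3 };

	printf("ithread -> cpu %d\n", pick_cpu(&ithd, 1, 2));
	printf("bound   -> cpu %d\n", pick_cpu(&bound, 1, 2));
	printf("normal  -> cpu %d\n", pick_cpu(&normal, 1, 2));
	return (0);
}

When the chosen cpu is not the current one, the thread is handed over via sched_setcpu()/tdq_add() under the remote queue's lock, and tdq_notify() wakes or interrupts that cpu if needed.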
 /*
@@ -2117,64 +2472,77 @@ sched_idletd(void *dummy)
 void
 sched_throw(struct thread *td)
 {
-	/*
-	 * Correct spinlock nesting.  The idle thread context that we are
-	 * borrowing was created so that it would start out with a single
-	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
-	 * explicitly acquired locks in this function, the nesting count
-	 * is now 2 rather than 1.  Since we are nested, calling
-	 * spinlock_exit() will simply adjust the counts without allowing
-	 * spin lock using code to interrupt us.
-	 */
+	struct tdq *tdq;
+
+	tdq = TDQ_SELF();
 	if (td == NULL) {
-		mtx_lock_spin(&sched_lock);
+		/* Correct spinlock nesting and acquire the correct lock. */
+		TDQ_LOCK(tdq);
 		spinlock_exit();
 	} else {
-		MPASS(td->td_lock == &sched_lock);
+		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+		tdq_load_rem(tdq, td->td_sched);
 	}
-	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	PCPU_SET(switchtime, cpu_ticks());
 	PCPU_SET(switchticks, ticks);
 	cpu_throw(td, choosethread());	/* doesn't return */
 }
 
+/*
+ * This is called from fork_exit().  Just acquire the correct locks and
+ * let fork do the rest of the work.
+ */
 void
 sched_fork_exit(struct thread *td)
 {
+	struct td_sched *ts;
+	struct tdq *tdq;
+	int cpuid;
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
-	 * non-nested critical section with sched_lock held but not recursed.
+	 * non-nested critical section with the scheduler lock held.
	 */
-	td->td_oncpu = PCPU_GET(cpuid);
-	sched_lock.mtx_lock = (uintptr_t)td;
+	cpuid = PCPU_GET(cpuid);
+	tdq = TDQ_CPU(cpuid);
+	ts = td->td_sched;
+	if (TD_IS_IDLETHREAD(td))
+		td->td_lock = TDQ_LOCKPTR(tdq);
+	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+	td->td_oncpu = cpuid;
+	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td;
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 }
 
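Note for reviewers (illustration only, not part of the patch): sched_throw() and sched_fork_exit() above work because a thread's lock is now an indirection, a pointer (td_lock) that can be re-aimed at whichever thread-queue lock currently protects it.  The sketch below models that indirection with pthread mutexes; every name in it is invented for the demo and none of it is the kernel implementation.

/*
 * Illustration only: the td_lock indirection this patch relies on.  A
 * thread is protected by whatever lock its lock pointer currently names,
 * so handing a thread to another run queue is done by re-pointing it at
 * that queue's lock.  Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

struct sketch_tdq {
	pthread_mutex_t	lock;		/* stands in for tdq_lock */
	int		load;
};

struct sketch_thread {
	pthread_mutex_t	*lockp;		/* stands in for td_lock */
	int		pri;
};

static void
thread_lock(struct sketch_thread *td)
{

	pthread_mutex_lock(td->lockp);
}

static void
thread_unlock(struct sketch_thread *td)
{

	pthread_mutex_unlock(td->lockp);
}

/* Re-point the thread at a new queue's lock; caller holds both locks. */
static void
thread_lock_set(struct sketch_thread *td, struct sketch_tdq *tdq)
{

	td->lockp = &tdq->lock;
}

int
main(void)
{
	static struct sketch_tdq cpu0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	static struct sketch_tdq cpu1 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct sketch_thread td = { &cpu0.lock, 100 };

	thread_lock(&td);		/* takes cpu0's queue lock */
	pthread_mutex_lock(&cpu1.lock);	/* migration: hold the target lock too */
	thread_lock_set(&td, &cpu1);	/* thread now protected by cpu1 */
	pthread_mutex_unlock(&cpu0.lock);
	thread_unlock(&td);		/* releases cpu1's queue lock */
	printf("thread is now protected by cpu1's run-queue lock\n");
	return (0);
}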
-static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
-SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
+static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
+    "Scheduler");
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
     "Scheduler name");
-SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+    "Slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
+    "Interactivity score threshold");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
+    0, "Min priority for preemption, lower priorities have greater precedence");
 #ifdef SMP
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW,
-    &affinity, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW,
-    &tryself, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW,
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
+    "Pick the target cpu based on priority rather than load.");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_zero, CTLFLAG_RW, &pick_zero, 0,
+    "If there are no idle cpus pick cpu0");
+SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
+    "Number of hz ticks to keep thread affinity for");
+SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, tryselfidle, CTLFLAG_RW,
     &tryselfidle, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
+    "Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
+    "Steals work from another hyper-threaded core on idle");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
+    "Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
+    "True when a topology has been specified by the MD code.");
 #endif
 
 /* ps compat */
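Note for reviewers (illustration only, not part of the patch): the tunables above all live under kern.sched.  A small userland sketch that reads a few of them with sysctlbyname(3) follows; the OIDs exist only on kernels built with SCHED_ULE, and the SMP-only ones will be absent on UP kernels, so failures are reported rather than treated as fatal.

/*
 * Illustration only: read a few of the new scheduler tunables from
 * userland via sysctlbyname(3).
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

static void
show(const char *oid)
{
	int value;
	size_t len = sizeof(value);

	if (sysctlbyname(oid, &value, &len, NULL, 0) == -1)
		printf("%-30s unavailable (%s)\n", oid, strerror(errno));
	else
		printf("%-30s %d\n", oid, value);
}

int
main(void)
{

	show("kern.sched.slice");
	show("kern.sched.preempt_thresh");
	show("kern.sched.balance");
	show("kern.sched.steal_idle");
	return (0);
}

The writable knobs can also be changed at runtime with sysctl(8), for example: sysctl kern.sched.steal_idle=1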