diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 8c4eeaa8a0ea..4d5d4609202f 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -78,8 +78,9 @@ int realstathz;
 int tickincr = 1;
 
 #ifdef SMP
-/* Callout to handle load balancing SMP systems. */
+/* Callouts to handle load balancing SMP systems. */
 static struct callout kseq_lb_callout;
+static struct callout kseq_group_callout;
 #endif
 
 /*
@@ -234,6 +235,7 @@ struct kseq_group {
 	int	ksg_cpumask;		/* Mask of cpus in this group. */
 	int	ksg_idlemask;		/* Idle cpus in this group. */
 	int	ksg_mask;		/* Bit mask for first cpu. */
+	int	ksg_load;		/* Total load of this group. */
 	int	ksg_transferable;	/* Transferable load of this group. */
 	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
 };
@@ -244,10 +246,13 @@ struct kseq_group {
  */
 #ifdef SMP
 static int kseq_idle;
+static int ksg_maxid;
 static struct kseq	kseq_cpu[MAXCPU];
 static struct kseq_group kseq_groups[MAXCPU];
 #define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
 #define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
+#define	KSEQ_ID(x)	((x) - kseq_cpu)
+#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
 #else	/* !SMP */
 static struct kseq	kseq_cpu;
 #define	KSEQ_SELF()	(&kseq_cpu)
@@ -275,6 +280,8 @@ void kseq_print(int cpu);
 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
 static struct kse *runq_steal(struct runq *rq);
 static void sched_balance(void *arg);
+static void sched_balance_group(struct kseq_group *ksg);
+static void sched_balance_pair(struct kseq *high, struct kseq *low);
 static void kseq_move(struct kseq *from, int cpu);
 static int kseq_idled(struct kseq *kseq);
 static void kseq_notify(struct kse *ke, int cpu);
@@ -340,6 +347,10 @@ kseq_load_add(struct kseq *kseq, struct kse *ke)
 	if (class == PRI_TIMESHARE)
 		kseq->ksq_load_timeshare++;
 	kseq->ksq_load++;
+#ifdef SMP
+	if (class != PRI_ITHD)
+		kseq->ksq_group->ksg_load++;
+#endif
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 		CTR6(KTR_ULE,
 		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
@@ -357,6 +368,10 @@ kseq_load_rem(struct kseq *kseq, struct kse *ke)
 	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
 	if (class == PRI_TIMESHARE)
 		kseq->ksq_load_timeshare--;
+#ifdef SMP
+	if (class != PRI_ITHD)
+		kseq->ksq_group->ksg_load--;
+#endif
 	kseq->ksq_load--;
 	ke->ke_runq = NULL;
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
@@ -421,75 +436,128 @@ kseq_nice_rem(struct kseq *kseq, int nice)
 static void
 sched_balance(void *arg)
 {
-	struct kseq *kseq;
-	int transferable;
-	int high_load;
-	int low_load;
-	int high_cpu;
-	int low_cpu;
-	int move;
-	int diff;
+	struct kseq_group *high;
+	struct kseq_group *low;
+	struct kseq_group *ksg;
+	int timo;
+	int cnt;
 	int i;
 
-	high_cpu = 0;
-	low_cpu = 0;
-	high_load = 0;
-	low_load = -1;
-
 	mtx_lock_spin(&sched_lock);
 	if (smp_started == 0)
 		goto out;
-
-	for (i = 0; i <= mp_maxid; i++) {
-		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
-			continue;
-		kseq = KSEQ_CPU(i);
+	low = high = NULL;
+	i = random() % (ksg_maxid + 1);
+	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
+		ksg = KSEQ_GROUP(i);
 		/*
-		 * Find the CPU with the highest load that has some threads
-		 * to transfer.
+		 * Find the CPU with the highest load that has some
+		 * threads to transfer.
 		 */
-		if (kseq->ksq_load > high_load &&
-		    kseq->ksq_group->ksg_transferable) {
-			high_load = kseq->ksq_load;
-			high_cpu = i;
-		}
-		if (low_load == -1 || kseq->ksq_load < low_load) {
-			low_load = kseq->ksq_load;
-			low_cpu = i;
-		}
+		if ((high == NULL || ksg->ksg_load > high->ksg_load)
+		    && ksg->ksg_transferable)
+			high = ksg;
+		if (low == NULL || ksg->ksg_load < low->ksg_load)
+			low = ksg;
+		if (++i > ksg_maxid)
+			i = 0;
 	}
-	kseq = KSEQ_CPU(high_cpu);
-	/*
-	 * Nothing to do.
-	 */
-	if (low_load >= high_load)
-		goto out;
+	if (low != NULL && high != NULL && high != low)
+		sched_balance_pair(LIST_FIRST(&high->ksg_members),
+		    LIST_FIRST(&low->ksg_members));
+out:
+	mtx_unlock_spin(&sched_lock);
+	timo = random() % (hz * 2);
+	callout_reset(&kseq_lb_callout, timo, sched_balance, NULL);
+}
+
+static void
+sched_balance_groups(void *arg)
+{
+	int timo;
+	int i;
+
+	mtx_lock_spin(&sched_lock);
+	if (smp_started)
+		for (i = 0; i <= ksg_maxid; i++)
+			sched_balance_group(KSEQ_GROUP(i));
+	mtx_unlock_spin(&sched_lock);
+	timo = random() % (hz * 2);
+	callout_reset(&kseq_group_callout, timo, sched_balance_groups, NULL);
+}
+
+static void
+sched_balance_group(struct kseq_group *ksg)
+{
+	struct kseq *kseq;
+	struct kseq *high;
+	struct kseq *low;
+	int load;
+
+	if (ksg->ksg_transferable == 0)
+		return;
+	low = NULL;
+	high = NULL;
+	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
+		load = kseq->ksq_load;
+		if (kseq == KSEQ_CPU(0))
+			load--;
+		if (high == NULL || load > high->ksq_load)
+			high = kseq;
+		if (low == NULL || load < low->ksq_load)
+			low = kseq;
+	}
+	if (high != NULL && low != NULL && high != low)
+		sched_balance_pair(high, low);
+}
+
+static void
+sched_balance_pair(struct kseq *high, struct kseq *low)
+{
+	int transferable;
+	int high_load;
+	int low_load;
+	int move;
+	int diff;
+	int i;
+
 	/*
 	 * If we're transfering within a group we have to use this specific
 	 * kseq's transferable count, otherwise we can steal from other members
 	 * of the group.
 	 */
-	if (kseq->ksq_group == KSEQ_CPU(low_cpu)->ksq_group)
-		transferable = kseq->ksq_transferable;
-	else
-		transferable = kseq->ksq_group->ksg_transferable;
+	if (high->ksq_group == low->ksq_group) {
+		transferable = high->ksq_transferable;
+		high_load = high->ksq_load;
+		low_load = low->ksq_load;
+		/*
+		 * XXX If we encounter cpu 0 we must remember to reduce its
+		 * load by 1 to reflect the swi that is running the callout.
+		 * At some point we should really fix load balancing of the
+		 * swi and then this won't matter.
+		 */
+		if (high == KSEQ_CPU(0))
+			high_load--;
+		if (low == KSEQ_CPU(0))
+			low_load--;
+	} else {
+		transferable = high->ksq_group->ksg_transferable;
+		high_load = high->ksq_group->ksg_load;
+		low_load = low->ksq_group->ksg_load;
+	}
 	if (transferable == 0)
-		goto out;
+		return;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
 	 * kses we actually have to give up (transferable).
 	 */
-	diff = kseq->ksq_load - low_load;
+	diff = high_load - low_load;
 	move = diff / 2;
 	if (diff & 0x1)
 		move++;
 	move = min(move, transferable);
 	for (i = 0; i < move; i++)
-		kseq_move(kseq, low_cpu);
-out:
-	mtx_unlock_spin(&sched_lock);
-	callout_reset(&kseq_lb_callout, hz, sched_balance, NULL);
-
+		kseq_move(high, KSEQ_ID(low));
 	return;
 }
 
@@ -763,6 +831,7 @@ static void
 sched_setup(void *dummy)
 {
 #ifdef SMP
+	int balance_groups;
 	int i;
 #endif
 
@@ -770,6 +839,7 @@ sched_setup(void *dummy)
 	slice_max = (hz/7);	/* ~140ms */
 
 #ifdef SMP
+	balance_groups = 0;
 	/*
 	 * Initialize the kseqs.
 	 */
@@ -795,6 +865,7 @@ sched_setup(void *dummy)
 			ksg->ksg_cpus = 1;
 			ksg->ksg_idlemask = 0;
 			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
+			ksg->ksg_load = 0;
 			ksg->ksg_transferable = 0;
 			LIST_INIT(&ksg->ksg_members);
 			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
@@ -811,6 +882,7 @@ sched_setup(void *dummy)
 			 * Initialize the group.
 			 */
 			ksg->ksg_idlemask = 0;
+			ksg->ksg_load = 0;
 			ksg->ksg_transferable = 0;
 			ksg->ksg_cpus = cg->cg_count;
 			ksg->ksg_cpumask = cg->cg_mask;
@@ -828,10 +900,21 @@ sched_setup(void *dummy)
 				    &kseq_cpu[j], ksq_siblings);
 				}
 			}
+			if (ksg->ksg_cpus > 1)
+				balance_groups = 1;
 		}
+		ksg_maxid = smp_topology->ct_count - 1;
 	}
 	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
+	callout_init(&kseq_group_callout, CALLOUT_MPSAFE);
 	sched_balance(NULL);
+	/*
+	 * Stagger the group and global load balancer so they do not
+	 * interfere with each other.
+	 */
+	if (balance_groups)
+		callout_reset(&kseq_group_callout, hz / 2,
+		    sched_balance_groups, NULL);
 #else
 	kseq_setup(KSEQ_SELF());
 #endif
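
Note on the balancing arithmetic (not part of the patch itself): sched_balance_pair() moves half of the load difference between the busy and idle queue, rounded up when the difference is odd, and never more than the transferable count. The following is a minimal standalone sketch of just that calculation; balance_move_count() and the example values are illustrative only and are not kernel symbols.

#include <stdio.h>

/*
 * Standalone illustration of the move count computed in sched_balance_pair():
 * transfer half of the load imbalance, rounding odd differences up, capped by
 * how many threads are actually transferable.
 */
static int
balance_move_count(int high_load, int low_load, int transferable)
{
	int diff, move;

	if (transferable == 0)
		return (0);
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)		/* round odd differences up */
		move++;
	if (move > transferable)
		move = transferable;
	return (move);
}

int
main(void)
{
	/* Loads of 7 and 2 with 4 transferable threads: diff 5, so move 3. */
	printf("%d\n", balance_move_count(7, 2, 4));
	return (0);
}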