From c9f25d8f9269572c46694dc8c5d8e77299952781 Mon Sep 17 00:00:00 2001
From: Jeff Roberson
Date: Tue, 28 Jan 2003 09:28:20 +0000
Subject: [PATCH] - Fix the ksq_load calculation.  It now reflects the number
 of entries on the run queue for each cpu.

- Introduce kse stealing into the sched_choose() code.  This helps balance
  cpus better in cases where process turnover is high.  This implementation
  is fairly trivial and will likely be only a temporary measure until
  something more sophisticated has been written.
---
 sys/kern/sched_ule.c | 161 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 134 insertions(+), 27 deletions(-)

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index eb96eb40b941..199fcb65d276 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -181,6 +181,7 @@ struct kseq kseq_cpu[MAXCPU];
 static int sched_slice(struct ksegrp *kg);
 static int sched_priority(struct ksegrp *kg);
 void sched_pctcpu_update(struct kse *ke);
+void sched_check_runqs(void);
 int sched_pickcpu(void);
 
 static void
@@ -197,8 +198,6 @@ sched_setup(void *dummy)
 		runq_init(kseq_cpu[i].ksq_curr);
 		runq_init(kseq_cpu[i].ksq_next);
 	}
-	/* CPU0 has proc0 */
-	kseq_cpu[0].ksq_load++;
 	mtx_unlock_spin(&sched_lock);
 }
 
@@ -368,7 +367,6 @@ sched_switchin(struct thread *td)
 	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
 	    td->td_priority != td->td_ksegrp->kg_user_pri)
 		curthread->td_kse->ke_flags |= KEF_NEEDRESCHED;
-
 }
 
 void
@@ -443,8 +441,14 @@ sched_fork(struct ksegrp *kg, struct ksegrp *child)
 	child->kg_slptime = kg->kg_slptime;
 	child->kg_user_pri = kg->kg_user_pri;
 
+	if (pkse->ke_oncpu != PCPU_GET(cpuid)) {
+		printf("pkse->ke_oncpu = %d\n", pkse->ke_oncpu);
+		printf("cpuid = %d", PCPU_GET(cpuid));
+		Debugger("stop");
+	}
+
 	ckse->ke_slice = pkse->ke_slice;
-	ckse->ke_oncpu = sched_pickcpu();
+	ckse->ke_oncpu = pkse->ke_oncpu; /* sched_pickcpu(); */
 	ckse->ke_runq = NULL;
 	/*
 	 * Claim that we've been running for one second for statistical
@@ -475,7 +479,6 @@ sched_exit(struct ksegrp *kg, struct ksegrp *child)
 	 */
 	ke = FIRST_KSE_IN_KSEGRP(kg);
 	kseq = &kseq_cpu[ke->ke_oncpu];
-	kseq->ksq_load--;
 }
 
 int sched_clock_switches;
@@ -484,7 +487,9 @@ void
 sched_clock(struct thread *td)
 {
 	struct kse *ke;
+#if 0
 	struct kse *nke;
+#endif
 	struct ksegrp *kg;
 	struct kseq *kseq;
 	int cpu;
@@ -497,8 +502,6 @@ sched_clock(struct thread *td)
 	ke = td->td_kse;
 	kg = td->td_ksegrp;
 
-	nke = runq_choose(kseq->ksq_curr);
-
 	if (td->td_kse->ke_flags & KEF_IDLEKSE) {
 #if 0
 		if (nke && nke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
@@ -509,11 +512,15 @@ sched_clock(struct thread *td)
 #endif
 		return;
 	}
+#if 0
+	nke = runq_choose(kseq->ksq_curr);
+
 	if (nke && nke->ke_thread &&
 	    nke->ke_thread->td_priority < td->td_priority) {
 		sched_clock_switches++;
 		ke->ke_flags |= KEF_NEEDRESCHED;
 	}
+#endif
 
 	/*
 	 * We used a tick, decrease our total sleep time.  This decreases our
@@ -538,6 +545,7 @@ sched_clock(struct thread *td)
 		ke->ke_flags |= KEF_NEEDRESCHED;
 		ke->ke_runq = NULL;
 	}
+
 	ke->ke_ticks += 10000;
 	ke->ke_ltick = ticks;
 	/* Go up to one second beyond our max and then trim back down */
@@ -545,6 +553,20 @@ sched_clock(struct thread *td)
 		sched_pctcpu_update(ke);
 }
 
+void sched_print_load(void);
+
+void
+sched_print_load(void)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < mp_maxid; cpu++) {
+		if (CPU_ABSENT(cpu))
+			continue;
+		printf("%d: %d\n", cpu, kseq_cpu[cpu].ksq_load);
+	}
+}
+
 int
 sched_runnable(void)
 {
@@ -554,9 +576,24 @@ sched_runnable(void)
 	cpu = PCPU_GET(cpuid);
 	kseq = &kseq_cpu[cpu];
 
-	if (runq_check(kseq->ksq_curr) == 0)
-		return (runq_check(kseq->ksq_next));
-	return (1);
+	if (runq_check(kseq->ksq_curr))
+		return (1);
+
+	if (runq_check(kseq->ksq_next))
+		return (1);
+#ifdef SMP
+	if (smp_started) {
+		int i;
+
+		for (i = 0; i < mp_maxid; i++) {
+			if (CPU_ABSENT(i))
+				continue;
+			if (kseq_cpu[i].ksq_load && i != cpu)
+				return (1);
+		}
+	}
+#endif
+	return (0);
 }
 
 void
@@ -573,16 +610,33 @@ sched_userret(struct thread *td)
 	}
 }
 
-struct kse *
-sched_choose(void)
+void
+sched_check_runqs(void)
 {
 	struct kseq *kseq;
+	int cpu;
+
+	for (cpu = 0; cpu < mp_maxid; cpu++) {
+		if (CPU_ABSENT(cpu))
+			continue;
+		kseq = &kseq_cpu[cpu];
+		if (kseq->ksq_load !=
+		    (runq_depth(kseq->ksq_curr) + runq_depth(kseq->ksq_next))) {
+			printf("CPU: %d\tload: %d\tcurr: %d\tnext: %d\n",
+			    cpu, kseq->ksq_load, runq_depth(kseq->ksq_curr),
+			    runq_depth(kseq->ksq_next));
+			Debugger("Imbalance");
+		}
+	}
+}
+
+struct kse * sched_choose_kseq(struct kseq *kseq);
+
+struct kse *
+sched_choose_kseq(struct kseq *kseq)
+{
 	struct kse *ke;
 	struct runq *swap;
-	int cpu;
-
-	cpu = PCPU_GET(cpuid);
-	kseq = &kseq_cpu[cpu];
 
 	if ((ke = runq_choose(kseq->ksq_curr)) == NULL) {
 		swap = kseq->ksq_curr;
@@ -590,19 +644,66 @@ sched_choose(void)
 		kseq->ksq_next = swap;
 		ke = runq_choose(kseq->ksq_curr);
 	}
+
+	return (ke);
+}
+
+struct kse *
+sched_choose(void)
+{
+	struct kse *ke;
+	int cpu;
+
+	cpu = PCPU_GET(cpuid);
+	ke = sched_choose_kseq(&kseq_cpu[cpu]);
+
 	if (ke) {
 		runq_remove(ke->ke_runq, ke);
 		ke->ke_state = KES_THREAD;
+#ifdef SMP
+		kseq_cpu[cpu].ksq_load--;
+#if 0
+		sched_check_runqs();
+#endif
+#endif
 	}
 
+#ifdef SMP
+	if (ke == NULL && smp_started) {
+		int load;
+		int me;
+		int i;
+
+		me = cpu;
+
+		/*
+		 * Find the cpu with the highest load and steal one proc.
+		 */
+		for (load = 0, i = 0; i < mp_maxid; i++) {
+			if (CPU_ABSENT(i) || i == me)
+				continue;
+			if (kseq_cpu[i].ksq_load > load) {
+				load = kseq_cpu[i].ksq_load;
+				cpu = i;
+			}
+		}
+		if (load) {
+			ke = sched_choose_kseq(&kseq_cpu[cpu]);
+			kseq_cpu[cpu].ksq_load--;
+			ke->ke_state = KES_THREAD;
+			runq_remove(ke->ke_runq, ke);
+			ke->ke_runq = NULL;
+			ke->ke_oncpu = me;
+		}
+
+	}
+#endif
 	return (ke);
 }
 
 void
 sched_add(struct kse *ke)
 {
-	struct kseq *kseq;
-	int cpu;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE"));
@@ -614,12 +715,11 @@ sched_add(struct kse *ke)
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("runq_add: process swapped out"));
 
-	/* cpu = PCPU_GET(cpuid); */
-	cpu = ke->ke_oncpu;
-	kseq = &kseq_cpu[cpu];
-	kseq->ksq_load++;
 
 	if (ke->ke_runq == NULL) {
+		struct kseq *kseq;
+
+		kseq = &kseq_cpu[ke->ke_oncpu];
 		if (SCHED_CURR(ke->ke_ksegrp))
 			ke->ke_runq = kseq->ksq_curr;
 		else
@@ -629,23 +729,30 @@ sched_add(struct kse *ke)
 	ke->ke_state = KES_ONRUNQ;
 
 	runq_add(ke->ke_runq, ke);
+#ifdef SMP
+	kseq_cpu[ke->ke_oncpu].ksq_load++;
+#if 0
+	sched_check_runqs();
+#endif
+#endif
 }
 
 void
 sched_rem(struct kse *ke)
 {
-	struct kseq *kseq;
-
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */
 
-	kseq = &kseq_cpu[ke->ke_oncpu];
-	kseq->ksq_load--;
-
 	runq_remove(ke->ke_runq, ke);
 	ke->ke_runq = NULL;
 	ke->ke_state = KES_THREAD;
 	ke->ke_ksegrp->kg_runq_kses--;
+#ifdef SMP
+	kseq_cpu[ke->ke_oncpu].ksq_load--;
+#if 0
+	sched_check_runqs();
+#endif
+#endif
}
 
 fixpt_t
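The heart of the patch is the invariant that sched_check_runqs() enforces: each cpu's ksq_load must equal the combined depth of its "curr" and "next" run queues, so every enqueue, dequeue, and steal has to adjust the counter exactly once. For readers who want to poke at that bookkeeping outside the kernel, here is a minimal userland sketch, assuming simplified LIFO queues; the task/queue types and the cpu_add()/cpu_choose()/cpu_check() names are invented for illustration and leave out priorities, locking, and the kernel's runq API.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	struct task *next;
	int id;
};

struct queue {
	struct task *head;
	int depth;
};

struct cpu_queues {
	struct queue a, b;
	struct queue *curr;	/* tasks eligible in this round */
	struct queue *next;	/* tasks collecting for the next round */
	int load;		/* must equal curr->depth + next->depth */
};

static void
q_push(struct queue *q, struct task *t)
{
	t->next = q->head;
	q->head = t;
	q->depth++;
}

static struct task *
q_pop(struct queue *q)
{
	struct task *t;

	if ((t = q->head) != NULL) {
		q->head = t->next;
		q->depth--;
	}
	return (t);
}

/*
 * Like sched_add(): enqueue and bump the load counter in one place.
 * The kernel picks ksq_curr or ksq_next based on SCHED_CURR(); we
 * always use "next" here to keep the sketch small.
 */
static void
cpu_add(struct cpu_queues *cq, struct task *t)
{
	q_push(cq->next, t);
	cq->load++;
}

/*
 * Like sched_choose_kseq() plus the removal and decrement done by
 * sched_choose(): when "curr" drains, swap it with "next" and retry.
 */
static struct task *
cpu_choose(struct cpu_queues *cq)
{
	struct task *t;

	if ((t = q_pop(cq->curr)) == NULL) {
		struct queue *swap;

		swap = cq->curr;
		cq->curr = cq->next;
		cq->next = swap;
		t = q_pop(cq->curr);
	}
	if (t != NULL)
		cq->load--;
	return (t);
}

/* Like sched_check_runqs(), for a single cpu. */
static void
cpu_check(struct cpu_queues *cq)
{
	assert(cq->load == cq->curr->depth + cq->next->depth);
}

int
main(void)
{
	struct cpu_queues cq = {{NULL, 0}, {NULL, 0}, NULL, NULL, 0};
	struct task t1 = {NULL, 1}, t2 = {NULL, 2};

	cq.curr = &cq.a;
	cq.next = &cq.b;
	cpu_add(&cq, &t1);
	cpu_add(&cq, &t2);
	cpu_check(&cq);
	printf("chose task %d\n", cpu_choose(&cq)->id);
	cpu_check(&cq);
	return (0);
}

The swap in cpu_choose() mirrors sched_choose_kseq(): once the current queue drains, the queue that has been collecting new arrivals becomes current, which is what gives the two-queue design its round-robin flavor. In the patch itself the counter updates are spread across sched_add(), sched_rem(), and both paths of sched_choose(), which is exactly why the Debugger("Imbalance") check earns its keep while the accounting is in flux.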
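The stealing pass is the new #ifdef SMP block in sched_choose(): when a cpu's own queues come up empty, it scans every other cpu's ksq_load, remembers the busiest one, and pulls a single entry across, rewriting ke_oncpu so the stolen KSE is charged to the thief. Since all of this runs under the sched_lock spin mutex, a nonzero load observed during the scan still holds when sched_choose_kseq() is called on the victim. The sketch below isolates just the victim-selection loop under stand-in names: NCPU and load[] play the role of mp_maxid and the kseq_cpu[].ksq_load counters, and pick_victim() is a made-up helper, not a kernel interface.

#include <stdio.h>

#define	NCPU	4			/* stand-in for mp_maxid */

static int load[NCPU];			/* stand-in for kseq_cpu[i].ksq_load */

/*
 * Return the cpu with the highest nonzero load, skipping ourselves,
 * or -1 when every other cpu is idle.  Mirrors the for-loop added
 * to sched_choose().
 */
static int
pick_victim(int me)
{
	int best, victim, i;

	best = 0;
	victim = -1;
	for (i = 0; i < NCPU; i++) {
		if (i == me)		/* never steal our own work */
			continue;
		if (load[i] > best) {
			best = load[i];
			victim = i;
		}
	}
	return (victim);
}

int
main(void)
{
	int me, victim;

	me = 0;
	load[1] = 2;
	load[2] = 5;
	load[3] = 1;

	victim = pick_victim(me);
	if (victim != -1) {
		/*
		 * The "steal": in the kernel this is sched_choose_kseq()
		 * on the victim plus runq_remove() and an ke_oncpu
		 * rewrite; here we just move one unit of load.
		 */
		load[victim]--;
		load[me]++;
		printf("cpu %d stole from cpu %d\n", me, victim);
	}
	return (0);
}

Taking one entry from the single busiest cpu is about the simplest policy that can work, which squares with the commit message: it ignores cache affinity entirely and steals at most one KSE per idle pass, a stopgap until something more sophisticated is written.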