 - Add support for CPU groups to ule.  All SMT cores on the same physical
   cpu are added to a group.
 - Don't place a cpu into the kseq_idle bitmask until all cpus in that group
   have idled.
 - Prefer idle groups over idle group members in the new kseq_transfer()
   function.  In this way we will prefer to balance load across full cores
   rather than add further load to a partial core (see the sketch after this
   list).
 - Before a cpu goes idle, check the other group members for threads.  Since
   SMT cpus may freely share threads, this is cheap.
 - SMT cores may now be individually pinned and bound to.  This contrasts
   with the old mechanism, where binding or pinning would have allowed a
   thread to run on any available cpu.
 - Remove some unnecessary logic from sched_switch().  Priority propagation
   should be properly taken care of in sched_prio() now.
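
To make the group mechanics described in the list above concrete, here is a
minimal standalone sketch of the two policies implemented by kseq_idled() and
kseq_transfer() in the diff below: a cpu is only published in the global idle
mask once every sibling in its group has idled, and extra load is pushed to a
fully idle group before a merely idle group member.  The struct and helper
names here are invented for illustration; only the mask fields correspond to
struct kseq_group, the real functions additionally gate on transferable load
counts, and they use atomic_set_int()/atomic_clear_int() where this sketch
uses plain stores.

#include <strings.h>        /* ffs() */

struct group {
        int cpumask;        /* All cpus in the group. */
        int idlemask;       /* Group members that are currently idle. */
        int mask;           /* Single bit representing the group globally. */
};

static int global_idle;     /* Stand-in for the global kseq_idle mask. */

/* A cpu only shows up in the global mask once its whole group is idle. */
static void
group_cpu_idled(struct group *g, int cpumask)
{

        g->idlemask |= cpumask;
        if (g->idlemask == g->cpumask)
                global_idle |= g->mask;
}

/* Prefer a completely idle group; otherwise fall back to an idle sibling. */
static int
group_pick_idle_cpu(struct group *g)
{
        int cpu;

        if ((cpu = ffs(global_idle)) != 0) {
                global_idle &= ~(1 << (cpu - 1));
                return (cpu - 1);
        }
        if ((cpu = ffs(g->idlemask)) != 0) {
                g->idlemask &= ~(1 << (cpu - 1));
                return (cpu - 1);
        }
        return (-1);        /* Nothing idle; keep the thread local. */
}

Publishing only the group's first-cpu bit (ksg_mask) keeps one bit per
physical package in kseq_idle, which is why the unidle path in sched_add()
below clears just that single bit once the group starts picking up load again.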
jeff 2003-12-11 03:57:10 +00:00
parent cf00356cc6
commit 7c857e9275

@@ -203,9 +203,6 @@ struct td_sched *thread0_sched = &td_sched;
/*
* kseq - per processor runqs and statistics.
*/
#define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */
struct kseq {
struct runq ksq_idle; /* Queue of IDLE threads. */
struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */
@@ -216,23 +213,42 @@ struct kseq {
short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
short ksq_nicemin; /* Least nice. */
#ifdef SMP
int ksq_load_transferable; /* kses that may be migrated. */
int ksq_idled;
int ksq_cpus; /* Count of CPUs in this kseq. */
volatile struct kse *ksq_assigned; /* assigned by another CPU. */
int ksq_transferable;
LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */
struct kseq_group *ksq_group; /* Our processor group. */
volatile struct kse *ksq_assigned; /* assigned by another CPU. */
#endif
};
#ifdef SMP
/*
* kseq groups are groups of processors which can cheaply share threads. When
* one processor in the group goes idle it will check the runqs of the other
* processors in its group prior to halting and waiting for an interrupt.
* These groups are suitable for SMT (Simultaneous Multi-Threading) and not
* NUMA.  In a NUMA environment we'd want an idle bitmap per group and a
* two-tiered load balancer.
*/
struct kseq_group {
int ksg_cpus; /* Count of CPUs in this kseq group. */
int ksg_cpumask; /* Mask of cpus in this group. */
int ksg_idlemask; /* Idle cpus in this group. */
int ksg_mask; /* Bit mask for first cpu. */
int ksg_transferable; /* Transferable load of this group. */
LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */
};
#endif
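As a quick illustration of how these fields fit together (editor's example,
not part of the diff): for a hypothetical HTT package whose two logical cpus
are 2 and 3, the sched_setup() code later in this change would leave the
group looking roughly like this.

struct kseq_group example_htt_pair = {
        .ksg_cpus = 2,                          /* Two logical cpus share one core. */
        .ksg_cpumask = (1 << 2) | (1 << 3),     /* Members: cpus 2 and 3 (0x0c). */
        .ksg_idlemask = 0,                      /* Neither member has idled yet. */
        .ksg_mask = 1 << 2,                     /* First member's bit; published in kseq_idle
                                                   once the whole group idles. */
        .ksg_transferable = 0,                  /* No migratable kses queued yet. */
};

example_htt_pair and the literal cpu numbers are invented for the example; in
the real setup path ksg_members is also initialized with LIST_INIT() and each
member kseq gets its ksq_group pointer set, as sched_setup() does below.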
/*
* One kse queue per processor.
*/
#ifdef SMP
static int kseq_idle;
static struct kseq kseq_cpu[MAXCPU];
static struct kseq *kseq_idmap[MAXCPU];
#define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)])
#define KSEQ_CPU(x) (kseq_idmap[(x)])
#else
static struct kseq_group kseq_groups[MAXCPU];
#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
#define KSEQ_CPU(x) (&kseq_cpu[(x)])
#else /* !SMP */
static struct kseq kseq_cpu;
#define KSEQ_SELF() (&kseq_cpu)
#define KSEQ_CPU(x) (&kseq_cpu)
@@ -256,13 +272,14 @@ static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void *arg);
static void kseq_move(struct kseq *from, int cpu);
static __inline void kseq_setidle(struct kseq *kseq);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
#define KSE_CAN_MIGRATE(ke, class) \
((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \
((ke)->ke_flags & KEF_BOUND) == 0)
@@ -280,7 +297,7 @@ kseq_print(int cpu)
printf("\tload: %d\n", kseq->ksq_load);
printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
printf("\tload transferable: %d\n", kseq->ksq_load_transferable);
printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
printf("\tnice counts:\n");
@@ -294,8 +311,10 @@ static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
kseq->ksq_load_transferable++;
if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
kseq->ksq_transferable++;
kseq->ksq_group->ksg_transferable++;
}
#endif
runq_add(ke->ke_runq, ke);
}
@@ -304,8 +323,10 @@ static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
kseq->ksq_load_transferable--;
if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
kseq->ksq_transferable--;
kseq->ksq_group->ksg_transferable--;
}
#endif
runq_remove(ke->ke_runq, ke);
}
@@ -401,6 +422,7 @@ static void
sched_balance(void *arg)
{
struct kseq *kseq;
int transferable;
int high_load;
int low_load;
int high_cpu;
@@ -422,8 +444,13 @@ sched_balance(void *arg)
if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
continue;
kseq = KSEQ_CPU(i);
if (kseq->ksq_load_transferable > high_load) {
high_load = kseq->ksq_load_transferable;
/*
* Find the CPU with the highest load that has some threads
* to transfer.
*/
if (kseq->ksq_load > high_load &&
kseq->ksq_group->ksg_transferable) {
high_load = kseq->ksq_load;
high_cpu = i;
}
if (low_load == -1 || kseq->ksq_load < low_load) {
@@ -435,17 +462,28 @@ sched_balance(void *arg)
/*
* Nothing to do.
*/
if (high_load == 0 || low_load >= kseq->ksq_load)
if (low_load >= high_load)
goto out;
/*
* If we're transferring within a group we have to use this specific
* kseq's transferable count, otherwise we can steal from other members
* of the group.
*/
if (kseq->ksq_group == KSEQ_CPU(low_cpu)->ksq_group)
transferable = kseq->ksq_transferable;
else
transferable = kseq->ksq_group->ksg_transferable;
if (transferable == 0)
goto out;
/*
* Determine what the imbalance is and then adjust that to how many
* kses we actually have to give up (load_transferable).
* kses we actually have to give up (transferable).
*/
diff = kseq->ksq_load - low_load;
move = diff / 2;
if (diff & 0x1)
move++;
move = min(move, high_load);
move = min(move, transferable);
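/*
 * Worked example (editor's illustration, not in the original diff): with the
 * busy cpu at ksq_load 5 and low_load 2, diff is 3 and move rounds up to 2;
 * if only one kse in that group is transferable, move is clamped to 1.
 */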
for (i = 0; i < move; i++)
kseq_move(kseq, low_cpu);
out:
@@ -458,25 +496,75 @@ sched_balance(void *arg)
static void
kseq_move(struct kseq *from, int cpu)
{
struct kseq *kseq;
struct kseq *to;
struct kse *ke;
ke = kseq_steal(from);
kseq = from;
to = KSEQ_CPU(cpu);
ke = kseq_steal(kseq, 1);
if (ke == NULL) {
struct kseq_group *ksg;
ksg = kseq->ksq_group;
LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
if (kseq == from || kseq->ksq_transferable == 0)
continue;
ke = kseq_steal(kseq, 1);
break;
}
if (ke == NULL)
panic("kseq_move: No KSEs available with a "
"transferable count of %d\n",
ksg->ksg_transferable);
}
if (kseq == to)
return;
ke->ke_state = KES_THREAD;
kseq_runq_rem(from, ke);
kseq_load_rem(from, ke);
kseq_runq_rem(kseq, ke);
kseq_load_rem(kseq, ke);
ke->ke_cpu = cpu;
kseq_notify(ke, cpu);
}
static __inline void
kseq_setidle(struct kseq *kseq)
static int
kseq_idled(struct kseq *kseq)
{
if (kseq->ksq_idled)
return;
kseq->ksq_idled = 1;
atomic_set_int(&kseq_idle, PCPU_GET(cpumask));
return;
struct kseq_group *ksg;
struct kseq *steal;
struct kse *ke;
ksg = kseq->ksq_group;
/*
* If we're in a cpu group, try and steal kses from another cpu in
* the group before idling.
*/
if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
if (steal == kseq || steal->ksq_transferable == 0)
continue;
ke = kseq_steal(steal, 0);
if (ke == NULL)
continue;
ke->ke_state = KES_THREAD;
kseq_runq_rem(steal, ke);
kseq_load_rem(steal, ke);
ke->ke_cpu = PCPU_GET(cpuid);
sched_add(ke->ke_thread);
return (0);
}
}
/*
* We only set the idled bit when all of the cpus in the group are
* idle. Otherwise we could get into a situation where a KSE bounces
* back and forth between two idle cores on separate physical CPUs.
*/
ksg->ksg_idlemask |= PCPU_GET(cpumask);
if (ksg->ksg_idlemask != ksg->ksg_cpumask)
return (1);
atomic_set_int(&kseq_idle, ksg->ksg_mask);
return (1);
}
static void
@@ -550,16 +638,69 @@ runq_steal(struct runq *rq)
}
static struct kse *
kseq_steal(struct kseq *kseq)
kseq_steal(struct kseq *kseq, int stealidle)
{
struct kse *ke;
if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
return (ke);
/*
* Steal from next first to try to get a non-interactive task that
* may not have run for a while.
*/
if ((ke = runq_steal(kseq->ksq_next)) != NULL)
return (ke);
return (runq_steal(&kseq->ksq_idle));
if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
return (ke);
if (stealidle)
return (runq_steal(&kseq->ksq_idle));
return (NULL);
}
int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
struct kseq_group *ksg;
int cpu;
cpu = 0;
ksg = kseq->ksq_group;
/*
* XXX This ksg_transferable might work better if we were checking
* against a global group load. As it is now, this prevents us from
* transferring a thread from a group that is potentially bogged down
* with non transferable load.
*/
if (ksg->ksg_transferable > ksg->ksg_cpus && kseq_idle) {
/*
* Multiple cpus could find this bit simultaneously
* but the race shouldn't be terrible.
*/
cpu = ffs(kseq_idle);
if (cpu)
atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
}
/*
* If another cpu in this group has idled, assign a thread over
* to them after checking to see if there are idled groups.
*/
if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
cpu = ffs(ksg->ksg_idlemask);
if (cpu)
ksg->ksg_idlemask &= ~(1 << (cpu - 1));
}
/*
* Now that we've found an idle CPU, migrate the thread.
*/
if (cpu) {
cpu--;
ke->ke_cpu = cpu;
ke->ke_runq = NULL;
kseq_notify(ke, cpu);
return (1);
}
return (0);
}
#endif /* SMP */
/*
@@ -616,11 +757,6 @@ kseq_setup(struct kseq *kseq)
kseq->ksq_next = &kseq->ksq_timeshare[1];
kseq->ksq_load = 0;
kseq->ksq_load_timeshare = 0;
#ifdef SMP
kseq->ksq_load_transferable = 0;
kseq->ksq_idled = 0;
kseq->ksq_assigned = NULL;
#endif
}
static void
@@ -634,31 +770,64 @@ sched_setup(void *dummy)
slice_max = (hz/7); /* ~140ms */
#ifdef SMP
/* init kseqs */
/* Create the idmap. */
#ifdef ULE_HTT_EXPERIMENTAL
/*
* Initialize the kseqs.
*/
for (i = 0; i < MAXCPU; i++) {
struct kseq *ksq;
ksq = &kseq_cpu[i];
ksq->ksq_assigned = NULL;
kseq_setup(&kseq_cpu[i]);
}
if (smp_topology == NULL) {
#else
if (1) {
#endif
struct kseq_group *ksg;
struct kseq *ksq;
for (i = 0; i < MAXCPU; i++) {
kseq_setup(&kseq_cpu[i]);
kseq_idmap[i] = &kseq_cpu[i];
kseq_cpu[i].ksq_cpus = 1;
ksq = &kseq_cpu[i];
ksg = &kseq_groups[i];
/*
* Setup a kse group with one member.
*/
ksq->ksq_transferable = 0;
ksq->ksq_group = ksg;
ksg->ksg_cpus = 1;
ksg->ksg_idlemask = 0;
ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
ksg->ksg_transferable = 0;
LIST_INIT(&ksg->ksg_members);
LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
}
} else {
struct kseq_group *ksg;
struct cpu_group *cg;
int j;
for (i = 0; i < smp_topology->ct_count; i++) {
struct cpu_group *cg;
cg = &smp_topology->ct_group[i];
kseq_setup(&kseq_cpu[i]);
for (j = 0; j < MAXCPU; j++)
if ((cg->cg_mask & (1 << j)) != 0)
kseq_idmap[j] = &kseq_cpu[i];
kseq_cpu[i].ksq_cpus = cg->cg_count;
ksg = &kseq_groups[i];
/*
* Initialize the group.
*/
ksg->ksg_idlemask = 0;
ksg->ksg_transferable = 0;
ksg->ksg_cpus = cg->cg_count;
ksg->ksg_cpumask = cg->cg_mask;
LIST_INIT(&ksg->ksg_members);
/*
* Find all of the group members and add them.
*/
for (j = 0; j < MAXCPU; j++) {
if ((cg->cg_mask & (1 << j)) != 0) {
if (ksg->ksg_mask == 0)
ksg->ksg_mask = 1 << j;
kseq_cpu[j].ksq_transferable = 0;
kseq_cpu[j].ksq_group = ksg;
LIST_INSERT_HEAD(&ksg->ksg_members,
&kseq_cpu[j], ksq_siblings);
}
}
}
}
callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
@@ -897,20 +1066,8 @@ sched_switch(struct thread *td)
if (td->td_proc->p_flag & P_SA) {
kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
setrunqueue(td);
} else {
/*
* This queue is always correct except for idle threads
* which have a higher priority due to priority
* propagation.
*/
if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) {
if (td->td_priority < PRI_MIN_IDLE)
ke->ke_runq = KSEQ_SELF()->ksq_curr;
else
ke->ke_runq = &KSEQ_SELF()->ksq_idle;
}
} else
kseq_runq_add(KSEQ_SELF(), ke);
}
} else {
if (ke->ke_runq)
kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
@@ -1078,10 +1235,14 @@ sched_class(struct ksegrp *kg, int class)
* class.
*/
if (ke->ke_state == KES_ONRUNQ) {
if (KSE_CAN_MIGRATE(ke, oclass))
kseq->ksq_load_transferable--;
if (KSE_CAN_MIGRATE(ke, nclass))
kseq->ksq_load_transferable++;
if (KSE_CAN_MIGRATE(ke, oclass)) {
kseq->ksq_transferable--;
kseq->ksq_group->ksg_transferable--;
}
if (KSE_CAN_MIGRATE(ke, nclass)) {
kseq->ksq_transferable++;
kseq->ksq_group->ksg_transferable++;
}
}
#endif
if (oclass == PRI_TIMESHARE) {
@@ -1251,6 +1412,7 @@ sched_choose(void)
mtx_assert(&sched_lock, MA_OWNED);
kseq = KSEQ_SELF();
#ifdef SMP
restart:
if (kseq->ksq_assigned)
kseq_assign(kseq);
#endif
@@ -1258,7 +1420,8 @@ sched_choose(void)
if (ke) {
#ifdef SMP
if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
kseq_setidle(kseq);
if (kseq_idled(kseq) == 0)
goto restart;
#endif
kseq_runq_rem(kseq, ke);
ke->ke_state = KES_THREAD;
@@ -1271,7 +1434,8 @@ sched_choose(void)
return (ke);
}
#ifdef SMP
kseq_setidle(kseq);
if (kseq_idled(kseq) == 0)
goto restart;
#endif
return (NULL);
}
@@ -1310,24 +1474,12 @@ sched_add(struct thread *td)
ke->ke_cpu = PCPU_GET(cpuid);
break;
case PRI_TIMESHARE:
#ifdef SMP
if (ke->ke_cpu != PCPU_GET(cpuid)) {
kseq_notify(ke, ke->ke_cpu);
return;
}
#endif
if (SCHED_CURR(kg, ke))
ke->ke_runq = kseq->ksq_curr;
else
ke->ke_runq = kseq->ksq_next;
break;
case PRI_IDLE:
#ifdef SMP
if (ke->ke_cpu != PCPU_GET(cpuid)) {
kseq_notify(ke, ke->ke_cpu);
return;
}
#endif
/*
* This is for priority prop.
*/
@@ -1342,35 +1494,33 @@ sched_add(struct thread *td)
break;
}
#ifdef SMP
if (ke->ke_cpu != PCPU_GET(cpuid)) {
kseq_notify(ke, ke->ke_cpu);
return;
}
/*
* If there are any idle processors, give them our extra load. The
* If there are any idle groups, give them our extra load. The
* threshold at which we start to reassign kses has a large impact
* on the overall performance of the system. Tuned too high and
* some CPUs may idle. Too low and there will be excess migration
* and context switches.
*/
if (kseq->ksq_load_transferable > kseq->ksq_cpus &&
KSE_CAN_MIGRATE(ke, class) && kseq_idle) {
int cpu;
/*
* Multiple cpus could find this bit simultaneously but the
* race shouldn't be terrible.
*/
cpu = ffs(kseq_idle);
if (cpu) {
cpu--;
atomic_clear_int(&kseq_idle, 1 << cpu);
ke->ke_cpu = cpu;
ke->ke_runq = NULL;
kseq_notify(ke, cpu);
if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class))
if (kseq_transfer(kseq, ke, class))
return;
}
}
if (kseq->ksq_idled &&
(class == PRI_TIMESHARE || class == PRI_REALTIME)) {
atomic_clear_int(&kseq_idle, PCPU_GET(cpumask));
kseq->ksq_idled = 0;
if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
(kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
/*
* Check to see if our group is unidling, and if so, remove it
* from the global idle mask.
*/
if (kseq->ksq_group->ksg_idlemask ==
kseq->ksq_group->ksg_cpumask)
atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
/*
* Now remove ourselves from the group specific idle mask.
*/
kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
}
#endif
if (td->td_priority < curthread->td_priority)
@@ -1448,13 +1598,10 @@ sched_bind(struct thread *td, int cpu)
mtx_assert(&sched_lock, MA_OWNED);
ke = td->td_kse;
#ifndef SMP
ke->ke_flags |= KEF_BOUND;
#else
if (PCPU_GET(cpuid) == cpu) {
ke->ke_flags |= KEF_BOUND;
#ifdef SMP
if (PCPU_GET(cpuid) == cpu)
return;
}
/* sched_rem without the runq_remove */
ke->ke_state = KES_THREAD;
ke->ke_ksegrp->kg_runq_kses--;