New x86 SMP topology detection code

Previously, the code determined a topology of processing units
(hardware threads, cores, packages) and then deduced a cache topology
using certain assumptions.  The new code builds a topology that
includes both processing units and caches using the information
provided by the hardware.

At the moment, the discovered full topology is used only to create
a scheduling topology for SCHED_ULE.
There is no KPI for other kernel uses.

Summary:
- based on APIC ID derivation rules for Intel and AMD CPUs (see the
  sketch after this list)
- can handle non-uniform topologies
- requires homogeneous APIC ID assignment (same bit widths for ID
  components)
- topology for dual-node AMD CPUs may not be optimal
- topology for latest AMD CPU models may not be optimal as the code is
  several years old
- supports only thread/package/core/cache nodes
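
To illustrate the APIC ID derivation rules mentioned in the first
summary item, here is a minimal sketch with hypothetical bit widths;
the shift values and the apic_*_id() names are invented for
illustration, while the real code derives the widths from CPUID
leaves:

/*
 * Sketch only: decompose an APIC ID into thread/core/package
 * components, assuming 1 thread bit and 3 core bits (u_int as in
 * <sys/types.h>).
 */
#define SMT_ID_SHIFT    1               /* 2 threads per core */
#define PKG_ID_SHIFT    4               /* 8 cores per package */

static u_int
apic_thread_id(u_int apic_id)
{
        return (apic_id & ((1u << SMT_ID_SHIFT) - 1));
}

static u_int
apic_core_id(u_int apic_id)
{
        return ((apic_id >> SMT_ID_SHIFT) &
            ((1u << (PKG_ID_SHIFT - SMT_ID_SHIFT)) - 1));
}

static u_int
apic_pkg_id(u_int apic_id)
{
        return (apic_id >> PKG_ID_SHIFT);
}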

Todo:
  - AMD dual-node processors
  - latest AMD processors
  - NUMA nodes
  - checking for homogeneity of the APIC ID assignment across packages
  - more flexible cache placement within topology
  - expose topology to userland, e.g., via sysctl nodes

Long term todo:
  - KPI for CPU sharing and affinity with respect to various resources
    (e.g., two logical processors may share the same FPU, etc); see
    the hypothetical sketch below
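
Purely illustrative for the last item above: no such KPI exists in
this commit and every name below is hypothetical, but the interface
might take a shape along these lines:

/* Hypothetical sketch, not part of this commit. */
typedef enum {
        CPU_RES_FPU,            /* shared floating-point unit */
        CPU_RES_L1,             /* shared L1 cache */
        CPU_RES_L2,             /* shared L2 cache */
        CPU_RES_L3              /* shared L3 cache */
} cpu_resource_t;

/* Fill 'set' with all logical CPUs that share 'res' with CPU 'id'. */
int     cpu_sharing_set(cpuid_t id, cpu_resource_t res, cpuset_t *set);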

Reviewed by:	mav
Tested by:	mav
MFC after:	1 month
Differential Revision:	https://reviews.freebsd.org/D2728
commit 4725e6bff3 (parent ae7abb26b1)
Andriy Gapon, 2016-04-04 16:09:29 +00:00
3 changed files with 806 additions and 281 deletions

sys/kern/subr_smp.c

@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/sched.h>
@@ -50,6 +51,10 @@ __FBSDID("$FreeBSD$");
#include "opt_sched.h"
#ifdef SMP
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
#endif
#ifdef SMP
volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
@@ -556,7 +561,7 @@ smp_rendezvous(void (* setup_func)(void *),
smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
static struct cpu_group group[MAXCPU];
static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
struct cpu_group *
smp_topo(void)
@@ -615,6 +620,17 @@ smp_topo(void)
return (top);
}
struct cpu_group *
smp_topo_alloc(u_int count)
{
static u_int index;
u_int curr;
curr = index;
index += count;
return (&group[curr]);
}
struct cpu_group *
smp_topo_none(void)
{
@@ -861,3 +877,233 @@ sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
return (error);
}
#ifdef SMP
void
topo_init_node(struct topo_node *node)
{
bzero(node, sizeof(*node));
TAILQ_INIT(&node->children);
}
void
topo_init_root(struct topo_node *root)
{
topo_init_node(root);
root->type = TOPO_TYPE_SYSTEM;
}
struct topo_node *
topo_add_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype)
{
struct topo_node *node;
TAILQ_FOREACH_REVERSE(node, &parent->children,
topo_children, siblings) {
if (node->hwid == hwid
&& node->type == type && node->subtype == subtype) {
return (node);
}
}
node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
topo_init_node(node);
node->parent = parent;
node->hwid = hwid;
node->type = type;
node->subtype = subtype;
TAILQ_INSERT_TAIL(&parent->children, node, siblings);
parent->nchildren++;
return (node);
}
struct topo_node *
topo_find_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype)
{
struct topo_node *node;
TAILQ_FOREACH(node, &parent->children, siblings) {
if (node->hwid == hwid
&& node->type == type && node->subtype == subtype) {
return (node);
}
}
return (NULL);
}
void
topo_promote_child(struct topo_node *child)
{
struct topo_node *next;
struct topo_node *node;
struct topo_node *parent;
parent = child->parent;
next = TAILQ_NEXT(child, siblings);
TAILQ_REMOVE(&parent->children, child, siblings);
TAILQ_INSERT_HEAD(&parent->children, child, siblings);
while (next != NULL) {
node = next;
next = TAILQ_NEXT(node, siblings);
TAILQ_REMOVE(&parent->children, node, siblings);
TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
child = node;
}
}
struct topo_node *
topo_next_node(struct topo_node *top, struct topo_node *node)
{
struct topo_node *next;
if ((next = TAILQ_FIRST(&node->children)) != NULL)
return (next);
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
while ((node = node->parent) != top)
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
return (NULL);
}
struct topo_node *
topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
{
struct topo_node *next;
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
while ((node = node->parent) != top)
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
return (NULL);
}
void
topo_set_pu_id(struct topo_node *node, cpuid_t id)
{
KASSERT(node->type == TOPO_TYPE_PU,
("topo_set_pu_id: wrong node type: %u", node->type));
KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
("topo_set_pu_id: cpuset already not empty"));
node->id = id;
CPU_SET(id, &node->cpuset);
node->cpu_count = 1;
node->subtype = 1;
while ((node = node->parent) != NULL) {
if (CPU_ISSET(id, &node->cpuset))
break;
CPU_SET(id, &node->cpuset);
node->cpu_count++;
}
}
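
Taken together, the helpers above are enough to build and annotate a
topology tree by hand.  A minimal sketch, assuming one package with
one core and two hardware threads; the demo_* names and the hardware
IDs are invented for illustration:

static struct topo_node demo_root;

static void
demo_build(void)
{
        struct topo_node *pkg, *core, *pu;

        topo_init_root(&demo_root);
        pkg = topo_add_node_by_hwid(&demo_root, 0, TOPO_TYPE_PKG, 0);
        core = topo_add_node_by_hwid(pkg, 0, TOPO_TYPE_CORE, 0);
        pu = topo_add_node_by_hwid(core, 0, TOPO_TYPE_PU, 0);
        topo_set_pu_id(pu, 0);          /* becomes logical CPU 0 */
        pu = topo_add_node_by_hwid(core, 1, TOPO_TYPE_PU, 0);
        topo_set_pu_id(pu, 1);          /* becomes logical CPU 1 */
}
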
int
topo_analyze(struct topo_node *topo_root, int all,
int *pkg_count, int *cores_per_pkg, int *thrs_per_core)
{
struct topo_node *pkg_node;
struct topo_node *core_node;
struct topo_node *pu_node;
int thrs_per_pkg;
int cpp_counter;
int tpc_counter;
int tpp_counter;
*pkg_count = 0;
*cores_per_pkg = -1;
*thrs_per_core = -1;
thrs_per_pkg = -1;
pkg_node = topo_root;
while (pkg_node != NULL) {
if (pkg_node->type != TOPO_TYPE_PKG) {
pkg_node = topo_next_node(topo_root, pkg_node);
continue;
}
if (!all && CPU_EMPTY(&pkg_node->cpuset)) {
pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
continue;
}
(*pkg_count)++;
cpp_counter = 0;
tpp_counter = 0;
core_node = pkg_node;
while (core_node != NULL) {
if (core_node->type == TOPO_TYPE_CORE) {
if (!all && CPU_EMPTY(&core_node->cpuset)) {
core_node =
topo_next_nonchild_node(pkg_node,
core_node);
continue;
}
cpp_counter++;
tpc_counter = 0;
pu_node = core_node;
while (pu_node != NULL) {
if (pu_node->type == TOPO_TYPE_PU &&
(all || !CPU_EMPTY(&pu_node->cpuset)))
tpc_counter++;
pu_node = topo_next_node(core_node,
pu_node);
}
if (*thrs_per_core == -1)
*thrs_per_core = tpc_counter;
else if (*thrs_per_core != tpc_counter)
return (0);
core_node = topo_next_nonchild_node(pkg_node,
core_node);
} else {
/* PU node directly under PKG. */
if (core_node->type == TOPO_TYPE_PU &&
(all || !CPU_EMPTY(&core_node->cpuset)))
tpp_counter++;
core_node = topo_next_node(pkg_node,
core_node);
}
}
if (*cores_per_pkg == -1)
*cores_per_pkg = cpp_counter;
else if (*cores_per_pkg != cpp_counter)
return (0);
if (thrs_per_pkg == -1)
thrs_per_pkg = tpp_counter;
else if (thrs_per_pkg != tpp_counter)
return (0);
pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
}
KASSERT(*pkg_count > 0,
("bug in topology or analysis"));
if (*cores_per_pkg == 0) {
KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0,
("bug in topology or analysis"));
*thrs_per_core = thrs_per_pkg;
}
return (1);
}
#endif /* SMP */
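
A sketch of how a consumer could summarize such a tree with
topo_analyze(); demo_root is from the earlier sketch and demo_report
is invented here.  For that tree it would report 1 package x 1 core x
2 threads:

static void
demo_report(void)
{
        int pkgs, cores, thrs;

        /* all = 1: count disabled CPUs too. */
        if (topo_analyze(&demo_root, 1, &pkgs, &cores, &thrs))
                printf("%d package(s) x %d core(s) x %d thread(s)\n",
                    pkgs, cores, thrs);
        else
                printf("non-uniform topology\n");
}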

sys/sys/smp.h

@@ -17,9 +17,52 @@
#ifndef LOCORE
#include <sys/cpuset.h>
#include <sys/queue.h>
/*
* Topology of a NUMA or HTT system.
* Types of nodes in the topological tree.
*/
typedef enum {
/* No node has this type; can be used in topo API calls. */
TOPO_TYPE_DUMMY,
/* Processing unit aka computing unit aka logical CPU. */
TOPO_TYPE_PU,
/* Physical subdivision of a package. */
TOPO_TYPE_CORE,
/* CPU L1/L2/L3 cache. */
TOPO_TYPE_CACHE,
/* Package aka chip, equivalent to socket. */
TOPO_TYPE_PKG,
/* NUMA node. */
TOPO_TYPE_NODE,
/* Other logical or physical grouping of PUs. */
/* E.g. PUs on the same die, or PUs sharing an FPU. */
TOPO_TYPE_GROUP,
/* The whole system. */
TOPO_TYPE_SYSTEM
} topo_node_type;
/* Hardware identifier of a topology component. */
typedef unsigned int hwid_t;
/* Logical CPU identifier. */
typedef int cpuid_t;
/* A node in the topology. */
struct topo_node {
struct topo_node *parent;
TAILQ_HEAD(topo_children, topo_node) children;
TAILQ_ENTRY(topo_node) siblings;
cpuset_t cpuset;
topo_node_type type;
uintptr_t subtype;
hwid_t hwid;
cpuid_t id;
int nchildren;
int cpu_count;
};
/*
* Scheduling topology of a NUMA or SMP system.
*
* The top level topology is an array of pointers to groups. Each group
* contains a bitmask of cpus in its group or subgroups. It may also
@@ -52,6 +95,8 @@ typedef struct cpu_group *cpu_group_t;
#define CG_SHARE_L2 2
#define CG_SHARE_L3 3
#define MAX_CACHE_LEVELS CG_SHARE_L3
/*
* Behavior modifiers for load balancing and affinity.
*/
@@ -60,10 +105,29 @@ typedef struct cpu_group *cpu_group_t;
#define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */
/*
* Convenience routines for building topologies.
* Convenience routines for building and traversing topologies.
*/
#ifdef SMP
void topo_init_node(struct topo_node *node);
void topo_init_root(struct topo_node *root);
struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype);
struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype);
void topo_promote_child(struct topo_node *child);
struct topo_node * topo_next_node(struct topo_node *top,
struct topo_node *node);
struct topo_node * topo_next_nonchild_node(struct topo_node *top,
struct topo_node *node);
void topo_set_pu_id(struct topo_node *node, cpuid_t id);
int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count,
int *cores_per_pkg, int *thrs_per_core);
#define TOPO_FOREACH(i, root) \
for (i = root; i != NULL; i = topo_next_node(root, i))
struct cpu_group *smp_topo(void);
struct cpu_group *smp_topo_alloc(u_int count);
struct cpu_group *smp_topo_none(void);
struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags);
struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share,

sys/x86/x86/mp_x86.c

@@ -133,19 +133,28 @@ volatile int aps_ready = 0;
* the APs.
*/
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
/* Holds pending bitmap based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];
int cpu_logical; /* logical cpus per core */
int cpu_cores; /* cores per package */
static void release_aps(void *dummy);
static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */
static int hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
static struct topo_node topo_root;
static int pkg_id_shift;
static int core_id_shift;
static int disabled_cpus;
struct cache_info {
int id_shift;
int present;
} static caches[MAX_CACHE_LEVELS];
void
mem_range_AP_init(void)
@@ -155,39 +164,6 @@ mem_range_AP_init(void)
mem_range_softc.mr_op->initAP(&mem_range_softc);
}
static void
topo_probe_amd(void)
{
int core_id_bits;
int id;
/* AMD processors do not support HTT. */
cpu_logical = 1;
if ((amd_feature2 & AMDID2_CMP) == 0) {
cpu_cores = 1;
return;
}
core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
AMDID_COREID_SIZE_SHIFT;
if (core_id_bits == 0) {
cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
return;
}
/* Fam 10h and newer should get here. */
for (id = 0; id <= MAX_APIC_ID; id++) {
/* Check logical CPU availability. */
if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
continue;
/* Check if logical CPU has the same package ID. */
if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
continue;
cpu_cores++;
}
}
/*
* Round up to the next power of two, if necessary, and then
* take log2.
@@ -200,15 +176,113 @@ mask_width(u_int x)
return (fls(x << (1 - powerof2(x))) - 1);
}
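
Worked values for mask_width(), added as an illustrative comment (the
numbers are examples, not from the patch):

/*
 * mask_width(x) rounds x up to a power of two and takes log2,
 * giving the number of APIC ID bits spanned by x items:
 *   mask_width(1) == 0
 *   mask_width(2) == 1
 *   mask_width(4) == 2
 *   mask_width(6) == 3    (6 rounds up to 8)
 */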
static int
add_deterministic_cache(int type, int level, int share_count)
{
if (type == 0)
return (0);
if (type > 3) {
printf("unexpected cache type %d\n", type);
return (1);
}
if (type == 2) /* ignore instruction cache */
return (1);
if (level == 0 || level > MAX_CACHE_LEVELS) {
printf("unexpected cache level %d\n", type);
return (1);
}
if (caches[level - 1].present) {
printf("WARNING: multiple entries for L%u data cache\n", level);
printf("%u => %u\n", caches[level - 1].id_shift,
mask_width(share_count));
}
caches[level - 1].id_shift = mask_width(share_count);
caches[level - 1].present = 1;
if (caches[level - 1].id_shift > pkg_id_shift) {
printf("WARNING: L%u data cache covers more "
"APIC IDs than a package\n", level);
printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
caches[level - 1].id_shift = pkg_id_shift;
}
if (caches[level - 1].id_shift < core_id_shift) {
printf("WARNING: L%u data cache covers less "
"APIC IDs than a core\n", level);
printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
caches[level - 1].id_shift = core_id_shift;
}
return (1);
}
static void
topo_probe_0x4(void)
topo_probe_amd(void)
{
u_int p[4];
int level;
int share_count;
int type;
int i;
/* No multi-core capability. */
if ((amd_feature2 & AMDID2_CMP) == 0)
return;
/* For families 10h and newer. */
pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
AMDID_COREID_SIZE_SHIFT;
/* For 0Fh family. */
if (pkg_id_shift == 0)
pkg_id_shift =
mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
for (i = 0; ; i++) {
cpuid_count(0x8000001d, i, p);
type = p[0] & 0x1f;
level = (p[0] >> 5) & 0x7;
share_count = 1 + ((p[0] >> 14) & 0xfff);
if (!add_deterministic_cache(type, level, share_count))
break;
}
} else {
if (cpu_exthigh >= 0x80000005) {
cpuid_count(0x80000005, 0, p);
if (((p[2] >> 24) & 0xff) != 0) {
caches[0].id_shift = 0;
caches[0].present = 1;
}
}
if (cpu_exthigh >= 0x80000006) {
cpuid_count(0x80000006, 0, p);
if (((p[2] >> 16) & 0xffff) != 0) {
caches[1].id_shift = 0;
caches[1].present = 1;
}
if (((p[3] >> 18) & 0x3fff) != 0) {
/*
* TODO: Account for dual-node processors
* where each node within a package has its own
* L3 cache.
*/
caches[2].id_shift = pkg_id_shift;
caches[2].present = 1;
}
}
}
}
static void
topo_probe_intel_0x4(void)
{
u_int p[4];
int pkg_id_bits;
int core_id_bits;
int max_cores;
int max_logical;
int id;
/* Both zero and one here mean one logical processor per package. */
max_logical = (cpu_feature & CPUID_HTT) != 0 ?
@@ -216,180 +290,432 @@ topo_probe_0x4(void)
if (max_logical <= 1)
return;
/*
* Because of uniformity assumption we examine only
* those logical processors that belong to the same
* package as BSP. Further, we count number of
* logical processors that belong to the same core
* as BSP thus deducing number of threads per core.
*/
if (cpu_high >= 0x4) {
cpuid_count(0x04, 0, p);
max_cores = ((p[0] >> 26) & 0x3f) + 1;
} else
max_cores = 1;
core_id_bits = mask_width(max_logical/max_cores);
if (core_id_bits < 0)
return;
pkg_id_bits = core_id_bits + mask_width(max_cores);
for (id = 0; id <= MAX_APIC_ID; id++) {
/* Check logical CPU availability. */
if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
continue;
/* Check if logical CPU has the same package ID. */
if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
continue;
cpu_cores++;
/* Check if logical CPU has the same package and core IDs. */
if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
cpu_logical++;
}
KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
("topo_probe_0x4 couldn't find BSP"));
cpu_cores /= cpu_logical;
hyperthreading_cpus = cpu_logical;
core_id_shift = mask_width(max_logical/max_cores);
KASSERT(core_id_shift >= 0,
("intel topo: max_cores > max_logical\n"));
pkg_id_shift = core_id_shift + mask_width(max_cores);
}
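
A worked example for the computation above, using hypothetical CPUID
values:

/*
 * Suppose max_logical = 16 and max_cores = 8, i.e. 16/8 = 2
 * threads per core.  Then:
 *   core_id_shift = mask_width(2) = 1
 *   pkg_id_shift  = 1 + mask_width(8) = 4
 * APIC ID bit 0 selects the thread, bits 1-3 the core, and
 * bits 4 and up the package.
 */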
static void
topo_probe_0xb(void)
topo_probe_intel_0xb(void)
{
u_int p[4];
int bits;
int cnt;
int i;
int logical;
int type;
int x;
int i;
/* Fall back if CPU leaf 11 doesn't really exist. */
cpuid_count(0x0b, 0, p);
if (p[1] == 0) {
topo_probe_intel_0x4();
return;
}
/* We only support three levels for now. */
for (i = 0; i < 3; i++) {
for (i = 0; ; i++) {
cpuid_count(0x0b, i, p);
/* Fall back if CPU leaf 11 doesn't really exist. */
if (i == 0 && p[1] == 0) {
topo_probe_0x4();
return;
}
bits = p[0] & 0x1f;
logical = p[1] &= 0xffff;
type = (p[2] >> 8) & 0xff;
if (type == 0 || logical == 0)
if (type == 0)
break;
/*
* Because of uniformity assumption we examine only
* those logical processors that belong to the same
* package as BSP.
*/
for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
if (!cpu_info[x].cpu_present ||
cpu_info[x].cpu_disabled)
continue;
if (x >> bits == boot_cpu_id >> bits)
cnt++;
}
/* TODO: check for duplicate (re-)assignment */
if (type == CPUID_TYPE_SMT)
cpu_logical = cnt;
core_id_shift = bits;
else if (type == CPUID_TYPE_CORE)
cpu_cores = cnt;
pkg_id_shift = bits;
else
printf("unknown CPU level type %d\n", type);
}
if (cpu_logical == 0)
cpu_logical = 1;
cpu_cores /= cpu_logical;
if (pkg_id_shift < core_id_shift) {
printf("WARNING: core covers more APIC IDs than a package\n");
core_id_shift = pkg_id_shift;
}
}
static void
topo_probe_intel_caches(void)
{
u_int p[4];
int level;
int share_count;
int type;
int i;
if (cpu_high < 0x4) {
/*
* Available cache level and sizes can be determined
* via CPUID leaf 2, but that requires a huge table of hardcoded
* values, so for now just assume L1 and L2 caches potentially
* shared only by HTT processing units, if HTT is present.
*/
caches[0].id_shift = pkg_id_shift;
caches[0].present = 1;
caches[1].id_shift = pkg_id_shift;
caches[1].present = 1;
return;
}
for (i = 0; ; i++) {
cpuid_count(0x4, i, p);
type = p[0] & 0x1f;
level = (p[0] >> 5) & 0x7;
share_count = 1 + ((p[0] >> 14) & 0xfff);
if (!add_deterministic_cache(type, level, share_count))
break;
}
}
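
A worked decode of one deterministic cache descriptor, using a
hypothetical raw EAX value:

/*
 * Suppose cpuid_count(0x4, i, p) returns p[0] = 0x4043:
 *   type        = 0x4043 & 0x1f                = 3  (unified)
 *   level       = (0x4043 >> 5) & 0x7          = 2  (L2)
 *   share_count = 1 + ((0x4043 >> 14) & 0xfff) = 2
 * add_deterministic_cache(3, 2, 2) then records
 * caches[1].id_shift = mask_width(2) = 1: this L2 cache is shared
 * by APIC IDs that differ only in bit 0.
 */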
static void
topo_probe_intel(void)
{
/*
* See Intel(R) 64 Architecture Processor
* Topology Enumeration article for details.
*
* Note that 0x1 <= cpu_high < 4 case should be
* compatible with topo_probe_intel_0x4() logic when
* CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
* or it should trigger the fallback otherwise.
*/
if (cpu_high >= 0xb)
topo_probe_intel_0xb();
else if (cpu_high >= 0x1)
topo_probe_intel_0x4();
topo_probe_intel_caches();
}
/*
* Both topology discovery code and code that consumes topology
* information assume top-down uniformity of the topology.
* That is, all physical packages must be identical and each
* core in a package must have the same number of threads.
* Topology information is queried only on BSP, on which this
* code runs and for which it can query CPUID information.
* Then topology is extrapolated on all packages using the
* uniformity assumption.
* Then topology is extrapolated on all packages using an
* assumption that APIC ID to hardware component ID mapping is
* homogeneous.
* That doesn't necessarily imply that the topology is uniform.
*/
void
topo_probe(void)
{
static int cpu_topo_probed = 0;
struct x86_topo_layer {
int type;
int subtype;
int id_shift;
} topo_layers[MAX_CACHE_LEVELS + 3];
struct topo_node *parent;
struct topo_node *node;
int layer;
int nlayers;
int node_id;
int i;
if (cpu_topo_probed)
return;
CPU_ZERO(&logical_cpus_mask);
if (mp_ncpus <= 1)
cpu_cores = cpu_logical = 1;
; /* nothing */
else if (cpu_vendor_id == CPU_VENDOR_AMD)
topo_probe_amd();
else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
/*
* See Intel(R) 64 Architecture Processor
* Topology Enumeration article for details.
*
* Note that 0x1 <= cpu_high < 4 case should be
* compatible with topo_probe_0x4() logic when
* CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
* or it should trigger the fallback otherwise.
*/
if (cpu_high >= 0xb)
topo_probe_0xb();
else if (cpu_high >= 0x1)
topo_probe_0x4();
}
else if (cpu_vendor_id == CPU_VENDOR_INTEL)
topo_probe_intel();
KASSERT(pkg_id_shift >= core_id_shift,
("bug in APIC topology discovery"));
nlayers = 0;
bzero(topo_layers, sizeof(topo_layers));
topo_layers[nlayers].type = TOPO_TYPE_PKG;
topo_layers[nlayers].id_shift = pkg_id_shift;
if (bootverbose)
printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
nlayers++;
/*
* Fallback: assume each logical CPU is in separate
* physical package. That is, no multi-core, no SMT.
* Consider all caches to be within a package/chip
* and "in front" of all sub-components like
* cores and hardware threads.
*/
if (cpu_cores == 0 || cpu_logical == 0)
cpu_cores = cpu_logical = 1;
for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
if (caches[i].present) {
KASSERT(caches[i].id_shift <= pkg_id_shift,
("bug in APIC topology discovery"));
KASSERT(caches[i].id_shift >= core_id_shift,
("bug in APIC topology discovery"));
topo_layers[nlayers].type = TOPO_TYPE_CACHE;
topo_layers[nlayers].subtype = i + 1;
topo_layers[nlayers].id_shift = caches[i].id_shift;
if (bootverbose)
printf("L%u cache ID shift: %u\n",
topo_layers[nlayers].subtype,
topo_layers[nlayers].id_shift);
nlayers++;
}
}
if (pkg_id_shift > core_id_shift) {
topo_layers[nlayers].type = TOPO_TYPE_CORE;
topo_layers[nlayers].id_shift = core_id_shift;
if (bootverbose)
printf("Core ID shift: %u\n",
topo_layers[nlayers].id_shift);
nlayers++;
}
topo_layers[nlayers].type = TOPO_TYPE_PU;
topo_layers[nlayers].id_shift = 0;
nlayers++;
topo_init_root(&topo_root);
for (i = 0; i <= MAX_APIC_ID; ++i) {
if (!cpu_info[i].cpu_present)
continue;
parent = &topo_root;
for (layer = 0; layer < nlayers; ++layer) {
node_id = i >> topo_layers[layer].id_shift;
parent = topo_add_node_by_hwid(parent, node_id,
topo_layers[layer].type,
topo_layers[layer].subtype);
}
}
parent = &topo_root;
for (layer = 0; layer < nlayers; ++layer) {
node_id = boot_cpu_id >> topo_layers[layer].id_shift;
node = topo_find_node_by_hwid(parent, node_id,
topo_layers[layer].type,
topo_layers[layer].subtype);
topo_promote_child(node);
parent = node;
}
cpu_topo_probed = 1;
}
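
To make the layer construction concrete, an illustration with
hypothetical shifts (the same ones used in the earlier worked
examples):

/*
 * With pkg_id_shift = 4, core_id_shift = 1, an L3 shared per
 * package and L1/L2 private to each core, topo_probe() would fill:
 *   topo_layers[0] = { TOPO_TYPE_PKG,   0, 4 }
 *   topo_layers[1] = { TOPO_TYPE_CACHE, 3, 4 }   L3
 *   topo_layers[2] = { TOPO_TYPE_CACHE, 2, 1 }   L2
 *   topo_layers[3] = { TOPO_TYPE_CACHE, 1, 1 }   L1
 *   topo_layers[4] = { TOPO_TYPE_CORE,  0, 1 }
 *   topo_layers[5] = { TOPO_TYPE_PU,    0, 0 }
 * Each present APIC ID is pushed through these shifts, yielding the
 * tree: system -> package -> L3 -> L2 -> L1 -> core -> PU.
 */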
/*
* Assign logical CPU IDs to local APICs.
*/
void
assign_cpu_ids(void)
{
struct topo_node *node;
u_int smt_mask;
smt_mask = (1u << core_id_shift) - 1;
/*
* Assign CPU IDs to local APIC IDs and disable any CPUs
* beyond MAXCPU. CPU 0 is always assigned to the BSP.
*/
mp_ncpus = 0;
TOPO_FOREACH(node, &topo_root) {
if (node->type != TOPO_TYPE_PU)
continue;
if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
cpu_info[node->hwid].cpu_hyperthread = 1;
if (resource_disabled("lapic", node->hwid)) {
if (node->hwid != boot_cpu_id)
cpu_info[node->hwid].cpu_disabled = 1;
else
printf("Cannot disable BSP, APIC ID = %d\n",
node->hwid);
}
if (!hyperthreading_allowed &&
cpu_info[node->hwid].cpu_hyperthread)
cpu_info[node->hwid].cpu_disabled = 1;
if (mp_ncpus >= MAXCPU)
cpu_info[node->hwid].cpu_disabled = 1;
if (cpu_info[node->hwid].cpu_disabled) {
disabled_cpus++;
continue;
}
cpu_apic_ids[mp_ncpus] = node->hwid;
apic_cpuids[node->hwid] = mp_ncpus;
topo_set_pu_id(node, mp_ncpus);
mp_ncpus++;
}
KASSERT(mp_maxid >= mp_ncpus - 1,
("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
mp_ncpus));
}
/*
* Print various information about the SMP system hardware and setup.
*/
void
cpu_mp_announce(void)
{
struct topo_node *node;
const char *hyperthread;
int pkg_count;
int cores_per_pkg;
int thrs_per_core;
printf("FreeBSD/SMP: ");
if (topo_analyze(&topo_root, 1, &pkg_count,
&cores_per_pkg, &thrs_per_core)) {
printf("%d package(s)", pkg_count);
if (cores_per_pkg > 0)
printf(" x %d core(s)", cores_per_pkg);
if (thrs_per_core > 1)
printf(" x %d hardware threads", thrs_per_core);
} else {
printf("Non-uniform topology");
}
printf("\n");
if (disabled_cpus) {
printf("FreeBSD/SMP Online: ");
if (topo_analyze(&topo_root, 0, &pkg_count,
&cores_per_pkg, &thrs_per_core)) {
printf("%d package(s)", pkg_count);
if (cores_per_pkg > 0)
printf(" x %d core(s)", cores_per_pkg);
if (thrs_per_core > 1)
printf(" x %d hardware threads", thrs_per_core);
} else {
printf("Non-uniform topology");
}
printf("\n");
}
if (!bootverbose)
return;
TOPO_FOREACH(node, &topo_root) {
switch (node->type) {
case TOPO_TYPE_PKG:
printf("Package HW ID = %u (%#x)\n",
node->hwid, node->hwid);
break;
case TOPO_TYPE_CORE:
printf("\tCore HW ID = %u (%#x)\n",
node->hwid, node->hwid);
break;
case TOPO_TYPE_PU:
if (cpu_info[node->hwid].cpu_hyperthread)
hyperthread = "/HT";
else
hyperthread = "";
if (node->subtype == 0)
printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
"(disabled)\n", hyperthread, node->hwid,
node->hwid);
else if (node->id == 0)
printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
node->hwid, node->hwid);
else
printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
node->id, hyperthread, node->hwid,
node->hwid);
break;
default:
/* ignored */
break;
}
}
}
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
struct topo_node *node;
int nchildren;
int ncores;
int i;
KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
("x86topo_add_sched_group: bad type: %u", root->type));
CPU_COPY(&root->cpuset, &cg_root->cg_mask);
cg_root->cg_count = root->cpu_count;
if (root->type == TOPO_TYPE_SYSTEM)
cg_root->cg_level = CG_SHARE_NONE;
else
cg_root->cg_level = root->subtype;
ncores = 0;
node = root;
while (node != NULL) {
if (node->type != TOPO_TYPE_CORE) {
node = topo_next_node(root, node);
continue;
}
ncores++;
node = topo_next_nonchild_node(root, node);
}
if (cg_root->cg_level != CG_SHARE_NONE &&
root->cpu_count > 1 && ncores < 2)
cg_root->cg_flags = CG_FLAG_SMT;
nchildren = 0;
node = root;
while (node != NULL) {
if (node->type != TOPO_TYPE_CACHE ||
(root->type != TOPO_TYPE_SYSTEM &&
CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
node = topo_next_node(root, node);
continue;
}
nchildren++;
node = topo_next_nonchild_node(root, node);
}
cg_root->cg_child = smp_topo_alloc(nchildren);
cg_root->cg_children = nchildren;
node = root;
i = 0;
while (node != NULL) {
if (node->type != TOPO_TYPE_CACHE ||
(root->type != TOPO_TYPE_SYSTEM &&
CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
node = topo_next_node(root, node);
continue;
}
cg_root->cg_child[i].cg_parent = cg_root;
x86topo_add_sched_group(node, &cg_root->cg_child[i]);
i++;
node = topo_next_nonchild_node(root, node);
}
}
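
Continuing the hypothetical machine from the topo_probe()
illustration, a sketch of what the recursion builds:

/*
 * The root group spans all CPUs with cg_level = CG_SHARE_NONE; its
 * children are the per-package L3 groups (cg_level = CG_SHARE_L3),
 * whose children are the per-core L2 groups (cg_level = CG_SHARE_L2).
 * Cache nodes spanning exactly the same CPUs as their parent group
 * (the L1s here) are folded into it, and a group whose CPUs all
 * belong to a single core is flagged CG_FLAG_SMT.
 */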
struct cpu_group *
cpu_topo(void)
{
int cg_flags;
struct cpu_group *cg_root;
/*
* Determine whether any threading flags are
* necessary.
*/
topo_probe();
if (cpu_logical > 1 && hyperthreading_cpus)
cg_flags = CG_FLAG_HTT;
else if (cpu_logical > 1)
cg_flags = CG_FLAG_SMT;
else
cg_flags = 0;
if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
printf("WARNING: Non-uniform processors.\n");
printf("WARNING: Using suboptimal topology.\n");
if (mp_ncpus <= 1)
return (smp_topo_none());
}
/*
* No multi-core or hyper-threaded.
*/
if (cpu_logical * cpu_cores == 1)
return (smp_topo_none());
/*
* Only HTT no multi-core.
*/
if (cpu_logical > 1 && cpu_cores == 1)
return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
/*
* Only multi-core no HTT.
*/
if (cpu_cores > 1 && cpu_logical == 1)
return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
/*
* Both HTT and multi-core.
*/
return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
CG_SHARE_L1, cpu_logical, cg_flags));
cg_root = smp_topo_alloc(1);
x86topo_add_sched_group(&topo_root, cg_root);
return (cg_root);
}
@@ -445,46 +771,8 @@ cpu_mp_probe(void)
}
/*
* Print various information about the SMP system hardware and setup.
* AP CPUs call this to initialize themselves.
*/
void
cpu_mp_announce(void)
{
const char *hyperthread;
int i;
printf("FreeBSD/SMP: %d package(s) x %d core(s)",
mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
if (hyperthreading_cpus > 1)
printf(" x %d HTT threads", cpu_logical);
else if (cpu_logical > 1)
printf(" x %d SMT threads", cpu_logical);
printf("\n");
/* List active CPUs first. */
printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
for (i = 1; i < mp_ncpus; i++) {
if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
hyperthread = "/HT";
else
hyperthread = "";
printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
cpu_apic_ids[i]);
}
/* List disabled CPUs last. */
for (i = 0; i <= MAX_APIC_ID; i++) {
if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
continue;
if (cpu_info[i].cpu_hyperthread)
hyperthread = "/HT";
else
hyperthread = "";
printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
i);
}
}
void
init_secondary_tail(void)
{
@@ -546,8 +834,7 @@ init_secondary_tail(void)
printf("SMP: AP CPU #%d Launched!\n", cpuid);
/* Determine if we are a logical CPU. */
/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
CPU_SET(cpuid, &logical_cpus_mask);
if (bootverbose)
@@ -612,85 +899,13 @@ set_interrupt_apic_ids(void)
continue;
/* Don't let hyperthreads service interrupts. */
if (cpu_logical > 1 &&
apic_id % cpu_logical != 0)
if (cpu_info[apic_id].cpu_hyperthread)
continue;
intr_add_cpu(i);
}
}
/*
* Assign logical CPU IDs to local APICs.
*/
void
assign_cpu_ids(void)
{
u_int i;
TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
&hyperthreading_allowed);
/* Check for explicitly disabled CPUs. */
for (i = 0; i <= MAX_APIC_ID; i++) {
if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
continue;
if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
cpu_info[i].cpu_hyperthread = 1;
/*
* Don't use HT CPU if it has been disabled by a
* tunable.
*/
if (hyperthreading_allowed == 0) {
cpu_info[i].cpu_disabled = 1;
continue;
}
}
/* Don't use this CPU if it has been disabled by a tunable. */
if (resource_disabled("lapic", i)) {
cpu_info[i].cpu_disabled = 1;
continue;
}
}
if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) {
hyperthreading_cpus = 0;
cpu_logical = 1;
}
/*
* Assign CPU IDs to local APIC IDs and disable any CPUs
* beyond MAXCPU. CPU 0 is always assigned to the BSP.
*
* To minimize confusion for userland, we attempt to number
* CPUs such that all threads and cores in a package are
* grouped together. For now we assume that the BSP is always
* the first thread in a package and just start adding APs
* starting with the BSP's APIC ID.
*/
mp_ncpus = 1;
cpu_apic_ids[0] = boot_cpu_id;
apic_cpuids[boot_cpu_id] = 0;
for (i = boot_cpu_id + 1; i != boot_cpu_id;
i == MAX_APIC_ID ? i = 0 : i++) {
if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
cpu_info[i].cpu_disabled)
continue;
if (mp_ncpus < MAXCPU) {
cpu_apic_ids[mp_ncpus] = i;
apic_cpuids[i] = mp_ncpus;
mp_ncpus++;
} else
cpu_info[i].cpu_disabled = 1;
}
KASSERT(mp_maxid >= mp_ncpus - 1,
("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
mp_ncpus));
}
#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];