From 96c85efb4b85dbc0bfc4968516a627a727ac7ec5 Mon Sep 17 00:00:00 2001
From: Nathan Whitehorn
Date: Wed, 6 Jul 2016 14:09:49 +0000
Subject: [PATCH] Replace a number of conflations of mp_ncpus and mp_maxid
 with either mp_maxid or CPU_FOREACH() as appropriate.

This fixes a number of places in the kernel that assumed CPU IDs are
dense in [0, mp_ncpus) and would try, for example, to run tasks on CPUs
that did not exist or to allocate too few buffers on systems with sparse
CPU IDs in which there are holes in the range and mp_maxid > mp_ncpus - 1.
Such circumstances generally occur on systems with SMT, but on which SMT
is disabled. This patch restores system operation at least on POWER8
systems configured in this way.

There are a number of other places in the kernel with potential problems
in these situations, but where sparse CPU IDs are not currently known to
occur, mostly in the ARM machine-dependent code. These will be fixed in a
follow-up commit after the stable/11 branch.

PR:		kern/210106
Reviewed by:	jhb
Approved by:	re (glebius)
---
 sys/amd64/include/counter.h            |  2 +-
 sys/cddl/compat/opensolaris/sys/proc.h |  4 ++--
 sys/dev/cpuctl/cpuctl.c                | 20 ++++++++++----------
 sys/i386/include/counter.h             |  6 +++---
 sys/kern/subr_pcpu.c                   |  8 ++++----
 sys/kern/subr_taskqueue.c              | 23 ++++++++++++++++++-----
 sys/net/flowtable.c                    |  2 +-
 sys/net/iflib.c                        |  8 ++++----
 sys/netinet/ip_id.c                    |  4 +++-
 sys/powerpc/include/counter.h          |  2 +-
 sys/powerpc/powerpc/mp_machdep.c       |  8 ++------
 sys/vm/uma.h                           |  2 +-
 sys/vm/uma_core.c                      |  5 +++--
 13 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/sys/amd64/include/counter.h b/sys/amd64/include/counter.h
index b571e70769a9..a79eab42fb1c 100644
--- a/sys/amd64/include/counter.h
+++ b/sys/amd64/include/counter.h
@@ -51,7 +51,7 @@ counter_u64_fetch_inline(uint64_t *p)
 	int i;
 
 	r = 0;
-	for (i = 0; i < mp_ncpus; i++)
+	CPU_FOREACH(i)
 		r += counter_u64_read_one((uint64_t *)p, i);
 
 	return (r);
diff --git a/sys/cddl/compat/opensolaris/sys/proc.h b/sys/cddl/compat/opensolaris/sys/proc.h
index 1fe2e9ab342f..5c9e8dec78c4 100644
--- a/sys/cddl/compat/opensolaris/sys/proc.h
+++ b/sys/cddl/compat/opensolaris/sys/proc.h
@@ -45,8 +45,8 @@
 #define	CPU		curcpu
 #define	minclsyspri	PRIBIO
 #define	maxclsyspri	PVM
-#define	max_ncpus	mp_ncpus
-#define	boot_max_ncpus	mp_ncpus
+#define	max_ncpus	(mp_maxid + 1)
+#define	boot_max_ncpus	(mp_maxid + 1)
 
 #define	TS_RUN	0
diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c
index 6519e1b178a6..08a37c8ad1f0 100644
--- a/sys/dev/cpuctl/cpuctl.c
+++ b/sys/dev/cpuctl/cpuctl.c
@@ -120,7 +120,7 @@ static void
 set_cpu(int cpu, struct thread *td)
 {
 
-	KASSERT(cpu >= 0 && cpu < mp_ncpus && cpu_enabled(cpu),
+	KASSERT(cpu >= 0 && cpu <= mp_maxid && cpu_enabled(cpu),
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 	thread_lock(td);
 	sched_bind(td, cpu);
@@ -133,7 +133,7 @@ static void
 restore_cpu(int oldcpu, int is_bound, struct thread *td)
 {
 
-	KASSERT(oldcpu >= 0 && oldcpu < mp_ncpus && cpu_enabled(oldcpu),
+	KASSERT(oldcpu >= 0 && oldcpu <= mp_maxid && cpu_enabled(oldcpu),
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, oldcpu));
 	thread_lock(td);
 	if (is_bound == 0)
@@ -150,7 +150,7 @@ cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
 	int ret;
 	int cpu = dev2unit(dev);
 
-	if (cpu >= mp_ncpus || !cpu_enabled(cpu)) {
+	if (cpu > mp_maxid || !cpu_enabled(cpu)) {
 		DPRINTF("[cpuctl,%d]: bad cpu number %d\n", __LINE__, cpu);
 		return (ENXIO);
 	}
@@ -201,7 +201,7 @@ cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
 	int is_bound = 0;
 	int oldcpu;
 
-	KASSERT(cpu >= 0 && cpu < mp_ncpus,
+	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 	/* Explicitly clear cpuid data to avoid returning stale info. */
@@ -245,7 +245,7 @@ cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td)
 	int oldcpu;
 	int ret;
 
-	KASSERT(cpu >= 0 && cpu < mp_ncpus,
+	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 	/*
@@ -296,7 +296,7 @@ cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td)
 	char vendor[13];
 	int ret;
 
-	KASSERT(cpu >= 0 && cpu < mp_ncpus,
+	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 	DPRINTF("[cpuctl,%d]: XXX %d", __LINE__, cpu);
 
@@ -512,7 +512,7 @@ cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
 	int cpu;
 
 	cpu = dev2unit(dev);
-	if (cpu >= mp_ncpus || !cpu_enabled(cpu)) {
+	if (cpu > mp_maxid || !cpu_enabled(cpu)) {
 		DPRINTF("[cpuctl,%d]: incorrect cpu number %d\n",
 		    __LINE__, cpu);
 		return (ENXIO);
@@ -531,15 +531,15 @@ cpuctl_modevent(module_t mod __unused, int type, void *data __unused)
 	case MOD_LOAD:
 		if (bootverbose)
 			printf("cpuctl: access to MSR registers/cpuid info.\n");
-		cpuctl_devs = malloc(sizeof(*cpuctl_devs) * mp_ncpus, M_CPUCTL,
+		cpuctl_devs = malloc(sizeof(*cpuctl_devs) * (mp_maxid + 1), M_CPUCTL,
 		    M_WAITOK | M_ZERO);
-		for (cpu = 0; cpu < mp_ncpus; cpu++)
+		CPU_FOREACH(cpu)
 			if (cpu_enabled(cpu))
 				cpuctl_devs[cpu] = make_dev(&cpuctl_cdevsw, cpu,
 				    UID_ROOT, GID_KMEM, 0640, "cpuctl%d", cpu);
 		break;
 	case MOD_UNLOAD:
-		for (cpu = 0; cpu < mp_ncpus; cpu++) {
+		CPU_FOREACH(cpu) {
 			if (cpuctl_devs[cpu] != NULL)
 				destroy_dev(cpuctl_devs[cpu]);
 		}
diff --git a/sys/i386/include/counter.h b/sys/i386/include/counter.h
index 01b09bb47bd2..86f3ec91fe18 100644
--- a/sys/i386/include/counter.h
+++ b/sys/i386/include/counter.h
@@ -98,13 +98,13 @@ counter_u64_fetch_inline(uint64_t *p)
 		 * critical section as well.
 		 */
 		critical_enter();
-		for (i = 0; i < mp_ncpus; i++) {
+		CPU_FOREACH(i) {
 			res += *(uint64_t *)((char *)p +
 			    sizeof(struct pcpu) * i);
 		}
 		critical_exit();
 	} else {
-		for (i = 0; i < mp_ncpus; i++)
+		CPU_FOREACH(i)
 			res += counter_u64_read_one_8b((uint64_t *)((char *)p +
 			    sizeof(struct pcpu) * i));
 	}
@@ -144,7 +144,7 @@
 
 	if ((cpu_feature & CPUID_CX8) == 0) {
 		critical_enter();
-		for (i = 0; i < mp_ncpus; i++)
+		CPU_FOREACH(i)
 			*(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0;
 		critical_exit();
 	} else {
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
index a01f67aba1e6..fbfa1a647471 100644
--- a/sys/kern/subr_pcpu.c
+++ b/sys/kern/subr_pcpu.c
@@ -248,7 +248,7 @@ dpcpu_copy(void *s, int size)
 	uintptr_t dpcpu;
 	int i;
 
-	for (i = 0; i < mp_ncpus; ++i) {
+	CPU_FOREACH(i) {
 		dpcpu = dpcpu_off[i];
 		if (dpcpu == 0)
 			continue;
@@ -289,7 +289,7 @@ sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS)
 	int i;
 
 	count = 0;
-	for (i = 0; i < mp_ncpus; ++i) {
+	CPU_FOREACH(i) {
 		dpcpu = dpcpu_off[i];
 		if (dpcpu == 0)
 			continue;
@@ -306,7 +306,7 @@ sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS)
 	int i;
 
 	count = 0;
-	for (i = 0; i < mp_ncpus; ++i) {
+	CPU_FOREACH(i) {
 		dpcpu = dpcpu_off[i];
 		if (dpcpu == 0)
 			continue;
@@ -323,7 +323,7 @@ sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS)
 	int i;
 
 	count = 0;
-	for (i = 0; i < mp_ncpus; ++i) {
+	CPU_FOREACH(i) {
 		dpcpu = dpcpu_off[i];
 		if (dpcpu == 0)
 			continue;
diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c
index 4be29f583ee5..12124b8aa005 100644
--- a/sys/kern/subr_taskqueue.c
+++ b/sys/kern/subr_taskqueue.c
@@ -832,6 +832,7 @@ static void
 taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx)
 {
 	struct taskqgroup_cpu *qcpu;
+	int i, j;
 
 	qcpu = &qgroup->tqg_queue[idx];
 	LIST_INIT(&qcpu->tgc_tasks);
@@ -839,7 +840,15 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx)
 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
 	taskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
 	    "%s_%d", qgroup->tqg_name, idx);
-	qcpu->tgc_cpu = idx * qgroup->tqg_stride;
+
+	for (i = CPU_FIRST(), j = 0; j < idx * qgroup->tqg_stride;
+	    j++, i = CPU_NEXT(i)) {
+		/*
+		 * Wait: evaluate the idx * qgroup->tqg_stride'th CPU,
+		 * potentially wrapping the actual count
+		 */
+	}
+	qcpu->tgc_cpu = i;
 }
 
 static void
@@ -1017,13 +1026,14 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 	LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
 	cpuset_t mask;
 	struct grouptask *gtask;
-	int i, old_cnt, qid;
+	int i, k, old_cnt, qid, cpu;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 
 	if (cnt < 1 || cnt * stride > mp_ncpus || !smp_started) {
-		printf("taskqgroup_adjust failed cnt: %d stride: %d mp_ncpus: %d smp_started: %d\n",
-			cnt, stride, mp_ncpus, smp_started);
+		printf("taskqgroup_adjust failed cnt: %d stride: %d "
+		    "mp_ncpus: %d smp_started: %d\n", cnt, stride, mp_ncpus,
+		    smp_started);
 		return (EINVAL);
 	}
 	if (qgroup->tqg_adjusting) {
@@ -1081,8 +1091,11 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 	/*
 	 * Set new CPU and IRQ affinity
 	 */
+	cpu = CPU_FIRST();
 	for (i = 0; i < cnt; i++) {
-		qgroup->tqg_queue[i].tgc_cpu = i * qgroup->tqg_stride;
+		qgroup->tqg_queue[i].tgc_cpu = cpu;
+		for (k = 0; k < qgroup->tqg_stride; k++)
+			cpu = CPU_NEXT(cpu);
 		CPU_ZERO(&mask);
 		CPU_SET(qgroup->tqg_queue[i].tgc_cpu, &mask);
 		LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list) {
diff --git a/sys/net/flowtable.c b/sys/net/flowtable.c
index b837a8e6ed30..48f7ff053a7b 100644
--- a/sys/net/flowtable.c
+++ b/sys/net/flowtable.c
@@ -746,7 +746,7 @@ flowtable_alloc(struct flowtable *ft)
 		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
 
 	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
-	for (int i = 0; i < mp_ncpus; i++) {
+	CPU_FOREACH(i) {
 		bitstr_t **b;
 
 		b = zpcpu_get_cpu(ft->ft_masks, i);
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index 955a1b261a29..67282b794da3 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -3848,7 +3848,7 @@ iflib_queues_alloc(if_ctx_t ctx)
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	iflib_fl_t fl = NULL;
-	int i, j, err, txconf, rxconf, fl_ifdi_offset;
+	int i, j, cpu, err, txconf, rxconf, fl_ifdi_offset;
 	iflib_dma_info_t ifdip;
 	uint32_t *rxqsizes = sctx->isc_rxqsizes;
 	uint32_t *txqsizes = sctx->isc_txqsizes;
@@ -3897,7 +3897,7 @@ iflib_queues_alloc(if_ctx_t ctx)
 	/*
 	 * XXX handle allocation failure
 	 */
-	for (txconf = i = 0; i < ntxqsets; i++, txconf++, txq++) {
+	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
 		    M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
@@ -3917,8 +3917,8 @@ iflib_queues_alloc(if_ctx_t ctx)
 		txq->ift_ctx = ctx;
 		txq->ift_id = i;
 		/* XXX fix this */
-		txq->ift_timer.c_cpu = i % mp_ncpus;
-		txq->ift_db_check.c_cpu = i % mp_ncpus;
+		txq->ift_timer.c_cpu = cpu;
+		txq->ift_db_check.c_cpu = cpu;
 		txq->ift_nbr = nbuf_rings;
 
 		if (iflib_txsd_alloc(txq)) {
diff --git a/sys/netinet/ip_id.c b/sys/netinet/ip_id.c
index 6a80b6e4b903..97d5851b349f 100644
--- a/sys/netinet/ip_id.c
+++ b/sys/netinet/ip_id.c
@@ -275,10 +275,12 @@ ip_fillid(struct ip *ip)
 static void
 ipid_sysinit(void)
 {
+	int i;
 
 	mtx_init(&V_ip_id_mtx, "ip_id_mtx", NULL, MTX_DEF);
 	V_ip_id = counter_u64_alloc(M_WAITOK);
-	for (int i = 0; i < mp_ncpus; i++)
+
+	CPU_FOREACH(i)
 		arc4rand(zpcpu_get_cpu(V_ip_id, i), sizeof(uint64_t), 0);
 }
 VNET_SYSINIT(ip_id, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipid_sysinit, NULL);
diff --git a/sys/powerpc/include/counter.h b/sys/powerpc/include/counter.h
index 21a05770b270..500c3768c20e 100644
--- a/sys/powerpc/include/counter.h
+++ b/sys/powerpc/include/counter.h
@@ -54,7 +54,7 @@ counter_u64_fetch_inline(uint64_t *p)
 	int i;
 
 	r = 0;
-	for (i = 0; i < mp_ncpus; i++)
+	CPU_FOREACH(i)
 		r += counter_u64_read_one((uint64_t *)p, i);
 
 	return (r);
diff --git a/sys/powerpc/powerpc/mp_machdep.c b/sys/powerpc/powerpc/mp_machdep.c
index e219683f8b86..7d2dc9513d77 100644
--- a/sys/powerpc/powerpc/mp_machdep.c
+++ b/sys/powerpc/powerpc/mp_machdep.c
@@ -113,20 +113,16 @@ cpu_mp_setmaxid(void)
 	int error;
 
 	mp_ncpus = 0;
+	mp_maxid = 0;
 	error = platform_smp_first_cpu(&cpuref);
 	while (!error) {
 		mp_ncpus++;
+		mp_maxid = max(cpuref.cr_cpuid, mp_maxid);
 		error = platform_smp_next_cpu(&cpuref);
 	}
 	/* Sanity. */
 	if (mp_ncpus == 0)
 		mp_ncpus = 1;
-
-	/*
-	 * Set the largest cpuid we're going to use. This is necessary
-	 * for VM initialization.
-	 */
-	mp_maxid = min(mp_ncpus, MAXCPU) - 1;
 }
 
 int
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index 6ac78ef5cfcf..f4c2de8a5d25 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -276,7 +276,7 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
  * mini-dumps.
  */
 #define	UMA_ZONE_PCPU	0x8000		/*
-					 * Allocates mp_ncpus slabs sized to
+					 * Allocates mp_maxid + 1 slabs sized to
 					 * sizeof(struct pcpu).
 					 */
 
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 0a56c55d9203..1ae83c4686c0 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -1227,7 +1227,7 @@ keg_small_init(uma_keg_t keg)
 	u_int shsize;
 
 	if (keg->uk_flags & UMA_ZONE_PCPU) {
-		u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
 
 		keg->uk_slabsize = sizeof(struct pcpu);
 		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
@@ -3265,9 +3265,10 @@ uma_large_free(uma_slab_t slab)
 static void
 uma_zero_item(void *item, uma_zone_t zone)
 {
+	int i;
 
 	if (zone->uz_flags & UMA_ZONE_PCPU) {
-		for (int i = 0; i < mp_ncpus; i++)
+		CPU_FOREACH(i)
 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
 	} else
 		bzero(item, zone->uz_size);
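
Below, an illustrative sketch of the core idiom this commit installs --
not code from the tree; sum_dense_buggy(), sum_sparse_correct(), and the
percpu argument are hypothetical, though CPU_FOREACH(), mp_ncpus, and
mp_maxid are the real sys/smp.h interfaces. On a machine whose online
CPUs are 0, 2 and 4 (mp_ncpus == 3, mp_maxid == 4), the dense loop reads
slots 0..2, touching the hole at ID 1 and never seeing CPU 4, while
CPU_FOREACH() visits exactly the online IDs.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>	/* mp_ncpus, mp_maxid, CPU_FOREACH() */

/* WRONG on sparse IDs: assumes CPU IDs are dense in [0, mp_ncpus). */
static uint64_t
sum_dense_buggy(uint64_t *percpu)
{
	uint64_t r;
	int i;

	r = 0;
	for (i = 0; i < mp_ncpus; i++)
		r += percpu[i];
	return (r);
}

/*
 * CPU_FOREACH() iterates 0..mp_maxid, skipping IDs absent from
 * all_cpus, so every online CPU is visited exactly once.  Any array
 * indexed by CPU ID must be sized mp_maxid + 1, not mp_ncpus --
 * hence the malloc() and UMA_ZONE_PCPU changes above.
 */
static uint64_t
sum_sparse_correct(uint64_t *percpu)
{
	uint64_t r;
	int i;

	r = 0;
	CPU_FOREACH(i)
		r += percpu[i];
	return (r);
}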
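
The subr_taskqueue.c and iflib.c hunks rely on a second idiom: stepping
to the Nth online CPU with CPU_FIRST()/CPU_NEXT() instead of computing
"idx * stride" directly, since that product is a position in the online
set, not a CPU ID, once the ID space has holes. A minimal sketch, with
the helper name nth_online_cpu() being hypothetical:

#include <sys/param.h>
#include <sys/smp.h>	/* CPU_FIRST(), CPU_NEXT() */

/*
 * Return the CPU ID that lies n online CPUs past CPU_FIRST().
 * CPU_NEXT() skips holes in the ID space and wraps past the highest
 * online ID, so n may exceed mp_ncpus without ever yielding the ID
 * of a nonexistent CPU.
 */
static int
nth_online_cpu(int n)
{
	int cpu, j;

	cpu = CPU_FIRST();
	for (j = 0; j < n; j++)
		cpu = CPU_NEXT(cpu);
	return (cpu);
}

With stride 1 on the 0/2/4 machine above, queue 1 lands on
nth_online_cpu(1) == CPU 2, where the old arithmetic would have bound
it to the nonexistent CPU 1.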