iflib: Improve mapping of TX/RX queues to CPUs

iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache.  The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs.  When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs.  See the comment on get_cpuid_for_queue() for the
entire matrix.

The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset       (existing)
dev.<device>.<unit>.iflib.separate_txrx     (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)

The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
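
A quick way to verify the resulting bindings from userland is to read the
new per-queue "cpu" sysctl with sysctlbyname(3).  A minimal sketch, assuming
a hypothetical "ix" device at unit 0 (substitute the actual driver name,
unit, and queue number):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int cpu;
        size_t len = sizeof(cpu);

        /* "dev.ix.0" and "txq0" are placeholders for the device and queue. */
        if (sysctlbyname("dev.ix.0.iflib.txq0.cpu", &cpu, &len, NULL, 0) == 0)
                printf("txq0 is bound to CPU %d\n", cpu);
        else
                perror("sysctlbyname");
        return (0);
}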

When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.

Reviewed by:	kbowling
Tested by:	olivier, pkelsey
MFC after:	3 weeks
Differential Revision:	https://reviews.freebsd.org/D24094

Author: Patrick Kelsey, 2021-04-26 00:25:59 -04:00
Commit: ca7005f189 (parent 4b84b4cca4)
2 changed files with 294 additions and 161 deletions

@@ -195,6 +195,8 @@ struct iflib_ctx {
uint16_t ifc_sysctl_core_offset;
#define CORE_OFFSET_UNSPECIFIED 0xffff
uint8_t ifc_sysctl_separate_txrx;
uint8_t ifc_sysctl_use_logical_cores;
bool ifc_cpus_are_physical_cores;
qidx_t ifc_sysctl_ntxds[8];
qidx_t ifc_sysctl_nrxds[8];
@@ -725,7 +727,7 @@ struct cpu_offset {
SLIST_ENTRY(cpu_offset) entries;
cpuset_t set;
unsigned int refcount;
uint16_t offset;
uint16_t next_cpuid;
};
static struct mtx cpu_offset_mtx;
MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
@@ -4682,41 +4684,291 @@ iflib_rem_pfil(if_ctx_t ctx)
pfil_head_unregister(pfil);
}
/*
* Advance forward by n members of the cpuset ctx->ifc_cpus starting from
* cpuid and wrapping as necessary.
*/
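/*
 * Editor's illustration (not part of the original change): with
 * ifc_cpus = {0, 2, 4, 6}, cpuid_advance(ctx, 4, 3) steps to 6, 0, 2
 * and returns 2.
 */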
static unsigned int
cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
{
unsigned int first_valid;
unsigned int last_valid;
/* cpuid should always be in the valid set */
MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
/* valid set should never be empty */
MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
n = n % CPU_COUNT(&ctx->ifc_cpus);
while (n > 0) {
do {
cpuid++;
if (cpuid > last_valid)
cpuid = first_valid;
} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
n--;
}
return (cpuid);
}
#if defined(SMP) && defined(SCHED_ULE)
extern struct cpu_group *cpu_top; /* CPU topology */
static int
find_child_with_core(int cpu, struct cpu_group *grp)
{
int i;
if (grp->cg_children == 0)
return -1;
MPASS(grp->cg_child);
for (i = 0; i < grp->cg_children; i++) {
if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
return i;
}
return -1;
}
/*
* Find an L2 neighbor of the given CPU or return -1 if none found. This
* does not distinguish among multiple L2 neighbors if the given CPU has
* more than one (it will always return the same result in that case).
*/
static int
find_l2_neighbor(int cpu)
{
struct cpu_group *grp;
int i;
grp = cpu_top;
if (grp == NULL)
return -1;
/*
* Find the smallest CPU group that contains the given core.
*/
i = 0;
while ((i = find_child_with_core(cpu, grp)) != -1) {
/*
* If the smallest group containing the given CPU has less
* than two members, we conclude the given CPU has no
* L2 neighbor.
*/
if (grp->cg_child[i].cg_count <= 1)
return (-1);
grp = &grp->cg_child[i];
}
/* Must share L2. */
if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
return -1;
/*
* Select the first member of the set that isn't the reference
* CPU, which at this point is guaranteed to exist.
*/
for (i = 0; i < CPU_SETSIZE; i++) {
if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
return (i);
}
/* Should never be reached */
return (-1);
}
#else
static int
find_l2_neighbor(int cpu)
{
return (-1);
}
#endif
/*
* CPU mapping behaviors
* ---------------------
* 'separate txrx' refers to the separate_txrx sysctl
* 'use logical' refers to the use_logical_cores sysctl
* 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
*
*  separate     use     INTR
*    txrx     logical   CPUS   result
* ---------- --------- ------ ------------------------------------------------
*     -          -        X    RX and TX queues mapped to consecutive physical
*                              cores with RX/TX pairs on same core and excess
*                              of either following
*     -          X        X    RX and TX queues mapped to consecutive cores
*                              of any type with RX/TX pairs on same core and
*                              excess of either following
*     X          -        X    RX and TX queues mapped to consecutive physical
*                              cores; all RX then all TX
*     X          X        X    RX queues mapped to consecutive physical cores
*                              first, then TX queues mapped to L2 neighbor of
*                              the corresponding RX queue if one exists,
*                              otherwise to consecutive physical cores
*     -         n/a       -    RX and TX queues mapped to consecutive cores of
*                              any type with RX/TX pairs on same core and excess
*                              of either following
*     X         n/a       -    RX and TX queues mapped to consecutive cores of
*                              any type; all RX then all TX
*/
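/*
 * Editor's illustration (not part of the original change), assuming a
 * 4-core/8-thread topology where the context CPUs are the physical cores
 * {0, 2, 4, 6} and each core's SMT sibling (1, 3, 5, 7) is its only L2
 * neighbor: with 4 RX and 4 TX queues, separate_txrx=1 and
 * use_logical_cores=1, RX queues 0-3 map to CPUs 0, 2, 4, 6 and TX queues
 * 0-3 map to the L2 neighbors 1, 3, 5, 7.
 */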
static unsigned int
get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
bool is_tx)
{
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
unsigned int core_index;
if (ctx->ifc_sysctl_separate_txrx) {
/*
* When using separate CPUs for TX and RX, the assignment
* will always be of a consecutive CPU out of the set of
* context CPUs, except for the specific case where the
* context CPUs are physical cores, the use of logical cores
* has been enabled, the assignment is for TX, the TX qid
* corresponds to an RX qid, and the CPU assigned to the
* corresponding RX queue has an L2 neighbor.
*/
if (ctx->ifc_sysctl_use_logical_cores &&
ctx->ifc_cpus_are_physical_cores &&
is_tx && qid < scctx->isc_nrxqsets) {
int l2_neighbor;
unsigned int rx_cpuid;
rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
l2_neighbor = find_l2_neighbor(rx_cpuid);
if (l2_neighbor != -1) {
return (l2_neighbor);
}
/*
* ... else fall through to the normal
* consecutive-after-RX assignment scheme.
*
* Note that we are assuming that all RX queue CPUs
* have an L2 neighbor, or all do not. If a mixed
* scenario is possible, we will have to keep track
* separately of how many queues prior to this one
* were not able to be assigned to an L2 neighbor.
*/
}
if (is_tx)
core_index = scctx->isc_nrxqsets + qid;
else
core_index = qid;
} else {
core_index = qid;
}
return (cpuid_advance(ctx, base_cpuid, core_index));
}
static uint16_t
get_ctx_core_offset(if_ctx_t ctx)
{
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
struct cpu_offset *op;
uint16_t qc;
uint16_t ret = ctx->ifc_sysctl_core_offset;
cpuset_t assigned_cpus;
unsigned int cores_consumed;
unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
unsigned int first_valid;
unsigned int last_valid;
unsigned int i;
if (ret != CORE_OFFSET_UNSPECIFIED)
return (ret);
first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
if (ctx->ifc_sysctl_separate_txrx)
qc = scctx->isc_ntxqsets + scctx->isc_nrxqsets;
else
qc = max(scctx->isc_ntxqsets, scctx->isc_nrxqsets);
if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
/*
* Align the user-chosen base CPU ID to the next valid CPU
* for this device. If the chosen base CPU ID is smaller
* than the first valid CPU or larger than the last valid
* CPU, we assume the user does not know what the valid
* range is for this device and is thinking in terms of a
* zero-based reference frame, and so we shift the given
* value into the valid range (and wrap accordingly) so the
* intent is translated to the proper frame of reference.
* If the base CPU ID is within the valid first/last, but
* does not correspond to a valid CPU, it is advanced to the
* next valid CPU (wrapping if necessary).
*/
if (base_cpuid < first_valid || base_cpuid > last_valid) {
/* shift from zero-based to first_valid-based */
base_cpuid += first_valid;
/* wrap to range [first_valid, last_valid] */
base_cpuid = (base_cpuid - first_valid) %
(last_valid - first_valid + 1);
}
if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
/*
* base_cpuid is in [first_valid, last_valid], but
* not a member of the valid set. In this case,
* there will always be a member of the valid set
* with a CPU ID that is greater than base_cpuid,
* and we simply advance to it.
*/
while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
base_cpuid++;
}
return (base_cpuid);
}
/*
* Determine how many cores will be consumed by performing the CPU
* assignments and counting how many of the assigned CPUs correspond
* to CPUs in the set of context CPUs. This is done using the CPU
* ID first_valid as the base CPU ID, as the base CPU must be within
* the set of context CPUs.
*
* Note not all assigned CPUs will be in the set of context CPUs
* when separate CPUs are being allocated to TX and RX queues,
* assignment to logical cores has been enabled, the set of context
* CPUs contains only physical CPUs, and TX queues are mapped to L2
* neighbors of CPUs that RX queues have been mapped to - in this
* case we only want to count how many CPUs in the set of context
* CPUs have been consumed, as that determines the next CPU in that
* set to start allocating at for the next device for which
* core_offset is not set.
*/
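/*
 * Editor's illustration (not part of the original change): in the
 * 4 RX / 4 TX, separate_txrx=1, use_logical_cores=1 scenario sketched
 * above get_cpuid_for_queue(), the TX queues land on L2 neighbors outside
 * the set of context CPUs, so cores_consumed is 4 rather than 8.
 */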
CPU_ZERO(&assigned_cpus);
for (i = 0; i < scctx->isc_ntxqsets; i++)
CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
&assigned_cpus);
for (i = 0; i < scctx->isc_nrxqsets; i++)
CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
&assigned_cpus);
CPU_AND(&assigned_cpus, &ctx->ifc_cpus);
cores_consumed = CPU_COUNT(&assigned_cpus);
mtx_lock(&cpu_offset_mtx);
SLIST_FOREACH(op, &cpu_offsets, entries) {
if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
ret = op->offset;
op->offset += qc;
base_cpuid = op->next_cpuid;
op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
cores_consumed);
MPASS(op->refcount < UINT_MAX);
op->refcount++;
break;
}
}
if (ret == CORE_OFFSET_UNSPECIFIED) {
ret = 0;
if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
base_cpuid = first_valid;
op = malloc(sizeof(struct cpu_offset), M_IFLIB,
M_NOWAIT | M_ZERO);
if (op == NULL) {
device_printf(ctx->ifc_dev,
"allocation for cpu offset failed.\n");
} else {
op->offset = qc;
op->next_cpuid = cpuid_advance(ctx, base_cpuid,
cores_consumed);
op->refcount = 1;
CPU_COPY(&ctx->ifc_cpus, &op->set);
SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
@@ -4724,7 +4976,7 @@ get_ctx_core_offset(if_ctx_t ctx)
}
mtx_unlock(&cpu_offset_mtx);
return (ret);
return (base_cpuid);
}
static void
@@ -4855,7 +5107,9 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct
if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
device_printf(dev, "Unable to fetch CPU list\n");
CPU_COPY(&all_cpus, &ctx->ifc_cpus);
}
ctx->ifc_cpus_are_physical_cores = false;
} else
ctx->ifc_cpus_are_physical_cores = true;
MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
/*
@@ -5970,128 +6224,6 @@ iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
}
#ifdef SMP
static int
find_nth(if_ctx_t ctx, int qid)
{
cpuset_t cpus;
int i, cpuid, eqid, count;
CPU_COPY(&ctx->ifc_cpus, &cpus);
count = CPU_COUNT(&cpus);
eqid = qid % count;
/* clear up to the qid'th bit */
for (i = 0; i < eqid; i++) {
cpuid = CPU_FFS(&cpus);
MPASS(cpuid != 0);
CPU_CLR(cpuid-1, &cpus);
}
cpuid = CPU_FFS(&cpus);
MPASS(cpuid != 0);
return (cpuid-1);
}
#ifdef SCHED_ULE
extern struct cpu_group *cpu_top; /* CPU topology */
static int
find_child_with_core(int cpu, struct cpu_group *grp)
{
int i;
if (grp->cg_children == 0)
return -1;
MPASS(grp->cg_child);
for (i = 0; i < grp->cg_children; i++) {
if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
return i;
}
return -1;
}
/*
* Find the nth "close" core to the specified core
* "close" is defined as the deepest level that shares
* at least an L2 cache. With threads, this will be
* threads on the same core. If the shared cache is L3
* or higher, simply returns the same core.
*/
static int
find_close_core(int cpu, int core_offset)
{
struct cpu_group *grp;
int i;
int fcpu;
cpuset_t cs;
grp = cpu_top;
if (grp == NULL)
return cpu;
i = 0;
while ((i = find_child_with_core(cpu, grp)) != -1) {
/* If the child only has one cpu, don't descend */
if (grp->cg_child[i].cg_count <= 1)
break;
grp = &grp->cg_child[i];
}
/* If they don't share at least an L2 cache, use the same CPU */
if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
return cpu;
/* Now pick one */
CPU_COPY(&grp->cg_mask, &cs);
/* Add the selected CPU offset to core offset. */
for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) {
if (fcpu - 1 == cpu)
break;
CPU_CLR(fcpu - 1, &cs);
}
MPASS(fcpu);
core_offset += i;
CPU_COPY(&grp->cg_mask, &cs);
for (i = core_offset % grp->cg_count; i > 0; i--) {
MPASS(CPU_FFS(&cs));
CPU_CLR(CPU_FFS(&cs) - 1, &cs);
}
MPASS(CPU_FFS(&cs));
return CPU_FFS(&cs) - 1;
}
#else
static int
find_close_core(int cpu, int core_offset __unused)
{
return cpu;
}
#endif
static int
get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid)
{
switch (type) {
case IFLIB_INTR_TX:
/* TX queues get cores which share at least an L2 cache with the corresponding RX queue */
/* XXX handle multiple RX threads per core and more than two core per L2 group */
return qid / CPU_COUNT(&ctx->ifc_cpus) + 1;
case IFLIB_INTR_RX:
case IFLIB_INTR_RXTX:
/* RX queues get the specified core */
return qid / CPU_COUNT(&ctx->ifc_cpus);
default:
return -1;
}
}
#else
#define get_core_offset(ctx, type, qid) CPU_FIRST()
#define find_close_core(cpuid, tid) CPU_FIRST()
#define find_nth(ctx, gid) CPU_FIRST()
#endif
/* Just to avoid copy/paste */
static inline int
iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
@@ -6099,21 +6231,14 @@ iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
const char *name)
{
device_t dev;
int co, cpuid, err, tid;
unsigned int base_cpuid, cpuid;
int err;
dev = ctx->ifc_dev;
co = ctx->ifc_sysctl_core_offset;
if (ctx->ifc_sysctl_separate_txrx && type == IFLIB_INTR_TX)
co += ctx->ifc_softc_ctx.isc_nrxqsets;
cpuid = find_nth(ctx, qid + co);
tid = get_core_offset(ctx, type, qid);
if (tid < 0) {
device_printf(dev, "get_core_offset failed\n");
return (EOPNOTSUPP);
}
cpuid = find_close_core(cpuid, tid);
err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev, irq->ii_res,
name);
base_cpuid = ctx->ifc_sysctl_core_offset;
cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
irq ? irq->ii_res : NULL, name);
if (err) {
device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
return (err);
@@ -6202,8 +6327,8 @@ iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
return (0);
if (tqrid != -1) {
err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
q, name);
err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
name);
if (err)
return (err);
} else {
@@ -6216,6 +6341,7 @@ iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
void
iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
{
device_t dev;
struct grouptask *gtask;
struct taskqgroup *tqg;
gtask_fn_t *fn;
@@ -6247,14 +6373,11 @@ iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
default:
panic("unknown net intr type");
}
if (irq != NULL) {
err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
q, name);
if (err)
taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev,
irq->ii_res, name);
} else {
taskqgroup_attach(tqg, gtask, q, NULL, NULL, name);
err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
if (err) {
dev = ctx->ifc_dev;
taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
name);
}
}
@@ -6736,6 +6859,9 @@ iflib_add_device_sysctl_pre(if_ctx_t ctx)
SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
"use separate cores for TX and RX");
SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
"try to make use of logical cores for TX and RX");
/* XXX change for per-queue sizes */
SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
@@ -6780,6 +6906,9 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
queue_list = SYSCTL_CHILDREN(queue_node);
SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
CTLFLAG_RD,
&txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
#if MEMORY_LOGGING
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
CTLFLAG_RD,
@@ -6862,6 +6991,9 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
queue_list = SYSCTL_CHILDREN(queue_node);
SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
CTLFLAG_RD,
&rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
CTLFLAG_RD,

@@ -65,6 +65,7 @@
#define CPU_OR_ATOMIC(d, s) BIT_OR_ATOMIC(CPU_SETSIZE, d, s)
#define CPU_COPY_STORE_REL(f, t) BIT_COPY_STORE_REL(CPU_SETSIZE, f, t)
#define CPU_FFS(p) BIT_FFS(CPU_SETSIZE, p)
#define CPU_FLS(p) BIT_FLS(CPU_SETSIZE, p)
#define CPU_COUNT(p) ((int)BIT_COUNT(CPU_SETSIZE, p))
#define CPUSET_FSET BITSET_FSET(_NCPUWORDS)
#define CPUSET_T_INITIALIZER BITSET_T_INITIALIZER