Simplify taskqgroup inititialization.

taskqgroup initialization was broken into two steps:

1. allocate the taskqgroup structure, at SI_SUB_TASKQ;
2. initialize taskqueues, start taskqueue threads, enqueue "binder"
   tasks to bind threads to specific CPUs, at SI_SUB_SMP.

Step 2 tries to handle the case where tasks have already been attached
to a queue, by migrating them to their intended queue.  In particular,
tasks can't be enqueued before step 2 has completed.  This breaks NFS
mountroot on systems using an iflib-based driver when EARLY_AP_STARTUP
is not defined, since mountroot happens before SI_SUB_SMP in this case.

Simplify initialization: do all initialization except for CPU binding at
SI_SUB_TASKQ.  This means that until CPU binding is completed, group
tasks may be executed on a CPU other than that to which they were bound,
but this should not be a problem for existing users of the taskqgroup
KPIs.

Reported by:	sbruno
Tested by:	bdragon, sbruno
MFC after:	1 month
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D24188
This commit is contained in:
markj 2020-03-30 14:22:52 +00:00
parent db8ecc4f03
commit 307559af97
3 changed files with 40 additions and 298 deletions

View File

@ -170,7 +170,7 @@ gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq)
}
}
static void
static void __unused
gtaskqueue_free(struct gtaskqueue *queue)
{
@ -591,18 +591,16 @@ gtaskqueue_create_fast(const char *name, int mflags,
}
struct taskqgroup_cpu {
LIST_HEAD(, grouptask) tgc_tasks;
struct gtaskqueue *tgc_taskq;
int tgc_cnt;
int tgc_cpu;
LIST_HEAD(, grouptask) tgc_tasks;
struct gtaskqueue *tgc_taskq;
int tgc_cnt;
int tgc_cpu;
};
struct taskqgroup {
struct taskqgroup_cpu tqg_queue[MAXCPU];
struct mtx tqg_lock;
const char * tqg_name;
int tqg_adjusting;
int tqg_stride;
int tqg_cnt;
};
@ -625,13 +623,6 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
qcpu->tgc_cpu = cpu;
}
static void
taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx)
{
gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq);
}
/*
* Find the taskq with least # of tasks that doesn't currently have any
* other queues from the uniq identifier.
@ -644,22 +635,22 @@ taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
int strict;
mtx_assert(&qgroup->tqg_lock, MA_OWNED);
if (qgroup->tqg_cnt == 0)
return (0);
idx = -1;
mincnt = INT_MAX;
KASSERT(qgroup->tqg_cnt != 0,
("qgroup %s has no queues", qgroup->tqg_name));
/*
* Two passes; First scan for a queue with the least tasks that
* Two passes: first scan for a queue with the least tasks that
* does not already service this uniq id. If that fails simply find
* the queue with the least total tasks;
* the queue with the least total tasks.
*/
for (strict = 1; mincnt == INT_MAX; strict = 0) {
for (idx = -1, mincnt = INT_MAX, strict = 1; mincnt == INT_MAX;
strict = 0) {
for (i = 0; i < qgroup->tqg_cnt; i++) {
if (qgroup->tqg_queue[i].tgc_cnt > mincnt)
continue;
if (strict) {
LIST_FOREACH(n,
&qgroup->tqg_queue[i].tgc_tasks, gt_list)
LIST_FOREACH(n, &qgroup->tqg_queue[i].tgc_tasks,
gt_list)
if (n->gt_uniq == uniq)
break;
if (n != NULL)
@ -675,37 +666,15 @@ taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
return (idx);
}
/*
* smp_started is unusable since it is not set for UP kernels or even for
* SMP kernels when there is 1 CPU. This is usually handled by adding a
* (mp_ncpus == 1) test, but that would be broken here since we need to
* to synchronize with the SI_SUB_SMP ordering. Even in the pure SMP case
* smp_started only gives a fuzzy ordering relative to SI_SUB_SMP.
*
* So maintain our own flag. It must be set after all CPUs are started
* and before SI_SUB_SMP:SI_ORDER_ANY so that the SYSINIT for delayed
* adjustment is properly delayed. SI_ORDER_FOURTH is clearly before
* SI_ORDER_ANY and unclearly after the CPUs are started. It would be
* simpler for adjustment to pass a flag indicating if it is delayed.
*/
static int tqg_smp_started;
static void
tqg_record_smp_started(void *arg)
{
tqg_smp_started = 1;
}
SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH,
tqg_record_smp_started, NULL);
void
taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
void *uniq, device_t dev, struct resource *irq, const char *name)
{
int cpu, qid, error;
KASSERT(qgroup->tqg_cnt > 0,
("qgroup %s has no queues", qgroup->tqg_name));
gtask->gt_uniq = uniq;
snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
gtask->gt_dev = dev;
@ -716,7 +685,7 @@ taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
if (dev != NULL && irq != NULL && tqg_smp_started) {
if (dev != NULL && irq != NULL) {
cpu = qgroup->tqg_queue[qid].tgc_cpu;
gtask->gt_cpu = cpu;
mtx_unlock(&qgroup->tqg_lock);
@ -728,85 +697,19 @@ taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
mtx_unlock(&qgroup->tqg_lock);
}
static void
taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
{
int qid, cpu, error;
mtx_lock(&qgroup->tqg_lock);
qid = taskqgroup_find(qgroup, gtask->gt_uniq);
cpu = qgroup->tqg_queue[qid].tgc_cpu;
if (gtask->gt_dev != NULL && gtask->gt_irq != NULL) {
mtx_unlock(&qgroup->tqg_lock);
error = bus_bind_intr(gtask->gt_dev, gtask->gt_irq, cpu);
mtx_lock(&qgroup->tqg_lock);
if (error)
printf("%s: binding interrupt failed for %s: %d\n",
__func__, gtask->gt_name, error);
}
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
mtx_unlock(&qgroup->tqg_lock);
}
int
taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
void *uniq, int cpu, device_t dev, struct resource *irq, const char *name)
{
int i, qid, error;
qid = -1;
gtask->gt_uniq = uniq;
snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
gtask->gt_dev = dev;
gtask->gt_irq = irq;
gtask->gt_cpu = cpu;
mtx_lock(&qgroup->tqg_lock);
if (tqg_smp_started) {
for (i = 0; i < qgroup->tqg_cnt; i++)
if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
qid = i;
break;
}
if (qid == -1) {
mtx_unlock(&qgroup->tqg_lock);
printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
return (EINVAL);
}
} else
qid = 0;
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
cpu = qgroup->tqg_queue[qid].tgc_cpu;
mtx_unlock(&qgroup->tqg_lock);
if (dev != NULL && irq != NULL && tqg_smp_started) {
error = bus_bind_intr(dev, irq, cpu);
if (error)
printf("%s: binding interrupt failed for %s: %d\n",
__func__, gtask->gt_name, error);
}
return (0);
}
static int
taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
{
device_t dev;
struct resource *irq;
int cpu, error, i, qid;
qid = -1;
dev = gtask->gt_dev;
irq = gtask->gt_irq;
cpu = gtask->gt_cpu;
MPASS(tqg_smp_started);
mtx_lock(&qgroup->tqg_lock);
for (i = 0; i < qgroup->tqg_cnt; i++)
for (i = 0, qid = -1; i < qgroup->tqg_cnt; i++)
if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
qid = i;
break;
@ -818,8 +721,8 @@ taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtas
}
qgroup->tqg_queue[qid].tgc_cnt++;
LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
cpu = qgroup->tqg_queue[qid].tgc_cpu;
mtx_unlock(&qgroup->tqg_lock);
if (dev != NULL && irq != NULL) {
@ -853,10 +756,11 @@ taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask)
static void
taskqgroup_binder(void *ctx)
{
struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx;
struct taskq_bind_task *gtask;
cpuset_t mask;
int error;
gtask = ctx;
CPU_ZERO(&mask);
CPU_SET(gtask->bt_cpuid, &mask);
error = cpuset_setthread(curthread->td_tid, &mask);
@ -869,7 +773,7 @@ taskqgroup_binder(void *ctx)
free(gtask, M_DEVBUF);
}
static void
void
taskqgroup_bind(struct taskqgroup *qgroup)
{
struct taskq_bind_task *gtask;
@ -883,7 +787,7 @@ taskqgroup_bind(struct taskqgroup *qgroup)
return;
for (i = 0; i < qgroup->tqg_cnt; i++) {
gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK);
gtask = malloc(sizeof(*gtask), M_DEVBUF, M_WAITOK);
GTASK_INIT(&gtask->bt_task, 0, 0, taskqgroup_binder, gtask);
gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu;
grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq,
@ -891,137 +795,22 @@ taskqgroup_bind(struct taskqgroup *qgroup)
}
}
static void
taskqgroup_config_init(void *arg)
{
struct taskqgroup *qgroup = qgroup_config;
LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
grouptask, gt_list);
qgroup->tqg_queue[0].tgc_cnt = 0;
taskqgroup_cpu_create(qgroup, 0, 0);
qgroup->tqg_cnt = 1;
qgroup->tqg_stride = 1;
}
SYSINIT(taskqgroup_config_init, SI_SUB_TASKQ, SI_ORDER_SECOND,
taskqgroup_config_init, NULL);
static int
_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
{
LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
struct grouptask *gtask;
int i, k, old_cnt, old_cpu, cpu;
mtx_assert(&qgroup->tqg_lock, MA_OWNED);
if (cnt < 1 || cnt * stride > mp_ncpus || !tqg_smp_started) {
printf("%s: failed cnt: %d stride: %d "
"mp_ncpus: %d tqg_smp_started: %d\n",
__func__, cnt, stride, mp_ncpus, tqg_smp_started);
return (EINVAL);
}
if (qgroup->tqg_adjusting) {
printf("%s failed: adjusting\n", __func__);
return (EBUSY);
}
qgroup->tqg_adjusting = 1;
old_cnt = qgroup->tqg_cnt;
old_cpu = 0;
if (old_cnt < cnt)
old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu;
mtx_unlock(&qgroup->tqg_lock);
/*
* Set up queue for tasks added before boot.
*/
if (old_cnt == 0) {
LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
grouptask, gt_list);
qgroup->tqg_queue[0].tgc_cnt = 0;
}
/*
* If new taskq threads have been added.
*/
cpu = old_cpu;
for (i = old_cnt; i < cnt; i++) {
taskqgroup_cpu_create(qgroup, i, cpu);
for (k = 0; k < stride; k++)
cpu = CPU_NEXT(cpu);
}
mtx_lock(&qgroup->tqg_lock);
qgroup->tqg_cnt = cnt;
qgroup->tqg_stride = stride;
/*
* Adjust drivers to use new taskqs.
*/
for (i = 0; i < old_cnt; i++) {
while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) {
LIST_REMOVE(gtask, gt_list);
qgroup->tqg_queue[i].tgc_cnt--;
LIST_INSERT_HEAD(&gtask_head, gtask, gt_list);
}
}
mtx_unlock(&qgroup->tqg_lock);
while ((gtask = LIST_FIRST(&gtask_head))) {
LIST_REMOVE(gtask, gt_list);
if (gtask->gt_cpu == -1)
taskqgroup_attach_deferred(qgroup, gtask);
else if (taskqgroup_attach_cpu_deferred(qgroup, gtask))
taskqgroup_attach_deferred(qgroup, gtask);
}
#ifdef INVARIANTS
mtx_lock(&qgroup->tqg_lock);
for (i = 0; i < qgroup->tqg_cnt; i++) {
MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL);
LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list)
MPASS(gtask->gt_taskqueue != NULL);
}
mtx_unlock(&qgroup->tqg_lock);
#endif
/*
* If taskq thread count has been reduced.
*/
for (i = cnt; i < old_cnt; i++)
taskqgroup_cpu_remove(qgroup, i);
taskqgroup_bind(qgroup);
mtx_lock(&qgroup->tqg_lock);
qgroup->tqg_adjusting = 0;
return (0);
}
int
taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
{
int error;
mtx_lock(&qgroup->tqg_lock);
error = _taskqgroup_adjust(qgroup, cnt, stride);
mtx_unlock(&qgroup->tqg_lock);
return (error);
}
struct taskqgroup *
taskqgroup_create(const char *name)
taskqgroup_create(const char *name, int cnt, int stride)
{
struct taskqgroup *qgroup;
int cpu, i, j;
qgroup = malloc(sizeof(*qgroup), M_GTASKQUEUE, M_WAITOK | M_ZERO);
mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF);
qgroup->tqg_name = name;
LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks);
qgroup->tqg_cnt = cnt;
for (cpu = i = 0; i < cnt; i++) {
taskqgroup_cpu_create(qgroup, i, cpu);
for (j = 0; j < stride; j++)
cpu = CPU_NEXT(cpu);
}
return (qgroup);
}

View File

@ -1410,29 +1410,6 @@ iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
iflib_dma_free(*dmaiter);
}
#ifdef EARLY_AP_STARTUP
static const int iflib_started = 1;
#else
/*
* We used to abuse the smp_started flag to decide if the queues have been
* fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
* That gave bad races, since the SYSINIT() runs strictly after smp_started
* is set. Run a SYSINIT() strictly after that to just set a usable
* completion flag.
*/
static int iflib_started;
static void
iflib_record_started(void *arg)
{
iflib_started = 1;
}
SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
iflib_record_started, NULL);
#endif
static int
iflib_fast_intr(void *arg)
{
@ -1440,9 +1417,6 @@ iflib_fast_intr(void *arg)
struct grouptask *gtask = info->ifi_task;
int result;
if (!iflib_started)
return (FILTER_STRAY);
DBG_COUNTER_INC(fast_intrs);
if (info->ifi_filter != NULL) {
result = info->ifi_filter(info->ifi_filter_arg);
@ -1467,9 +1441,6 @@ iflib_fast_intr_rxtx(void *arg)
qidx_t txqid;
bool intr_enable, intr_legacy;
if (!iflib_started)
return (FILTER_STRAY);
DBG_COUNTER_INC(fast_intrs);
if (info->ifi_filter != NULL) {
result = info->ifi_filter(info->ifi_filter_arg);
@ -1522,9 +1493,6 @@ iflib_fast_intr_ctx(void *arg)
struct grouptask *gtask = info->ifi_task;
int result;
if (!iflib_started)
return (FILTER_STRAY);
DBG_COUNTER_INC(fast_intrs);
if (info->ifi_filter != NULL) {
result = info->ifi_filter(info->ifi_filter_arg);
@ -4746,18 +4714,6 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct
*/
ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
/*
* Group taskqueues aren't properly set up until SMP is started,
* so we disable interrupts until we can handle them post
* SI_SUB_SMP.
*
* XXX: disabling interrupts doesn't actually work, at least for
* the non-MSI case. When they occur before SI_SUB_SMP completes,
* we do null handling and depend on this not causing too large an
* interrupt storm.
*/
IFDI_INTR_DISABLE(ctx);
if (msix > 1) {
/*
* When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable

View File

@ -77,9 +77,9 @@ int taskqgroup_attach_cpu(struct taskqgroup *qgroup,
struct grouptask *grptask, void *uniq, int cpu, device_t dev,
struct resource *irq, const char *name);
void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask);
struct taskqgroup *taskqgroup_create(const char *name);
struct taskqgroup *taskqgroup_create(const char *name, int cnt, int stride);
void taskqgroup_destroy(struct taskqgroup *qgroup);
int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
void taskqgroup_bind(struct taskqgroup *qgroup);
void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask,
gtask_fn_t *fn, const char *name);
void taskqgroup_config_gtask_deinit(struct grouptask *gtask);
@ -107,22 +107,19 @@ struct taskqgroup *qgroup_##name; \
static void \
taskqgroup_define_##name(void *arg) \
{ \
qgroup_##name = taskqgroup_create(#name); \
qgroup_##name = taskqgroup_create(#name, (cnt), (stride)); \
} \
\
SYSINIT(taskqgroup_##name, SI_SUB_TASKQ, SI_ORDER_FIRST, \
taskqgroup_define_##name, NULL); \
taskqgroup_define_##name, NULL); \
\
static void \
taskqgroup_adjust_##name(void *arg) \
taskqgroup_bind_##name(void *arg) \
{ \
taskqgroup_adjust(qgroup_##name, (cnt), (stride)); \
taskqgroup_bind(qgroup_##name); \
} \
\
SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY, \
taskqgroup_adjust_##name, NULL)
SYSINIT(taskqgroup_bind_##name, SI_SUB_SMP, SI_ORDER_ANY, \
taskqgroup_bind_##name, NULL)
TASKQGROUP_DECLARE(net);
TASKQGROUP_DECLARE(softirq);
#endif /* !_SYS_GTASKQUEUE_H_ */