nvme: Add exclusion for ISR

Add a basically uncontended spinlock that we take out while the ISR is
running. This has two effects: First, when we get a timeout, we can
safely call the nvme_qpair_process_completions w/o racing any ISRs.
Second, we can use it to ensure that we don't reset the card while
the ISRs are active (right now we just sleep and hope for the best,
which usually is fine, but not always).

Sponsored by:		Netflix
MFC After:		2 weeks
Reviewed by:		chuck, gallatin
Differential Revision:	https://reviews.freebsd.org/D41452
This commit is contained in:
Warner Losh 2023-08-25 10:10:16 -06:00
parent d4959bfcd1
commit 8052b01e7e
4 changed files with 148 additions and 52 deletions

View File

@ -416,6 +416,34 @@ nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
}
}
static void
nvme_pre_reset(struct nvme_controller *ctrlr)
{
/*
* Make sure that all the ISRs are done before proceeding with the reset
* (and also keep any stray interrupts that happen during this process
* from racing this process). For startup, this is a nop, since the
* hardware is in a good state. But for recovery, where we randomly
* reset the hardware, this ensure that we're not racing the ISRs.
*/
mtx_lock(&ctrlr->adminq.recovery);
for (int i = 0; i < ctrlr->num_io_queues; i++) {
mtx_lock(&ctrlr->ioq[i].recovery);
}
}
static void
nvme_post_reset(struct nvme_controller *ctrlr)
{
/*
* Reset complete, unblock ISRs
*/
mtx_unlock(&ctrlr->adminq.recovery);
for (int i = 0; i < ctrlr->num_io_queues; i++) {
mtx_unlock(&ctrlr->ioq[i].recovery);
}
}
static int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
@ -427,9 +455,11 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
err = nvme_ctrlr_disable(ctrlr);
if (err != 0)
return err;
goto out;
err = nvme_ctrlr_enable(ctrlr);
out:
TSEXIT();
return (err);
}
@ -1157,6 +1187,11 @@ nvme_ctrlr_start_config_hook(void *arg)
TSENTER();
/*
* Don't call pre/post reset here. We've not yet created the qpairs,
* haven't setup the ISRs, so there's no need to 'drain' them or
* 'exclude' them.
*/
if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
fail:
nvme_ctrlr_fail(ctrlr);
@ -1201,16 +1236,9 @@ nvme_ctrlr_reset_task(void *arg, int pending)
int status;
nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
nvme_pre_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
/*
* Use pause instead of DELAY, so that we yield to any nvme interrupt
* handlers on this CPU that were blocked on a qpair lock. We want
* all nvme interrupts completed before proceeding with restarting the
* controller.
*
* XXX - any way to guarantee the interrupt handlers have quiesced?
*/
pause("nvmereset", hz / 10);
nvme_post_reset(ctrlr);
if (status == 0)
nvme_ctrlr_start(ctrlr, true);
else
@ -1697,6 +1725,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
if (ctrlr->is_failed)
return (0);
nvme_pre_reset(ctrlr);
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#ifdef NVME_2X_RESET
@ -1708,6 +1737,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#endif
nvme_post_reset(ctrlr);
/*
* Now that we've reset the hardware, we can restart the controller. Any
@ -1724,6 +1754,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
* the controller. However, we have to return success for the resume
* itself, due to questionable APIs.
*/
nvme_post_reset(ctrlr);
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
nvme_ctrlr_fail(ctrlr);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);

View File

@ -162,10 +162,9 @@ struct nvme_qpair {
struct resource *res;
void *tag;
struct callout timer;
sbintime_t deadline;
bool timer_armed;
enum nvme_recovery recovery_state;
struct callout timer; /* recovery lock */
bool timer_armed; /* recovery lock */
enum nvme_recovery recovery_state; /* recovery lock */
uint32_t num_entries;
uint32_t num_trackers;
@ -182,6 +181,7 @@ struct nvme_qpair {
int64_t num_retries;
int64_t num_failures;
int64_t num_ignored;
int64_t num_recovery_nolock;
struct nvme_command *cmd;
struct nvme_completion *cpl;
@ -200,7 +200,7 @@ struct nvme_qpair {
struct nvme_tracker **act_tr;
struct mtx_padalign lock;
struct mtx_padalign recovery;
} __aligned(CACHE_LINE_SIZE);
struct nvme_namespace {

View File

@ -528,14 +528,17 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
nvme_free_request(req);
}
bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
/* Locked version of completion processor */
static bool
_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;
struct nvme_completion cpl;
int done = 0;
bool done = false;
bool in_panic = dumping || SCHEDULER_STOPPED();
mtx_assert(&qpair->recovery, MA_OWNED);
/*
* qpair is not enabled, likely because a controller reset is in
* progress. Ignore the interrupt - any I/O that was associated with
@ -629,7 +632,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
else
tr = NULL;
done++;
done = true;
if (tr != NULL) {
nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
qpair->sq_head = cpl.sqhd;
@ -666,12 +669,35 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
}
}
if (done != 0) {
if (done) {
bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
qpair->cq_hdbl_off, qpair->cq_head);
}
return (done != 0);
return (done);
}
bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
bool done;
/*
* Interlock with reset / recovery code. This is an usually uncontended
* to make sure that we drain out of the ISRs before we reset the card
* and to prevent races with the recovery process called from a timeout
* context.
*/
if (!mtx_trylock(&qpair->recovery)) {
qpair->num_recovery_nolock++;
return (false);
}
done = _nvme_qpair_process_completions(qpair);
mtx_unlock(&qpair->recovery);
return (done);
}
static void
@ -699,6 +725,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
qpair->ctrlr = ctrlr;
mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
/* Note: NVMe PRP format is restricted to 4-byte alignment. */
err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@ -765,7 +792,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
qpair->cpl_bus_addr = queuemem_phys + cmdsz;
prpmem_phys = queuemem_phys + cmdsz + cplsz;
callout_init(&qpair->timer, 1);
callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
qpair->timer_armed = false;
qpair->recovery_state = RECOVERY_WAITING;
@ -903,6 +930,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
if (mtx_initialized(&qpair->lock))
mtx_destroy(&qpair->lock);
if (mtx_initialized(&qpair->recovery))
mtx_destroy(&qpair->recovery);
if (qpair->res) {
bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@ -975,14 +1004,12 @@ nvme_qpair_timeout(void *arg)
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
bool idle;
bool idle = false;
bool needs_reset;
uint32_t csts;
uint8_t cfs;
mtx_lock(&qpair->lock);
idle = TAILQ_EMPTY(&qpair->outstanding_tr);
mtx_assert(&qpair->recovery, MA_OWNED);
switch (qpair->recovery_state) {
case RECOVERY_NONE:
@ -1003,25 +1030,10 @@ nvme_qpair_timeout(void *arg)
goto do_reset;
/*
* Next, check to see if we have any completions. If we do,
* we've likely missed an interrupt, but the card is otherwise
* fine. This will also catch all the commands that are about
* to timeout (but there's still a tiny race). Since the timeout
* is long relative to the race between here and the check below,
* this is still a win.
* Process completions. We already have the recovery lock, so
* call the locked version.
*/
mtx_unlock(&qpair->lock);
nvme_qpair_process_completions(qpair);
mtx_lock(&qpair->lock);
if (qpair->recovery_state != RECOVERY_NONE) {
/*
* Somebody else adjusted recovery state while unlocked,
* we should bail. Unlock the qpair and return without
* doing anything else.
*/
mtx_unlock(&qpair->lock);
return;
}
_nvme_qpair_process_completions(qpair);
/*
* Check to see if we need to timeout any commands. If we do, then
@ -1029,7 +1041,13 @@ nvme_qpair_timeout(void *arg)
*/
now = getsbinuptime();
needs_reset = false;
idle = true;
mtx_lock(&qpair->lock);
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
/*
* Skip async commands, they are posted to the card for
* an indefinite amount of time and have no deadline.
*/
if (tr->deadline == SBT_MAX)
continue;
if (now > tr->deadline) {
@ -1055,6 +1073,7 @@ nvme_qpair_timeout(void *arg)
idle = false;
}
}
mtx_unlock(&qpair->lock);
if (!needs_reset)
break;
@ -1076,6 +1095,7 @@ nvme_qpair_timeout(void *arg)
break;
case RECOVERY_WAITING:
nvme_printf(ctrlr, "waiting for reset to complete\n");
idle = false; /* We want to keep polling */
break;
}
@ -1087,7 +1107,6 @@ nvme_qpair_timeout(void *arg)
} else {
qpair->timer_armed = false;
}
mtx_unlock(&qpair->lock);
}
/*
@ -1196,9 +1215,12 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
/*
* No tracker is available, or the qpair is disabled due to
* an in-progress controller-level reset or controller
* failure.
* No tracker is available, or the qpair is disabled due to an
* in-progress controller-level reset or controller failure. If
* we lose the race with recovery_state, then we may add an
* extra request to the queue which will be resubmitted later.
* We only set recovery_state to NONE with qpair->lock also
* held.
*/
if (qpair->ctrlr->is_failed) {
@ -1260,7 +1282,10 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
mtx_assert(&qpair->lock, MA_OWNED);
if (mtx_initialized(&qpair->recovery))
mtx_assert(&qpair->recovery, MA_OWNED);
if (mtx_initialized(&qpair->lock))
mtx_assert(&qpair->lock, MA_OWNED);
qpair->recovery_state = RECOVERY_NONE;
}
@ -1311,9 +1336,11 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
nvme_printf(qpair->ctrlr,
"done aborting outstanding admin\n");
mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
mtx_unlock(&qpair->lock);
mtx_unlock(&qpair->recovery);
}
void
@ -1340,8 +1367,8 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
if (report)
nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
STAILQ_INIT(&temp);
@ -1360,6 +1387,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
mtx_unlock(&qpair->lock);
mtx_unlock(&qpair->recovery);
}
static void
@ -1367,27 +1395,40 @@ nvme_qpair_disable(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr, *tr_temp;
mtx_lock(&qpair->lock);
if (mtx_initialized(&qpair->recovery))
mtx_assert(&qpair->recovery, MA_OWNED);
if (mtx_initialized(&qpair->lock))
mtx_assert(&qpair->lock, MA_OWNED);
qpair->recovery_state = RECOVERY_WAITING;
TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
tr->deadline = SBT_MAX;
}
mtx_unlock(&qpair->lock);
}
void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
nvme_admin_qpair_abort_aers(qpair);
mtx_unlock(&qpair->lock);
mtx_unlock(&qpair->recovery);
}
void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
mtx_unlock(&qpair->lock);
mtx_unlock(&qpair->recovery);
}
void

View File

@ -163,6 +163,7 @@ nvme_qpair_reset_stats(struct nvme_qpair *qpair)
qpair->num_retries = 0;
qpair->num_failures = 0;
qpair->num_ignored = 0;
qpair->num_recovery_nolock = 0;
}
static int
@ -240,6 +241,21 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
static int
nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
{
struct nvme_controller *ctrlr = arg1;
int64_t num;
int i;
num = ctrlr->adminq.num_recovery_nolock;
for (i = 0; i < ctrlr->num_io_queues; i++)
num += ctrlr->ioq[i].num_recovery_nolock;
return (sysctl_handle_64(oidp, &num, 0, req));
}
static int
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
{
@ -298,6 +314,9 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
CTLFLAG_RD, &qpair->num_ignored,
"Number of interrupts posted, but were administratively ignored");
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
CTLFLAG_RD, &qpair->num_recovery_nolock,
"Number of times that we failed to lock recovery in the ISR");
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@ -366,6 +385,11 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
ctrlr, 0, nvme_sysctl_num_ignored, "IU",
"Number of interrupts ignored administratively");
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
"Number of times that we failed to lock recovery in the ISR");
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");