By default, always escalate to controller reset when an I/O times out.

While aborts are typically cleaner than a full controller reset, many times
an I/O timeout indicates other controller-level issues where aborts may not
work.  NVMe drivers for other operating systems are also defaulting to
controller reset rather than aborts for timed out I/O.

Sponsored by:	Intel
Reviewed by:	carl
This commit is contained in:
Jim Harris 2013-03-26 20:32:57 +00:00
parent a0dc59709c
commit 48ce317898
3 changed files with 35 additions and 23 deletions

View File

@ -422,12 +422,8 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
int status;
status = nvme_ctrlr_hw_reset(ctrlr);
DELAY(100*1000);
if (status == 0)
taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->restart_task);
taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}
static int
@ -686,11 +682,24 @@ nvme_ctrlr_start(void *ctrlr_arg)
}
static void
nvme_ctrlr_restart_task(void *arg, int pending)
nvme_ctrlr_reset_task(void *arg, int pending)
{
struct nvme_controller *ctrlr = arg;
struct nvme_controller *ctrlr = arg;
int status;
nvme_ctrlr_start(ctrlr);
device_printf(ctrlr->dev, "resetting controller");
status = nvme_ctrlr_hw_reset(ctrlr);
/*
* Use pause instead of DELAY, so that we yield to any nvme interrupt
* handlers on this CPU that were blocked on a qpair lock. We want
* all nvme interrupts completed before proceeding with restarting the
* controller.
*
* XXX - any way to guarantee the interrupt handlers have quiesced?
*/
pause("nvmereset", hz / 10);
if (status == 0)
nvme_ctrlr_start(ctrlr);
}
static void
@ -841,6 +850,9 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->force_intx = 0;
TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);
ctrlr->enable_aborts = 0;
TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
ctrlr->msix_enabled = 1;
if (ctrlr->force_intx) {
@ -879,7 +891,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->cdev->si_drv1 = (void *)ctrlr;
TASK_INIT(&ctrlr->restart_task, 0, nvme_ctrlr_restart_task, ctrlr);
TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
taskqueue_thread_enqueue, &ctrlr->taskqueue);
taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");

View File

@ -230,6 +230,7 @@ struct nvme_controller {
uint32_t msix_enabled;
uint32_t force_intx;
uint32_t enable_aborts;
uint32_t num_io_queues;
boolean_t per_cpu_io_queues;
@ -239,7 +240,7 @@ struct nvme_controller {
uint32_t ns_identified;
uint32_t queues_created;
uint32_t num_start_attempts;
struct task restart_task;
struct task reset_task;
struct taskqueue *taskqueue;
/* For shared legacy interrupt. */

View File

@ -460,21 +460,20 @@ nvme_timeout(void *arg)
struct nvme_controller *ctrlr = qpair->ctrlr;
union csts_register csts;
/* Read csts to get value of cfs - controller fatal status. */
csts.raw = nvme_mmio_read_4(ctrlr, csts);
if (csts.bits.cfs == 1) {
/*
* The controller is reporting fatal status. Don't bother
* trying to abort the timed out command - proceed
* immediately to a controller-level reset.
*/
device_printf(ctrlr->dev,
"controller reports fatal status, resetting...\n");
nvme_ctrlr_reset(ctrlr);
return;
}
device_printf(ctrlr->dev, "i/o timeout, csts.cfs=%d\n", csts.bits.cfs);
nvme_dump_command(&tr->req->cmd);
nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
nvme_abort_complete, tr);
if (ctrlr->enable_aborts && csts.bits.cfs == 0) {
/*
* If aborts are enabled, only use them if the controller is
* not reporting fatal status.
*/
nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
nvme_abort_complete, tr);
} else
nvme_ctrlr_reset(ctrlr);
}
void