Add handling for controller fatal status (csts.cfs).

On any I/O timeout, check for csts.cfs==1. If set, the controller is reporting fatal status and we reset the controller immediately, rather than trying to abort the timed-out command.

This changeset also includes deferring the controller start portion of the reset to a separate task. This ensures we are always performing a controller start operation from a consistent context.

Sponsored by: Intel
Reviewed by: carl
2013-03-26 19:58:17 +00:00 · 2013-03-26 19:58:17 +00:00 · 12d191ec12
commit 12d191ec12
parent dbba74428b
3 changed files with 37 additions and 3 deletions
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@ -427,7 +427,7 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 	status = nvme_ctrlr_hw_reset(ctrlr);
 	DELAY(100*1000);
 	if (status == 0)
-		nvme_ctrlr_start(ctrlr);
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->restart_task);
 }

 static int
@ -685,6 +685,14 @@ nvme_ctrlr_start(void *ctrlr_arg)
 	ctrlr->num_start_attempts++;
 }

+static void
+nvme_ctrlr_restart_task(void *arg, int pending)
+{
+	struct nvme_controller *ctrlr = arg;
+
+	nvme_ctrlr_start(ctrlr);
+}
+
 static void
 nvme_ctrlr_intx_handler(void *arg)
 {
@ -864,6 +872,11 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)

 	ctrlr->cdev->si_drv1 = (void *)ctrlr;

+	TASK_INIT(&ctrlr->restart_task, 0, nvme_ctrlr_restart_task, ctrlr);
+	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
+	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
+	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");
+
 	return (0);
 }

@ -872,6 +885,8 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 {
 	int				i;

+	taskqueue_free(ctrlr->taskqueue);
+
 	for (i = 0; i < NVME_MAX_NAMESPACES; i++)
 		nvme_ns_destruct(&ctrlr->ns[i]);

--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@ -36,6 +36,7 @@
 #include <sys/mutex.h>
 #include <sys/rman.h>
 #include <sys/systm.h>
+#include <sys/taskqueue.h>

 #include <vm/uma.h>

@ -236,6 +237,8 @@ struct nvme_controller {
 	uint32_t		ns_identified;
 	uint32_t		queues_created;
 	uint32_t		num_start_attempts;
+	struct task		restart_task;
+	struct taskqueue	*taskqueue;

 	/* For shared legacy interrupt. */
 	int			rid;
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@ -98,7 +98,7 @@ nvme_qpair_construct_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
 	bus_dmamap_load(qpair->dma_tag, tr->prp_dma_map, tr->prp,
 	    sizeof(tr->prp), nvme_single_map, &tr->prp_bus_addr, 0);

-	callout_init_mtx(&tr->timer, &qpair->lock, 0);
+	callout_init(&tr->timer, 1);
 	tr->cid = cid;
 	tr->qpair = qpair;
 }
@ -456,8 +456,24 @@ static void
 nvme_timeout(void *arg)
 {
 	struct nvme_tracker	*tr = arg;
+	struct nvme_qpair	*qpair = tr->qpair;
+	struct nvme_controller	*ctrlr = qpair->ctrlr;
+	union csts_register	csts;

-	nvme_ctrlr_cmd_abort(tr->qpair->ctrlr, tr->cid, tr->qpair->id,
+	csts.raw = nvme_mmio_read_4(ctrlr, csts);
+	if (csts.bits.cfs == 1) {
+		/*
+		 * The controller is reporting fatal status.  Don't bother
+		 *  trying to abort the timed out command - proceed
+		 *  immediately to a controller-level reset.
+		 */
+		device_printf(ctrlr->dev,
+		    "controller reports fatal status, resetting...\n");
+		nvme_ctrlr_reset(ctrlr);
+		return;
+	}
+
+	nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
 	    nvme_abort_complete, tr);
 }