nvme: Use shared timeout rather than timeout per transaction

Keep track of the approximate time commands are 'due' and the next
deadline for a command. Twice a second, wake up to see if any commands
have entered timeout. If so, quiesce and then enter a recovery mode
half the timeout further in the future to allow the ISR to
complete. Once we exit recovery mode, we go back to normal
operation.

Sponsored by:		Netflix
Differential Revision:	https://reviews.freebsd.org/D28583
Warner Losh 2021-09-23 16:31:32 -06:00
parent cb13059663
commit 502dc84a8b
3 changed files with 133 additions and 68 deletions
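
The gist of the change, for readers skimming the diff below: instead of arming one callout per in-flight command, each tracker now records an absolute deadline, and a single per-queue timer sweeps the outstanding list twice a second looking for expired commands. A minimal userland sketch of that scheme follows; all names here (cmd_slot, sweep, now_ns) are invented for illustration, and a plain monotonic clock stands in for the kernel's callout and sbintime_t machinery.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	/* Invented stand-in for the driver's per-command tracker state. */
	struct cmd_slot {
		bool	busy;
		int64_t	deadline_ns;	/* absolute deadline; INT64_MAX = none */
	};

	enum recovery { REC_NONE, REC_START };

	static int64_t
	now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ((int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec);
	}

	/*
	 * One sweep of the shared timer: check every outstanding command
	 * against the current time and flag recovery if any absolute
	 * deadline has passed.  The driver runs the equivalent from a
	 * callout twice a second instead of arming one timer per command.
	 */
	static enum recovery
	sweep(const struct cmd_slot *slots, int n)
	{
		int64_t now = now_ns();

		for (int i = 0; i < n; i++)
			if (slots[i].busy && now > slots[i].deadline_ns)
				return (REC_START);
		return (REC_NONE);
	}

	int
	main(void)
	{
		struct cmd_slot slots[2] = {
			{ .busy = true, .deadline_ns = now_ns() + 30000000000LL },
			{ .busy = true, .deadline_ns = now_ns() - 1 },	/* late */
		};

		printf("recovery needed: %s\n",
		    sweep(slots, 2) == REC_START ? "yes" : "no");
		return (0);
	}

The payoff is one timer per queue pair instead of one per command, at the cost of timeouts firing up to half a second late, which is noise against a timeout period measured in tens of seconds.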

sys/dev/nvme/nvme_ctrlr.c

@@ -232,7 +232,8 @@ nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
 	mtx_lock(&ctrlr->lock);
 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
 	mtx_unlock(&ctrlr->lock);
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
 }
 
 static void
@@ -435,7 +436,8 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 		 */
 		return;
 
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
 }
 
 static int
@@ -1481,6 +1483,8 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 {
 	int	gone, i;
 
+	ctrlr->is_dying = true;
+
 	if (ctrlr->resource == NULL)
 		goto nores;
 	if (!mtx_initialized(&ctrlr->adminq.lock))
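
The three nvme_ctrlr.c hunks above are a teardown guard: nvme_ctrlr_destruct() sets is_dying before anything else, and the two enqueue paths then decline to schedule new reset/fail work. A compressed sketch of the pattern, with invented names (struct ctl, post_work) and printf standing in for taskqueue_enqueue:

	#include <stdbool.h>
	#include <stdio.h>

	/* Invented stand-in; the real flag lives in struct nvme_controller. */
	struct ctl {
		bool is_dying;
	};

	/* printf stands in for enqueueing a reset/fail task. */
	static void
	post_work(struct ctl *c, const char *what)
	{
		if (!c->is_dying)
			printf("enqueue %s task\n", what);
		else
			printf("drop %s task: teardown in progress\n", what);
	}

	int
	main(void)
	{
		struct ctl c = { .is_dying = false };

		post_work(&c, "reset");		/* scheduled as usual */
		c.is_dying = true;		/* destructor sets this first */
		post_work(&c, "reset");		/* silently dropped */
		return (0);
	}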

sys/dev/nvme/nvme_private.h

@@ -151,7 +151,7 @@ struct nvme_tracker {
 	TAILQ_ENTRY(nvme_tracker)	tailq;
 	struct nvme_request		*req;
 	struct nvme_qpair		*qpair;
-	struct callout			timer;
+	sbintime_t			deadline;
 	bus_dmamap_t			payload_dma_map;
 	uint16_t			cid;
 
@@ -159,6 +159,12 @@ struct nvme_tracker {
 	bus_addr_t		prp_bus_addr;
 };
 
+enum nvme_recovery {
+	RECOVERY_NONE = 0,		/* Normal operations */
+	RECOVERY_START,			/* Deadline has passed, start recovering */
+	RECOVERY_RESET,			/* This pass, initiate reset of controller */
+	RECOVERY_WAITING,		/* waiting for the reset to complete */
+};
 struct nvme_qpair {
 	struct nvme_controller	*ctrlr;
 	uint32_t		id;
@@ -170,6 +176,11 @@ struct nvme_qpair {
 	struct resource		*res;
 	void 			*tag;
 
+	struct callout		timer;
+	sbintime_t		deadline;
+	bool			timer_armed;
+	enum nvme_recovery	recovery_state;
+
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
 	uint32_t		sq_tdbl_off;
@@ -201,8 +212,6 @@ struct nvme_qpair {
 	struct nvme_tracker	**act_tr;
 
-	bool			is_enabled;
-
 	struct mtx		lock __aligned(CACHE_LINE_SIZE);
 
 } __aligned(CACHE_LINE_SIZE);
@@ -305,6 +314,7 @@ struct nvme_controller {
 	uint32_t		notification_sent;
 
 	bool			is_failed;
+	bool			is_dying;
 	STAILQ_HEAD(, nvme_request)	fail_req;
 
 	/* Host Memory Buffer */
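
The new enum nvme_recovery is a small state machine driven from the shared timer. A self-contained sketch of its transitions, assuming (per the commit message) one tick every half second; the three predicates below are placeholders for the deadline scan, the CSTS.CFS register read, and the completion sweep that the real handler performs:

	#include <stdbool.h>
	#include <stdio.h>

	enum nvme_recovery {
		RECOVERY_NONE,		/* normal operation */
		RECOVERY_START,		/* a deadline passed; decide how to recover */
		RECOVERY_RESET,		/* initiate a controller reset this pass */
		RECOVERY_WAITING,	/* reset in flight; sit tight */
	};

	/* Placeholder predicates for the checks the real handler performs. */
	static bool deadline_passed(void)     { return (true); }
	static bool fatal_status(void)        { return (false); }	/* CSTS.CFS */
	static bool drained_completions(void) { return (false); }

	/* One half-second tick of the shared timer, modeled on the diff's switch. */
	static enum nvme_recovery
	tick(enum nvme_recovery st)
	{
		switch (st) {
		case RECOVERY_NONE:
			return (deadline_passed() ? RECOVERY_START : RECOVERY_NONE);
		case RECOVERY_START:
			if (fatal_status())
				return (RECOVERY_RESET);
			/* Completions pending after all: a missed interrupt. */
			return (drained_completions() ? RECOVERY_NONE : RECOVERY_RESET);
		case RECOVERY_RESET:
			/* The reset is kicked off here; then we wait for it. */
			return (RECOVERY_WAITING);
		case RECOVERY_WAITING:
			return (RECOVERY_WAITING);	/* re-enable path clears this */
		}
		return (st);
	}

	int
	main(void)
	{
		enum nvme_recovery st = RECOVERY_NONE;

		for (int i = 0; i < 4; i++) {
			printf("tick %d: state %d\n", i, st);
			st = tick(st);
		}
		return (0);
	}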

sys/dev/nvme/nvme_qpair.c

@@ -452,7 +452,6 @@ nvme_qpair_complete_tracker(struct nvme_tracker *tr,
 	}
 
 	mtx_lock(&qpair->lock);
-	callout_stop(&tr->timer);
 
 	if (retry) {
 		req->retries++;
@@ -544,7 +543,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 	 * progress. Ignore the interrupt - any I/O that was associated with
 	 * this interrupt will get retried when the reset is complete.
 	 */
-	if (!qpair->is_enabled)
+	if (qpair->recovery_state != RECOVERY_NONE)
 		return (false);
 
 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
@@ -739,6 +738,10 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
+	callout_init(&qpair->timer, 1);
+	qpair->timer_armed = false;
+	qpair->recovery_state = RECOVERY_NONE;
+
 	/*
 	 * Calculate the stride of the doorbell register. Many emulators set this
 	 * value to correspond to a cache line. However, some hardware has set
@@ -776,7 +779,6 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
 		bus_dmamap_create(qpair->dma_tag_payload, 0,
 		    &tr->payload_dma_map);
-		callout_init(&tr->timer, 1);
 		tr->cid = i;
 		tr->qpair = qpair;
 		tr->prp = (uint64_t *)prp_list;
@@ -835,6 +837,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr;
 
+	callout_drain(&qpair->timer);
+
 	if (qpair->tag) {
 		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
 		qpair->tag = NULL;
@@ -914,65 +918,101 @@ nvme_io_qpair_destroy(struct nvme_qpair *qpair)
 }
 
 static void
-nvme_abort_complete(void *arg, const struct nvme_completion *status)
+nvme_qpair_timeout(void *arg)
 {
-	struct nvme_tracker	*tr = arg;
-
-	/*
-	 * If cdw0 == 1, the controller was not able to abort the command
-	 * we requested. We still need to check the active tracker array,
-	 * to cover race where I/O timed out at same time controller was
-	 * completing the I/O.
-	 */
-	if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
-		/*
-		 * An I/O has timed out, and the controller was unable to
-		 * abort it for some reason. Construct a fake completion
-		 * status, and then complete the I/O's tracker manually.
-		 */
-		nvme_printf(tr->qpair->ctrlr,
-		    "abort command failed, aborting command manually\n");
-		nvme_qpair_manual_complete_tracker(tr,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
-	}
-}
-
-static void
-nvme_timeout(void *arg)
-{
-	struct nvme_tracker	*tr = arg;
-	struct nvme_qpair	*qpair = tr->qpair;
+	struct nvme_qpair	*qpair = arg;
 	struct nvme_controller *ctrlr = qpair->ctrlr;
+	struct nvme_tracker	*tr;
+	struct nvme_tracker	*tr_temp;
+	sbintime_t		now;
+	bool			idle;
 	uint32_t		csts;
 	uint8_t			cfs;
 
-	/*
-	 * Read csts to get value of cfs - controller fatal status.
-	 * If no fatal status, try to call the completion routine, and
-	 * if completes transactions, report a missed interrupt and
-	 * return (this may need to be rate limited). Otherwise, if
-	 * aborts are enabled and the controller is not reporting
-	 * fatal status, abort the command. Otherwise, just reset the
-	 * controller and hope for the best.
-	 */
-	csts = nvme_mmio_read_4(ctrlr, csts);
-	cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
-	if (cfs == 0 && nvme_qpair_process_completions(qpair)) {
-		nvme_printf(ctrlr, "Missing interrupt\n");
-		return;
-	}
-	if (ctrlr->enable_aborts && cfs == 0) {
-		nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
-		nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
-		    nvme_abort_complete, tr);
-	} else {
+	mtx_lock(&qpair->lock);
+	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+again:
+	switch (qpair->recovery_state) {
+	case RECOVERY_NONE:
+		if (idle)
+			break;
+		now = getsbinuptime();
+		TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+			if (now > tr->deadline && tr->deadline != 0) {
+				/*
+				 * We're now past our earliest deadline. We
+				 * need to do expensive things to cope, but next
+				 * time. Flag that and close the door to any
+				 * further processing.
+				 */
+				qpair->recovery_state = RECOVERY_START;
+				nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd\n",
+				    (uintmax_t)now, (uintmax_t)tr->deadline);
+				break;
+			}
+		}
+		break;
+	case RECOVERY_START:
+		/*
+		 * Read csts to get value of cfs - controller fatal status.
+		 * If no fatal status, try to call the completion routine, and
+		 * if it completes transactions, report a missed interrupt and
+		 * return (this may need to be rate limited). Otherwise, if
+		 * aborts are enabled and the controller is not reporting
+		 * fatal status, abort the command. Otherwise, just reset the
+		 * controller and hope for the best.
+		 */
+		csts = nvme_mmio_read_4(ctrlr, csts);
+		cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
+		if (cfs) {
+			nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			goto again;
+		}
+		mtx_unlock(&qpair->lock);
+		if (nvme_qpair_process_completions(qpair)) {
+			nvme_printf(ctrlr, "Completions present in output without an interrupt\n");
+			qpair->recovery_state = RECOVERY_NONE;
+		} else {
+			nvme_printf(ctrlr, "timeout with nothing complete, resetting\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			mtx_lock(&qpair->lock);
+			goto again;
+		}
+		mtx_lock(&qpair->lock);
+		break;
+	case RECOVERY_RESET:
+		/*
+		 * If we get here due to a possible surprise hot-unplug event,
+		 * then we let nvme_ctrlr_reset confirm and fail the
+		 * controller.
+		 */
 		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
 		    (csts == NVME_GONE) ? " and possible hot unplug" :
-		    (cfs ? " and fatal error status" : ""));
+		    cfs ? " and fatal error status" : "");
+		nvme_printf(ctrlr, "RECOVERY_WAITING\n");
+		qpair->recovery_state = RECOVERY_WAITING;
 		nvme_ctrlr_reset(ctrlr);
+		break;
+	case RECOVERY_WAITING:
+		nvme_printf(ctrlr, "waiting\n");
+		break;
 	}
+
+	/*
+	 * Rearm the timeout.
+	 */
+	if (!idle) {
+		callout_schedule(&qpair->timer, hz / 2);
+	} else {
+		qpair->timer_armed = false;
+	}
+	mtx_unlock(&qpair->lock);
 }
 
+/*
+ * Submit the tracker to the hardware. Must already be in the
+ * outstanding queue when called.
+ */
 void
 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 {
@@ -989,12 +1029,17 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 	if (req->timeout) {
 		if (req->cb_fn == nvme_completion_poll_cb)
-			timeout = hz;
+			timeout = 1;
 		else
-			timeout = ctrlr->timeout_period * hz;
-		callout_reset_on(&tr->timer, timeout, nvme_timeout, tr,
-		    qpair->cpu);
-	}
+			timeout = ctrlr->timeout_period;
+		tr->deadline = getsbinuptime() + timeout * SBT_1S;
+		if (!qpair->timer_armed) {
+			qpair->timer_armed = true;
+			callout_reset_on(&qpair->timer, hz / 2,
+			    nvme_qpair_timeout, qpair, qpair->cpu);
+		}
+	} else
+		tr->deadline = SBT_MAX;
 
 	/* Copy the command from the tracker to the submission queue. */
 	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
@@ -1069,7 +1114,7 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	tr = TAILQ_FIRST(&qpair->free_tr);
 	req->qpair = qpair;
 
-	if (tr == NULL || !qpair->is_enabled) {
+	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to
 		 * an in-progress controller-level reset or controller
@@ -1096,6 +1141,8 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
 	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
+	if (!qpair->timer_armed)
+		tr->deadline = SBT_MAX;
 	tr->req = req;
 
 	switch (req->type) {
@@ -1164,8 +1211,9 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
+	mtx_assert(&qpair->lock, MA_OWNED);
 
-	qpair->is_enabled = true;
+	qpair->recovery_state = RECOVERY_NONE;
 }
 
 void
@@ -1208,7 +1256,9 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
 		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
 	}
 
+	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
+	mtx_unlock(&qpair->lock);
 }
 
 void
@@ -1251,12 +1301,13 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 static void
 nvme_qpair_disable(struct nvme_qpair *qpair)
 {
-	struct nvme_tracker	*tr;
+	struct nvme_tracker	*tr, *tr_temp;
 
-	qpair->is_enabled = false;
 	mtx_lock(&qpair->lock);
-	TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
-		callout_stop(&tr->timer);
+	qpair->recovery_state = RECOVERY_WAITING;
+	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+		tr->deadline = SBT_MAX;
+	}
 	mtx_unlock(&qpair->lock);
 }
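
A closing note on the deadline arithmetic the submit path now uses: sbintime_t is FreeBSD's signed 64-bit, 32.32 fixed-point count of seconds, so getsbinuptime() + timeout * SBT_1S is plain integer math, and SBT_MAX serves as "never times out". A userland model of that computation, with stand-in names since the sys/time.h types are not available here:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-ins: a signed 64-bit, 32.32 fixed-point second count. */
	typedef int64_t sbt_t;
	#define	SBT_ONE_SEC	((sbt_t)1 << 32)	/* models SBT_1S */
	#define	SBT_NEVER	INT64_MAX		/* models SBT_MAX */

	/* The submit path's deadline computation, modeled in userland. */
	static sbt_t
	deadline_for(sbt_t now, int timeout_sec, bool has_timeout)
	{
		if (!has_timeout)
			return (SBT_NEVER);	/* untimed requests never expire */
		return (now + (sbt_t)timeout_sec * SBT_ONE_SEC);
	}

	int
	main(void)
	{
		sbt_t now = (sbt_t)1000 << 32;	/* pretend uptime of 1000 s */

		printf("deadline:   %jd\n", (intmax_t)deadline_for(now, 30, true));
		printf("no timeout: %jd\n", (intmax_t)deadline_for(now, 30, false));
		return (0);
	}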