From 0ac2e39d572ac7aa35dee7adbf32f7178f29c8a3 Mon Sep 17 00:00:00 2001 From: imp Date: Fri, 16 Mar 2018 05:23:48 +0000 Subject: [PATCH] Try polling the qpairs on timeout. On some systems, we're getting timeouts when we use multiple queues on drives that work perfectly well on other systems. On a hunch, Jim Harris suggested I poll the completion queue when we get a timeout. This patch polls the completion queue if no fatal status was indicated. If it had pending I/O, we complete that request and return. Otherwise, if aborts are enabled and no fatal status, we abort the command and return. Otherwise we reset the card. This may clear up the problem, or we may see it result in lots of timeouts and a performance problem. Either way, we'll know the next step. We may also need to pay attention to the fatal status bit of the controller. PR: 211713 Suggested by: Jim Harris Sponsored by: Netflix --- sys/dev/nvme/nvme_private.h | 2 +- sys/dev/nvme/nvme_qpair.c | 32 +++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 348737231e89..5bc7b26c1af6 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -426,7 +426,7 @@ int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, struct nvme_controller *ctrlr); void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr); -void nvme_qpair_process_completions(struct nvme_qpair *qpair); +bool nvme_qpair_process_completions(struct nvme_qpair *qpair); void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); void nvme_qpair_reset(struct nvme_qpair *qpair); diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c index 1d79b413ba04..576110d100ae 100644 --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -469,11 +469,12 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, nvme_free_request(req); } -void +bool nvme_qpair_process_completions(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_completion cpl; + int done = 0; qpair->num_intr_handler_calls++; @@ -484,7 +485,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair) * associated with this interrupt will get retried when the * reset is complete. */ - return; + return (false); while (1) { cpl = qpair->cpl[qpair->cq_head]; @@ -500,6 +501,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair) if (tr != NULL) { nvme_qpair_complete_tracker(qpair, tr, &cpl, TRUE); qpair->sq_head = cpl.sqhd; + done++; } else { nvme_printf(qpair->ctrlr, "cpl does not map to outstanding cmd\n"); @@ -516,6 +518,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair) nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl, qpair->cq_head); } + return (done != 0); } static void @@ -770,19 +773,30 @@ nvme_timeout(void *arg) uint32_t csts; uint8_t cfs; - /* Read csts to get value of cfs - controller fatal status. */ + /* + * Read csts to get value of cfs - controller fatal status. + * If no fatal status, try to call the completion routine, and + * if completes transactions, report a missed interrupt and + * return (this may need to be rate limited). Otherwise, if + * aborts are enabled and the controller is not reporting + * fatal status, abort the command. Otherwise, just reset the + * controller and hope for the best. + */ csts = nvme_mmio_read_4(ctrlr, csts); - cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; + if (cfs == 0 && nvme_qpair_process_completions(qpair)) { + nvme_printf(ctrlr, "Missing interrupt\n"); + return; + } if (ctrlr->enable_aborts && cfs == 0) { - /* - * If aborts are enabled, only use them if the controller is - * not reporting fatal status. - */ + nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, nvme_abort_complete, tr); - } else + } else { + nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", + cfs ? " and fatal error status" : ""); nvme_ctrlr_reset(ctrlr); + } } void