From 7f1c5255199753e2b34f3179a95852e22e1bb7e4 Mon Sep 17 00:00:00 2001 From: imp Date: Fri, 19 Jul 2019 18:39:24 +0000 Subject: [PATCH] Keep track of the number of commands that exhaust their retry limit. While we print failure messages on the console, sometimes logs are lost or overwhelmed. Keeping a count of how many times we've failed retriable commands helps get a magnitude of the problem. --- sys/dev/nvme/nvme_private.h | 1 + sys/dev/nvme/nvme_qpair.c | 9 ++++++--- sys/dev/nvme/nvme_sysctl.c | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index e7e923f2e0bc..854750cf9fbf 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -191,6 +191,7 @@ struct nvme_qpair { int64_t num_cmds; int64_t num_intr_handler_calls; int64_t num_retries; + int64_t num_failures; struct nvme_command *cmd; struct nvme_completion *cpl; diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c index 7c28f04213a7..cabaeba33918 100644 --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -387,14 +387,16 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, struct nvme_completion *cpl, error_print_t print_on_error) { struct nvme_request *req; - boolean_t retry, error; + boolean_t retry, error, retriable; req = tr->req; error = nvme_completion_is_error(cpl); - retry = error && nvme_completion_is_retry(cpl) && - req->retries < nvme_retry_count; + retriable = nvme_completion_is_retry(cpl); + retry = error && retriable && req->retries < nvme_retry_count; if (retry) qpair->num_retries++; + if (error && req->retries >= nvme_retry_count && retriable) + qpair->num_failures++; if (error && (print_on_error == ERROR_PRINT_ALL || (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) { @@ -687,6 +689,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, qpair->num_cmds = 0; qpair->num_intr_handler_calls = 0; qpair->num_retries = 0; + qpair->num_failures = 0; qpair->cmd = (struct nvme_command *)queuemem; qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz); prpmem = (uint8_t *)(queuemem + cmdsz + cplsz); diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c index b09fa20dd1ee..15da68edcf4c 100644 --- a/sys/dev/nvme/nvme_sysctl.c +++ b/sys/dev/nvme/nvme_sysctl.c @@ -167,6 +167,7 @@ nvme_qpair_reset_stats(struct nvme_qpair *qpair) qpair->num_cmds = 0; qpair->num_intr_handler_calls = 0; qpair->num_retries = 0; + qpair->num_failures = 0; } static int @@ -214,6 +215,21 @@ nvme_sysctl_num_retries(SYSCTL_HANDLER_ARGS) return (sysctl_handle_64(oidp, &num_retries, 0, req)); } +static int +nvme_sysctl_num_failures(SYSCTL_HANDLER_ARGS) +{ + struct nvme_controller *ctrlr = arg1; + int64_t num_failures = 0; + int i; + + num_failures = ctrlr->adminq.num_failures; + + for (i = 0; i < ctrlr->num_io_queues; i++) + num_failures += ctrlr->ioq[i].num_failures; + + return (sysctl_handle_64(oidp, &num_failures, 0, req)); +} + static int nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS) { @@ -267,6 +283,9 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair, "coalescing)"); SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_retries", CTLFLAG_RD, &qpair->num_retries, "Number of commands retried"); + SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_failures", + CTLFLAG_RD, &qpair->num_failures, + "Number of commands ending in failure after all retries"); SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO, "dump_debug", CTLTYPE_UINT | CTLFLAG_RW, qpair, 0, @@ -323,6 +342,11 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr) ctrlr, 0, nvme_sysctl_num_retries, "IU", "Number of commands retried"); + SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, + "num_failures", CTLTYPE_S64 | CTLFLAG_RD, + ctrlr, 0, nvme_sysctl_num_failures, "IU", + "Number of commands ending in failure after all retries"); + SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, "reset_stats", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");