Keep track of the number of commands that exhaust their retry limit.
Although we print failure messages on the console, logs are sometimes lost or overwhelmed. Keeping a count of how many retriable commands have ultimately failed helps gauge the magnitude of the problem.
This commit is contained in:
parent
64b34948ed
commit
7f1c525519
@ -191,6 +191,7 @@ struct nvme_qpair {
|
||||
int64_t num_cmds;
|
||||
int64_t num_intr_handler_calls;
|
||||
int64_t num_retries;
|
||||
int64_t num_failures;
|
||||
|
||||
struct nvme_command *cmd;
|
||||
struct nvme_completion *cpl;
|
||||
|
@ -387,14 +387,16 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
|
||||
struct nvme_completion *cpl, error_print_t print_on_error)
|
||||
{
|
||||
struct nvme_request *req;
|
||||
boolean_t retry, error;
|
||||
boolean_t retry, error, retriable;
|
||||
|
||||
req = tr->req;
|
||||
error = nvme_completion_is_error(cpl);
|
||||
retry = error && nvme_completion_is_retry(cpl) &&
|
||||
req->retries < nvme_retry_count;
|
||||
retriable = nvme_completion_is_retry(cpl);
|
||||
retry = error && retriable && req->retries < nvme_retry_count;
|
||||
if (retry)
|
||||
qpair->num_retries++;
|
||||
if (error && req->retries >= nvme_retry_count && retriable)
|
||||
qpair->num_failures++;
|
||||
|
||||
if (error && (print_on_error == ERROR_PRINT_ALL ||
|
||||
(!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
|
||||
@ -687,6 +689,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
|
||||
qpair->num_cmds = 0;
|
||||
qpair->num_intr_handler_calls = 0;
|
||||
qpair->num_retries = 0;
|
||||
qpair->num_failures = 0;
|
||||
qpair->cmd = (struct nvme_command *)queuemem;
|
||||
qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
|
||||
prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
|
||||
|
@ -167,6 +167,7 @@ nvme_qpair_reset_stats(struct nvme_qpair *qpair)
|
||||
qpair->num_cmds = 0;
|
||||
qpair->num_intr_handler_calls = 0;
|
||||
qpair->num_retries = 0;
|
||||
qpair->num_failures = 0;
|
||||
}
|
||||
|
||||
static int
|
||||
@ -214,6 +215,21 @@ nvme_sysctl_num_retries(SYSCTL_HANDLER_ARGS)
|
||||
return (sysctl_handle_64(oidp, &num_retries, 0, req));
|
||||
}
|
||||
|
||||
/*
 * Sysctl handler reporting a controller-wide failure total.
 *
 * arg1 is used as the struct nvme_controller pointer. The reported value is
 * the admin queue's num_failures plus the num_failures of each of the
 * controller's num_io_queues I/O queues. The sum is kept in a local and
 * passed to sysctl_handle_64(), whose return value is returned unchanged.
 */
static int
|
||||
nvme_sysctl_num_failures(SYSCTL_HANDLER_ARGS)
|
||||
{
|
||||
struct nvme_controller *ctrlr = arg1;
|
||||
int64_t num_failures = 0;
|
||||
int i;
|
||||
|
||||
/* Start from the admin queue's count (overwrites the 0 initializer). */
num_failures = ctrlr->adminq.num_failures;
|
||||
|
||||
/* Add the count from every I/O queue. */
for (i = 0; i < ctrlr->num_io_queues; i++)
|
||||
num_failures += ctrlr->ioq[i].num_failures;
|
||||
|
||||
return (sysctl_handle_64(oidp, &num_failures, 0, req));
|
||||
}
|
||||
|
||||
static int
|
||||
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
|
||||
{
|
||||
@ -267,6 +283,9 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
|
||||
"coalescing)");
|
||||
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_retries",
|
||||
CTLFLAG_RD, &qpair->num_retries, "Number of commands retried");
|
||||
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_failures",
|
||||
CTLFLAG_RD, &qpair->num_failures,
|
||||
"Number of commands ending in failure after all retries");
|
||||
|
||||
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
|
||||
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW, qpair, 0,
|
||||
@ -323,6 +342,11 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
|
||||
ctrlr, 0, nvme_sysctl_num_retries, "IU",
|
||||
"Number of commands retried");
|
||||
|
||||
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
|
||||
"num_failures", CTLTYPE_S64 | CTLFLAG_RD,
|
||||
ctrlr, 0, nvme_sysctl_num_failures, "IU",
|
||||
"Number of commands ending in failure after all retries");
|
||||
|
||||
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
|
||||
"reset_stats", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0,
|
||||
nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");
|
||||
|
Loading…
Reference in New Issue
Block a user