bdev/nvme: Allow the user to control the I/O timeout behavior
The user can now specify not only an optional timeout for commands, but also the action to take when a timeout is detected.

Change-Id: I7d7cdd846d580e0b3a5f733d398ee9b19d6fe034
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
This commit is contained in:
parent
193f4f8392
commit
acd0b4573d
@ -100,10 +100,13 @@
|
||||
# The number of attempts per I/O when an I/O fails. Do not include
|
||||
# this key to get the default behavior.
|
||||
NvmeRetryCount 4
|
||||
# Registers the application to receive timeout callback and to reset the controller.
|
||||
ResetControllerOnTimeout Yes
|
||||
# Timeout value.
|
||||
NvmeTimeoutValue 30
|
||||
# Timeout for each command, in seconds. If 0, don't track timeouts.
|
||||
NvmeTimeoutValue 0
|
||||
# Action to take on command timeout. Only valid when NvmeTimeoutValue is greater
|
||||
# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort
|
||||
# the command, or 'None' to just print a message but do nothing.
|
||||
# Admin command timeouts will always result in a reset.
|
||||
ActionOnTimeout None
|
||||
# Set how often the admin queue is polled for asynchronous events.
|
||||
# Units in microseconds.
|
||||
AdminPollRate 100000
|
||||
|
@ -91,10 +91,13 @@
|
||||
# The number of attempts per I/O when an I/O fails. Do not include
|
||||
# this key to get the default behavior.
|
||||
NvmeRetryCount 4
|
||||
# Registers the application to receive timeout callback and to reset the controller.
|
||||
ResetControllerOnTimeout Yes
|
||||
# Timeout value.
|
||||
NvmeTimeoutValue 30
|
||||
# Timeout for each command, in seconds. If 0, don't track timeouts.
|
||||
NvmeTimeoutValue 0
|
||||
# Action to take on command timeout. Only valid when NvmeTimeoutValue is greater
|
||||
# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort
|
||||
# the command, or 'None' to just print a message but do nothing.
|
||||
# Admin command timeouts will always result in a reset.
|
||||
ActionOnTimeout None
|
||||
# Set how often the admin queue is polled for asynchronous events.
|
||||
# Units in microseconds.
|
||||
AdminPollRate 100000
|
||||
|
@ -79,13 +79,13 @@
|
||||
# The number of attempts per I/O when an I/O fails. Do not include
|
||||
# this key to get the default behavior.
|
||||
NvmeRetryCount 4
|
||||
# The maximum number of NVMe controllers to claim. Do not include this key to
|
||||
# claim all of them.
|
||||
NumControllers 2
|
||||
# Registers the application to receive timeout callback and to reset the controller.
|
||||
ResetControllerOnTimeout Yes
|
||||
# Timeout value.
|
||||
NvmeTimeoutValue 30
|
||||
# Timeout for each command, in seconds. If 0, don't track timeouts.
|
||||
NvmeTimeoutValue 0
|
||||
# Action to take on command timeout. Only valid when NvmeTimeoutValue is greater
|
||||
# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort
|
||||
# the command, or 'None' to just print a message but do nothing.
|
||||
# Admin command timeouts will always result in a reset.
|
||||
ActionOnTimeout None
|
||||
# Set how often the admin queue is polled for asynchronous events.
|
||||
# Units in microseconds.
|
||||
AdminPollRate 100000
|
||||
|
@ -111,8 +111,14 @@ struct nvme_probe_ctx {
|
||||
const char *names[NVME_MAX_CONTROLLERS];
|
||||
};
|
||||
|
||||
/*
 * Action selected through the "ActionOnTimeout" configuration key and
 * dispatched on by timeout_cb() when a command timeout is reported.
 */
enum timeout_action {
|
||||
/* Default value; timeout_cb() takes no recovery action for it. */
TIMEOUT_ACTION_NONE = 0,
|
||||
/* Reset the controller with spdk_nvme_ctrlr_reset(). */
TIMEOUT_ACTION_RESET,
|
||||
/*
 * Try to abort the timed-out command. timeout_cb() falls through to the
 * reset path when there is no qpair or the abort cannot be submitted.
 */
TIMEOUT_ACTION_ABORT,
|
||||
};
|
||||
|
||||
static int g_hot_insert_nvme_controller_index = 0;
|
||||
static bool g_reset_controller_on_timeout = false;
|
||||
static enum timeout_action g_action_on_timeout = TIMEOUT_ACTION_NONE;
|
||||
static int g_timeout = 0;
|
||||
static int g_nvme_adminq_poll_timeout_us = 0;
|
||||
static bool g_nvme_hotplug_enabled;
|
||||
@ -559,6 +565,21 @@ probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Completion callback for the abort command submitted by timeout_cb().
 *
 * ctx - the struct spdk_nvme_ctrlr the abort was issued on (timeout_cb()
 *       passes the controller as the callback argument).
 * cpl - completion entry of the abort command.
 *
 * When the abort completed with an error, the controller is reset instead;
 * a failure of that reset is only logged.
 */
static void
|
||||
spdk_nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
|
||||
{
|
||||
struct spdk_nvme_ctrlr *ctrlr = ctx;
|
||||
int rc;
|
||||
|
||||
/* A failed abort escalates to a full controller reset. */
if (spdk_nvme_cpl_is_error(cpl)) {
|
||||
SPDK_WARNLOG("Abort failed. Resetting controller.\n");
|
||||
rc = spdk_nvme_ctrlr_reset(ctrlr);
|
||||
if (rc) {
|
||||
SPDK_ERRLOG("Resetting controller failed.\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
|
||||
struct spdk_nvme_qpair *qpair, uint16_t cid)
|
||||
@ -567,9 +588,27 @@ timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
|
||||
|
||||
SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
|
||||
|
||||
rc = spdk_nvme_ctrlr_reset(ctrlr);
|
||||
if (rc) {
|
||||
SPDK_ERRLOG("resetting controller failed\n");
|
||||
switch (g_action_on_timeout) {
|
||||
case TIMEOUT_ACTION_ABORT:
|
||||
if (qpair) {
|
||||
rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
|
||||
spdk_nvme_abort_cpl, ctrlr);
|
||||
if (rc == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
SPDK_ERRLOG("Unable to send abort. Resetting.\n");
|
||||
}
|
||||
|
||||
/* Fallthrough */
|
||||
case TIMEOUT_ACTION_RESET:
|
||||
rc = spdk_nvme_ctrlr_reset(ctrlr);
|
||||
if (rc) {
|
||||
SPDK_ERRLOG("Resetting controller failed.\n");
|
||||
}
|
||||
break;
|
||||
case TIMEOUT_ACTION_NONE:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -621,7 +660,7 @@ attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
|
||||
sizeof(struct nvme_io_channel));
|
||||
TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq);
|
||||
|
||||
if (g_reset_controller_on_timeout) {
|
||||
if (g_action_on_timeout != TIMEOUT_ACTION_NONE) {
|
||||
spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_timeout,
|
||||
timeout_cb, NULL);
|
||||
}
|
||||
@ -763,13 +802,32 @@ bdev_nvme_library_init(void)
|
||||
probe_ctx.count++;
|
||||
}
|
||||
|
||||
g_reset_controller_on_timeout =
|
||||
spdk_conf_section_get_boolval(sp, "ResetControllerOnTimeout", false);
|
||||
|
||||
if ((g_timeout = spdk_conf_section_get_intval(sp, "NvmeTimeoutValue")) < 0) {
|
||||
g_timeout = 0;
|
||||
}
|
||||
|
||||
if (g_timeout > 0) {
|
||||
val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
|
||||
if (val != NULL) {
|
||||
if (!strcasecmp(val, "Reset")) {
|
||||
g_action_on_timeout = TIMEOUT_ACTION_RESET;
|
||||
} else if (!strcasecmp(val, "Abort")) {
|
||||
g_action_on_timeout = TIMEOUT_ACTION_ABORT;
|
||||
}
|
||||
} else {
|
||||
/* Handle old name for backward compatibility */
|
||||
val = spdk_conf_section_get_val(sp, "ResetControllerOnTimeout");
|
||||
if (val) {
|
||||
SPDK_WARNLOG("ResetControllerOnTimeout was renamed to ActionOnTimeout\n");
|
||||
SPDK_WARNLOG("Please update your configuration file\n");
|
||||
|
||||
if (spdk_conf_section_get_boolval(sp, "ResetControllerOnTimeout", false)) {
|
||||
g_action_on_timeout = TIMEOUT_ACTION_RESET;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
g_nvme_adminq_poll_timeout_us = spdk_conf_section_get_intval(sp, "AdminPollRate");
|
||||
if (g_nvme_adminq_poll_timeout_us <= 0) {
|
||||
g_nvme_adminq_poll_timeout_us = 1000000;
|
||||
|
Loading…
x
Reference in New Issue
Block a user