Add controller reset capability to nvme(4) and ability to explicitly
invoke it from nvmecontrol(8).

Controller reset will be performed in cases where I/Os are repeatedly
timing out, the controller reports an unrecoverable condition, or
when explicitly requested via IOCTL or an nvme consumer.  Since the
controller may be in such a state where it cannot even process queue
deletion requests, we will perform a controller reset without trying
to clean up anything on the controller first.

Sponsored by:	Intel
Reviewed by:	carl
This commit is contained in:
jimharris 2013-03-26 19:50:46 +00:00
parent bd33256583
commit 93fd264895
8 changed files with 265 additions and 101 deletions

View File

@ -33,7 +33,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd September 17, 2012
.Dd March 26, 2013
.Dt NVMECONTROL 8
.Os
.Sh NAME
@ -54,7 +54,10 @@
.Op Fl p
.Aq Fl s Ar size_in_bytes
.Aq Fl t Ar time_in_sec
.Aq device id
.Aq namespace id
.Nm
.Ic reset
.Aq controller id
.Sh DESCRIPTION
NVM Express (NVMe) is a storage protocol standard, for SSDs and other
high-speed storage devices over PCI Express.
@ -62,6 +65,7 @@ high-speed storage devices over PCI Express.
.Dl nvmecontrol devlist
.Pp
Display a list of NVMe controllers and namespaces along with their device nodes.
.Pp
.Dl nvmecontrol identify nvme0
.Pp
Display a human-readable summary of the nvme0 IDENTIFY_CONTROLLER data.
@ -76,6 +80,10 @@ Display a hexadecimal dump of the nvme0 IDENTIFY_NAMESPACE data for namespace
Run a performance test on nvme0ns1 using 32 kernel threads for 30 seconds. Each
thread will issue a single 512 byte read command. Results are printed to
stdout when 30 seconds expires.
.Pp
.Dl nvmecontrol reset nvme0
.Pp
Perform a controller-level reset of the nvme0 controller.
.Sh AUTHORS
.An -nosplit
.Nm

View File

@ -56,6 +56,9 @@ __FBSDID("$FreeBSD$");
" <-i intr|wait> [-f refthread] [-p]\n" \
" <namespace id>\n"
#define RESET_USAGE \
" nvmecontrol reset <controller id>\n"
static void perftest_usage(void);
static void
@ -64,6 +67,7 @@ usage(void)
fprintf(stderr, "usage:\n");
fprintf(stderr, DEVLIST_USAGE);
fprintf(stderr, IDENTIFY_USAGE);
fprintf(stderr, RESET_USAGE);
fprintf(stderr, PERFTEST_USAGE);
exit(EX_USAGE);
}
@ -580,6 +584,41 @@ perftest(int argc, char *argv[])
exit(EX_OK);
}
/*
 * Handle "nvmecontrol reset <controller id>": build /dev/<controller id>,
 * open it and issue the NVME_RESET_CONTROLLER ioctl.
 *
 * argc/argv are the subcommand's own arguments (argv[0] is "reset").
 * This function does not return: it exits with EX_OK on success, EX_IOERR
 * when the device node is invalid or the ioctl fails, EX_NOPERM when the
 * node cannot be opened, and goes through usage() when an option is given
 * or the controller id is missing.
 */
static void
reset_ctrlr(int argc, char *argv[])
{
	struct stat	devstat;
	char		path[64];
	int		ch, fd, len;

	/* No options are accepted; anything getopt reports is a usage error. */
	while ((ch = getopt(argc, argv, "")) != -1) {
		switch ((char)ch) {
		default:
			usage();
		}
	}

	/* The controller id is mandatory; without it argv[optind] is NULL. */
	if (optind >= argc)
		usage();

	/* Bound the copy: the id comes straight from the command line. */
	len = snprintf(path, sizeof(path), "/dev/%s", argv[optind]);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		printf("Invalid device node '/dev/%s'.\n", argv[optind]);
		exit(EX_IOERR);
	}

	if (stat(path, &devstat) != 0) {
		printf("Invalid device node '%s'.\n", path);
		exit(EX_IOERR);
	}

	fd = open(path, O_RDWR);
	if (fd < 0) {
		printf("Could not open %s.\n", path);
		exit(EX_NOPERM);
	}

	if (ioctl(fd, NVME_RESET_CONTROLLER) == -1) {
		printf("ioctl to %s failed.\n", path);
		exit(EX_IOERR);
	}

	exit(EX_OK);
}
int
main(int argc, char *argv[])
{
@ -593,6 +632,8 @@ main(int argc, char *argv[])
identify(argc-1, &argv[1]);
else if (strcmp(argv[1], "perftest") == 0)
perftest(argc-1, &argv[1]);
else if (strcmp(argv[1], "reset") == 0)
reset_ctrlr(argc-1, &argv[1]);
usage();

View File

@ -255,7 +255,7 @@ nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
}
}
nvme_qpair_submit_cmd(tr->qpair, tr);
nvme_qpair_submit_tracker(tr->qpair, tr);
}
static int
@ -274,11 +274,11 @@ nvme_attach(device_t dev)
* to cc.en==0. This is because we don't really know what status
* the controller was left in when boot handed off to OS.
*/
status = nvme_ctrlr_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
if (status != 0)
return (status);
status = nvme_ctrlr_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
if (status != 0)
return (status);

View File

@ -37,6 +37,7 @@
#define NVME_IDENTIFY_NAMESPACE _IOR('n', 1, struct nvme_namespace_data)
#define NVME_IO_TEST _IOWR('n', 2, struct nvme_io_test)
#define NVME_BIO_TEST _IOWR('n', 4, struct nvme_io_test)
#define NVME_RESET_CONTROLLER _IO('n', 5)
/*
* Use to mark a command to apply to all namespaces, or to retrieve global

View File

@ -405,13 +405,31 @@ nvme_ctrlr_enable(struct nvme_controller *ctrlr)
}
/*
 * Hardware-level controller reset: stop the admin queue pair and every I/O
 * queue pair, then disable and re-enable the controller.
 *
 * Returns whatever nvme_ctrlr_enable() returns.
 */
int
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
int i;
/* Quiesce the driver side first: admin queue, then each I/O queue. */
nvme_admin_qpair_disable(&ctrlr->adminq);
for (i = 0; i < ctrlr->num_io_queues; i++)
nvme_io_qpair_disable(&ctrlr->ioq[i]);
/* NOTE(review): presumably a 100 ms settle time (DELAY in microseconds) - confirm. */
DELAY(100*1000);
nvme_ctrlr_disable(ctrlr);
return (nvme_ctrlr_enable(ctrlr));
}
/*
 * Full controller reset: perform the hardware reset and, if it succeeded,
 * run the controller start sequence again.  The delay after the hardware
 * reset is taken whether or not the reset succeeded.
 */
void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
	int hw_status;

	hw_status = nvme_ctrlr_hw_reset(ctrlr);
	DELAY(100*1000);

	/* A failed hardware reset leaves the controller un-started. */
	if (hw_status != 0)
		return;

	nvme_ctrlr_start(ctrlr);
}
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
@ -626,6 +644,9 @@ void
nvme_ctrlr_start(void *ctrlr_arg)
{
struct nvme_controller *ctrlr = ctrlr_arg;
int i;
nvme_admin_qpair_enable(&ctrlr->adminq);
if (nvme_ctrlr_identify(ctrlr) != 0)
goto err;
@ -642,18 +663,28 @@ nvme_ctrlr_start(void *ctrlr_arg)
nvme_ctrlr_configure_aer(ctrlr);
nvme_ctrlr_configure_int_coalescing(ctrlr);
for (i = 0; i < ctrlr->num_io_queues; i++)
nvme_io_qpair_enable(&ctrlr->ioq[i]);
ctrlr->is_started = TRUE;
err:
if (ctrlr->num_start_attempts == 0) {
/*
* Initialize sysctls, even if controller failed to start, to
* assist with debugging admin queue pair.
* assist with debugging admin queue pair. Only run this
* code on the initial start attempt though, and not
* subsequent start attempts due to controller-level resets.
*
*/
nvme_sysctl_initialize_ctrlr(ctrlr);
config_intrhook_disestablish(&ctrlr->config_hook);
}
ctrlr->num_start_attempts++;
}
static void
nvme_ctrlr_intx_handler(void *arg)
{
@ -730,6 +761,9 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
return (ENXIO);
memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
break;
case NVME_RESET_CONTROLLER:
nvme_ctrlr_reset(ctrlr);
break;
default:
return (ENOTTY);
}
@ -752,6 +786,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
ctrlr->dev = dev;
ctrlr->is_started = FALSE;
ctrlr->num_start_attempts = 0;
status = nvme_ctrlr_allocate_bar(ctrlr);
@ -835,14 +870,10 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
struct nvme_namespace *ns;
int i;
for (i = 0; i < NVME_MAX_NAMESPACES; i++) {
ns = &ctrlr->ns[i];
if (ns->cdev)
destroy_dev(ns->cdev);
}
for (i = 0; i < NVME_MAX_NAMESPACES; i++)
nvme_ns_destruct(&ctrlr->ns[i]);
if (ctrlr->cdev)
destroy_dev(ctrlr->cdev);
@ -853,13 +884,6 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
free(ctrlr->ioq, M_NVME);
/* Manually abort outstanding async event requests. */
for (i = 0; i < ctrlr->num_aers; i++) {
nvme_qpair_manual_abort_request(&ctrlr->adminq,
ctrlr->aer[i].req, NVME_SCT_GENERIC,
NVME_SC_ABORTED_SQ_DELETION, FALSE);
}
nvme_admin_qpair_destroy(&ctrlr->adminq);
if (ctrlr->resource != NULL) {

View File

@ -345,6 +345,13 @@ nvme_ns_construct(struct nvme_namespace *ns, uint16_t id,
if (ctrlr->cdata.vwc.present)
ns->flags |= NVME_NS_FLUSH_SUPPORTED;
/*
* cdev may have already been created, if we are reconstructing the
* namespace after a controller-level reset.
*/
if (ns->cdev != NULL)
return (0);
/*
* MAKEDEV_ETERNAL was added in r210923, for cdevs that will never
* be destroyed. This avoids refcounting on the cdev object.
@ -361,9 +368,15 @@ nvme_ns_construct(struct nvme_namespace *ns, uint16_t id,
device_get_unit(ctrlr->dev), ns->id);
#endif
if (ns->cdev) {
if (ns->cdev != NULL)
ns->cdev->si_drv1 = ns;
}
return (0);
}
/*
 * Destroy the namespace's character device, if one was created.
 *
 * The cdev pointer is cleared afterwards: nvme_ns_construct() treats a
 * non-NULL ns->cdev as "device already exists", so a stale pointer left
 * here would be mistaken for a live device, and a second destruct would
 * destroy the same cdev twice.
 */
void
nvme_ns_destruct(struct nvme_namespace *ns)
{
	if (ns->cdev == NULL)
		return;

	destroy_dev(ns->cdev);
	ns->cdev = NULL;
}

View File

@ -180,6 +180,8 @@ struct nvme_qpair {
struct nvme_tracker **act_tr;
boolean_t is_enabled;
struct mtx lock __aligned(CACHE_LINE_SIZE);
} __aligned(CACHE_LINE_SIZE);
@ -233,6 +235,7 @@ struct nvme_controller {
struct intr_config_hook config_hook;
uint32_t ns_identified;
uint32_t queues_created;
uint32_t num_start_attempts;
/* For shared legacy interrupt. */
int rid;
@ -361,7 +364,8 @@ void nvme_payload_map_uio(void *arg, bus_dma_segment_t *seg, int nseg,
int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev);
void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev);
int nvme_ctrlr_reset(struct nvme_controller *ctrlr);
int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr);
void nvme_ctrlr_reset(struct nvme_controller *ctrlr);
/* ctrlr defined as void * to allow use with config_intrhook. */
void nvme_ctrlr_start(void *ctrlr_arg);
void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
@ -373,21 +377,23 @@ void nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
uint16_t vector, uint32_t num_entries,
uint32_t num_trackers, uint32_t max_xfer_size,
struct nvme_controller *ctrlr);
void nvme_qpair_submit_cmd(struct nvme_qpair *qpair,
void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
struct nvme_tracker *tr);
void nvme_qpair_process_completions(struct nvme_qpair *qpair);
void nvme_qpair_submit_request(struct nvme_qpair *qpair,
struct nvme_request *req);
void nvme_qpair_manual_abort_request(struct nvme_qpair *qpair,
struct nvme_request *req, uint32_t sct,
uint32_t sc, boolean_t print_on_error);
void nvme_admin_qpair_enable(struct nvme_qpair *qpair);
void nvme_admin_qpair_disable(struct nvme_qpair *qpair);
void nvme_admin_qpair_destroy(struct nvme_qpair *qpair);
void nvme_io_qpair_enable(struct nvme_qpair *qpair);
void nvme_io_qpair_disable(struct nvme_qpair *qpair);
void nvme_io_qpair_destroy(struct nvme_qpair *qpair);
int nvme_ns_construct(struct nvme_namespace *ns, uint16_t id,
struct nvme_controller *ctrlr);
void nvme_ns_destruct(struct nvme_namespace *ns);
int nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag);

View File

@ -87,23 +87,6 @@ nvme_completion_is_retry(const struct nvme_completion *cpl)
}
}
/*
 * Scan the qpair's active-tracker table for the tracker carrying req.
 *
 * Returns the matching tracker, or NULL when none of the num_entries slots
 * references req.  req must not be NULL (asserted).
 */
static struct nvme_tracker *
nvme_qpair_find_tracker(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker	*candidate;
	uint32_t		slot;

	KASSERT(req != NULL, ("%s: called with NULL req\n", __func__));

	for (slot = 0; slot < qpair->num_entries; slot++) {
		candidate = qpair->act_tr[slot];
		/* Skip empty slots and trackers that belong to other requests. */
		if (candidate == NULL || candidate->req != req)
			continue;
		return (candidate);
	}

	return (NULL);
}
static void
nvme_qpair_construct_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
uint16_t cid)
@ -147,7 +130,7 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
callout_stop(&tr->timer);
if (retry)
nvme_qpair_submit_cmd(qpair, tr);
nvme_qpair_submit_tracker(qpair, tr);
else {
if (req->payload_size > 0 || req->uio != NULL)
bus_dmamap_unload(qpair->dma_tag,
@ -169,6 +152,21 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
mtx_unlock(&qpair->lock);
}
/*
 * Complete a tracker from software, without a completion from the
 * controller: build a zeroed completion entry carrying the given status
 * code type (sct) and status code (sc) for this tracker's command id and
 * this qpair's id, then hand it to the normal completion path.
 */
static void
nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc,
    boolean_t print_on_error)
{
	struct nvme_completion synthetic;

	memset(&synthetic, 0, sizeof(synthetic));

	/* Status first, then the identifiers tying it to this command. */
	synthetic.sf_sct = sct;
	synthetic.sf_sc = sc;
	synthetic.cid = tr->cid;
	synthetic.sqid = qpair->id;

	nvme_qpair_complete_tracker(qpair, tr, &synthetic, print_on_error);
}
void
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
@ -177,6 +175,15 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
qpair->num_intr_handler_calls++;
if (!qpair->is_enabled)
/*
* qpair is not enabled, likely because a controller reset is
* in progress. Ignore the interrupt - any I/O that was
* associated with this interrupt will get retried when the
* reset is complete.
*/
return;
while (1) {
cpl = &qpair->cpl[qpair->cq_head];
@ -236,15 +243,6 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
qpair->max_xfer_size = max_xfer_size;
qpair->ctrlr = ctrlr;
/*
* First time through the completion queue, HW will set phase
* bit on completions to 1. So set this to 1 here, indicating
* we're looking for a 1 to know which entries have completed.
* we'll toggle the bit each time when the completion queue
* rolls over.
*/
qpair->phase = 1;
if (ctrlr->msix_enabled) {
/*
@ -271,7 +269,6 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
qpair->num_cmds = 0;
qpair->num_intr_handler_calls = 0;
qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
/* TODO: error checking on contigmalloc, bus_dmamap_load calls */
qpair->cmd = contigmalloc(qpair->num_entries *
@ -341,10 +338,30 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
}
}
/*
 * Manually complete every outstanding asynchronous event request on the
 * admin qpair with a generic "aborted - SQ deletion" status.  Trackers for
 * other opcodes are left untouched.
 */
static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *cur;

	cur = TAILQ_FIRST(&qpair->outstanding_tr);
	while (cur != NULL) {
		if (cur->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
			cur = TAILQ_NEXT(cur, tailq);
			continue;
		}

		nvme_qpair_manual_complete_tracker(qpair, cur,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION,
		    FALSE);

		/*
		 * Restart from the head rather than following cur's link;
		 * presumably the completion takes cur off outstanding_tr -
		 * confirm against nvme_qpair_complete_tracker().
		 */
		cur = TAILQ_FIRST(&qpair->outstanding_tr);
	}
}
void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{
nvme_admin_qpair_abort_aers(qpair);
/*
* For NVMe, you don't send delete queue commands for the admin
* queue, so we just need to unload and free the cmd and cpl memory.
@ -412,39 +429,6 @@ nvme_io_qpair_destroy(struct nvme_qpair *qpair)
}
}
/*
 * Abort a tracker from software: fabricate a zeroed completion entry that
 * carries the given status code type (sct) and status code (sc) for this
 * tracker's command id and this qpair's id, and run it through the normal
 * completion path.
 */
static void
nvme_qpair_manual_abort_tracker(struct nvme_qpair *qpair,
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc,
    boolean_t print_on_error)
{
	struct nvme_completion fake_cpl;

	memset(&fake_cpl, 0, sizeof(fake_cpl));

	fake_cpl.cid = tr->cid;
	fake_cpl.sqid = qpair->id;
	fake_cpl.sf_sc = sc;
	fake_cpl.sf_sct = sct;

	nvme_qpair_complete_tracker(qpair, tr, &fake_cpl, print_on_error);
}
/*
 * Abort a request from software by locating its tracker on the qpair and
 * completing that tracker with the given status.  If no tracker carries
 * the request, log the fact, dump the command and do nothing else.
 */
void
nvme_qpair_manual_abort_request(struct nvme_qpair *qpair,
    struct nvme_request *req, uint32_t sct, uint32_t sc,
    boolean_t print_on_error)
{
	struct nvme_tracker *owner;

	owner = nvme_qpair_find_tracker(qpair, req);
	if (owner != NULL) {
		nvme_qpair_manual_abort_tracker(qpair, owner, sct, sc,
		    print_on_error);
		return;
	}

	printf("%s: request not found\n", __func__);
	nvme_dump_command(&req->cmd);
}
static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)
{
@ -463,7 +447,7 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status)
* status, and then complete the I/O's tracker manually.
*/
printf("abort command failed, aborting command manually\n");
nvme_qpair_manual_abort_tracker(tr->qpair, tr,
nvme_qpair_manual_complete_tracker(tr->qpair, tr,
NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE);
}
}
@ -478,10 +462,12 @@ nvme_timeout(void *arg)
}
void
nvme_qpair_submit_cmd(struct nvme_qpair *qpair, struct nvme_tracker *tr)
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
struct nvme_request *req;
mtx_assert(&qpair->lock, MA_OWNED);
req = tr->req;
req->cmd.cid = tr->cid;
qpair->act_tr[tr->cid] = tr;
@ -517,11 +503,14 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
tr = TAILQ_FIRST(&qpair->free_tr);
if (tr == NULL) {
if (tr == NULL || !qpair->is_enabled) {
/*
* No tracker is available. Put the request on the qpair's
* request queue to be processed when a tracker frees up
* via a command completion.
* No tracker is available, or the qpair is disabled due to
* an in-progress controller-level reset.
*
* Put the request on the qpair's request queue to be processed
* when a tracker frees up via a command completion or when
* the controller reset is completed.
*/
STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
return;
@ -540,7 +529,7 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
if (err != 0)
panic("bus_dmamap_load returned non-zero!\n");
} else
nvme_qpair_submit_cmd(tr->qpair, tr);
nvme_qpair_submit_tracker(tr->qpair, tr);
} else {
err = bus_dmamap_load_uio(tr->qpair->dma_tag,
tr->payload_dma_map, req->uio,
@ -558,3 +547,85 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
_nvme_qpair_submit_request(qpair, req);
mtx_unlock(&qpair->lock);
}
/*
 * Mark the qpair enabled and return its software queue state to the
 * initial values: head/tail indices at zero, expected phase bit at 1, and
 * the submission and completion rings cleared.
 */
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
	qpair->is_enabled = TRUE;

	qpair->cq_head = 0;
	qpair->sq_tail = 0;
	qpair->sq_head = 0;

	/*
	 * On the first pass through the completion queue the hardware
	 * writes completions with the phase bit set to 1, so 1 is the
	 * value to look for when deciding which entries have completed.
	 * The expected value is toggled each time the completion queue
	 * rolls over.
	 */
	qpair->phase = 1;

	/* Wipe both rings so no stale entries are left in them. */
	memset(qpair->cmd, 0,
	    qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	    qpair->num_entries * sizeof(struct nvme_completion));
}
/*
 * Re-enable the admin qpair.  Only the common enable step is performed;
 * unlike the I/O variant, nothing is resubmitted here.
 */
void
nvme_admin_qpair_enable(struct nvme_qpair *qpair)
{

	nvme_qpair_enable(qpair);
}
void
nvme_io_qpair_enable(struct nvme_qpair *qpair)
{
STAILQ_HEAD(, nvme_request) temp;
struct nvme_tracker *tr;
struct nvme_request *req;
mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
nvme_qpair_submit_tracker(qpair, tr);
STAILQ_INIT(&temp);
STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
while (!STAILQ_EMPTY(&temp)) {
req = STAILQ_FIRST(&temp);
STAILQ_REMOVE_HEAD(&temp, stailq);
_nvme_qpair_submit_request(qpair, req);
}
mtx_unlock(&qpair->lock);
}
static void
nvme_qpair_disable(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;
qpair->is_enabled = FALSE;
mtx_lock(&qpair->lock);
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
callout_stop(&tr->timer);
mtx_unlock(&qpair->lock);
}
/*
 * Disable the admin qpair, then manually complete any asynchronous event
 * requests still outstanding on it.
 */
void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{

	nvme_qpair_disable(qpair);

	/* Abort the outstanding AERs only once the qpair is marked disabled. */
	nvme_admin_qpair_abort_aers(qpair);
}
/*
 * Disable an I/O qpair.  Only the common disable step is performed.
 */
void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{

	nvme_qpair_disable(qpair);
}