From 232e2edb6cd50465bd62b9f7cbc9fd83531c02ca Mon Sep 17 00:00:00 2001
From: Jim Harris <jimharris@FreeBSD.org>
Date: Tue, 26 Mar 2013 21:58:38 +0000
Subject: [PATCH] Add the ability to internally mark a controller as failed,
 if it is unable to start or reset.

Also add a notifier for NVMe consumers for controller fail conditions and
plumb this notifier for nvd(4) to destroy the associated GEOM disks when a
failure occurs.

This requires a bit of work to cover the races when a consumer is sending
I/O requests to a controller that is transitioning to the failed state.  To
help cover this condition, add a task to defer completion of I/Os submitted
to a failed controller, so that the consumer will still always receive its
completions in a different context than the submission.

Sponsored by:	Intel
Reviewed by:	carl
---
 sys/dev/nvd/nvd.c           |  22 +++++++-
 sys/dev/nvme/nvme.c         |  18 ++++++-
 sys/dev/nvme/nvme.h         |   4 +-
 sys/dev/nvme/nvme_ctrlr.c   |  73 +++++++++++++++++++++++---
 sys/dev/nvme/nvme_private.h |  17 +++++-
 sys/dev/nvme/nvme_qpair.c   | 101 ++++++++++++++++++++++++++++++++----
 6 files changed, 213 insertions(+), 22 deletions(-)

diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c
index 76ec5f7c9de7..09886b42ff30 100644
--- a/sys/dev/nvd/nvd.c
+++ b/sys/dev/nvd/nvd.c
@@ -49,6 +49,7 @@ static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
 static void destroy_geom_disk(struct nvd_disk *ndisk);
 
 static void *nvd_new_controller(struct nvme_controller *ctrlr);
+static void nvd_controller_fail(void *ctrlr);
 
 static int nvd_load(void);
 static void nvd_unload(void);
@@ -118,7 +119,7 @@ nvd_load()
 	TAILQ_INIT(&disk_head);
 
 	consumer_handle = nvme_register_consumer(nvd_new_disk,
-	    nvd_new_controller, NULL);
+	    nvd_new_controller, NULL, nvd_controller_fail);
 
 	return (consumer_handle != NULL ? 0 : -1);
 }
@@ -351,3 +352,22 @@ destroy_geom_disk(struct nvd_disk *ndisk)
 
 	mtx_destroy(&ndisk->bioqlock);
 }
+
+static void
+nvd_controller_fail(void *ctrlr_arg)
+{
+	struct nvd_controller	*ctrlr = ctrlr_arg;
+	struct nvd_disk		*disk;
+
+	while (!TAILQ_EMPTY(&ctrlr->disk_head)) {
+		disk = TAILQ_FIRST(&ctrlr->disk_head);
+		TAILQ_REMOVE(&disk_head, disk, global_tailq);
+		TAILQ_REMOVE(&ctrlr->disk_head, disk, ctrlr_tailq);
+		destroy_geom_disk(disk);
+		free(disk, M_NVD);
+	}
+
+	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
+	free(ctrlr, M_NVD);
+}
+
diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c
index 27870106bf48..afebae735ef2 100644
--- a/sys/dev/nvme/nvme.c
+++ b/sys/dev/nvme/nvme.c
@@ -44,6 +44,7 @@ struct nvme_consumer {
 	nvme_cons_ns_fn_t	ns_fn;
 	nvme_cons_ctrlr_fn_t	ctrlr_fn;
 	nvme_cons_async_fn_t	async_fn;
+	nvme_cons_fail_fn_t	fail_fn;
 };
 
 struct nvme_consumer nvme_consumer[NVME_MAX_CONSUMERS];
@@ -349,9 +350,23 @@ nvme_notify_async_consumers(struct nvme_controller *ctrlr,
 	}
 }
 
+void
+nvme_notify_fail_consumers(struct nvme_controller *ctrlr)
+{
+	struct nvme_consumer	*cons;
+	uint32_t		i;
+
+	for (i = 0; i < NVME_MAX_CONSUMERS; i++) {
+		cons = &nvme_consumer[i];
+		if (cons->id != INVALID_CONSUMER_ID && cons->fail_fn != NULL)
+			cons->fail_fn(ctrlr->cons_cookie[i]);
+	}
+}
+
 struct nvme_consumer *
 nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn,
-		       nvme_cons_async_fn_t async_fn)
+		       nvme_cons_async_fn_t async_fn,
+		       nvme_cons_fail_fn_t fail_fn)
 {
 	int i;
 
@@ -365,6 +380,7 @@ nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn,
 			nvme_consumer[i].ns_fn = ns_fn;
 			nvme_consumer[i].ctrlr_fn = ctrlr_fn;
 			nvme_consumer[i].async_fn = async_fn;
+			nvme_consumer[i].fail_fn = fail_fn;
 
 			nvme_notify_consumer(&nvme_consumer[i]);
 			return (&nvme_consumer[i]);
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index ab82f2f3e0b1..22f996ba8e63 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -733,6 +733,7 @@ typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *);
 typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *);
 typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *,
 				     uint32_t, void *, uint32_t);
+typedef void (*nvme_cons_fail_fn_t)(void *);
 
 enum nvme_namespace_flags {
 	NVME_NS_DEALLOCATE_SUPPORTED	= 0x1,
@@ -769,7 +770,8 @@ int	nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
 /* Registration functions */
 struct nvme_consumer *	nvme_register_consumer(nvme_cons_ns_fn_t    ns_fn,
 					       nvme_cons_ctrlr_fn_t ctrlr_fn,
-					       nvme_cons_async_fn_t async_fn);
+					       nvme_cons_async_fn_t async_fn,
+					       nvme_cons_fail_fn_t  fail_fn);
 void		nvme_unregister_consumer(struct nvme_consumer *consumer);
 
 /* Controller helper functions */
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 248110a81197..19c06e5de34a 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -311,6 +311,45 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 	return (0);
 }
 
+static void
+nvme_ctrlr_fail(struct nvme_controller *ctrlr)
+{
+	int i;
+
+	ctrlr->is_failed = TRUE;
+	nvme_qpair_fail(&ctrlr->adminq);
+	for (i = 0; i < ctrlr->num_io_queues; i++)
+		nvme_qpair_fail(&ctrlr->ioq[i]);
+	nvme_notify_fail_consumers(ctrlr);
+}
+
+void
+nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
+    struct nvme_request *req)
+{
+
+	mtx_lock(&ctrlr->fail_req_lock);
+	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
+	mtx_unlock(&ctrlr->fail_req_lock);
+	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+}
+
+static void
+nvme_ctrlr_fail_req_task(void *arg, int pending)
+{
+	struct nvme_controller	*ctrlr = arg;
+	struct nvme_request	*req;
+
+	mtx_lock(&ctrlr->fail_req_lock);
+	while (!STAILQ_EMPTY(&ctrlr->fail_req)) {
+		req = STAILQ_FIRST(&ctrlr->fail_req);
+		STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
+		nvme_qpair_manual_complete_request(req->qpair, req,
+		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE);
+	}
+	mtx_unlock(&ctrlr->fail_req_lock);
+}
+
 static int
 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
 {
@@ -426,8 +465,12 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 
 	cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
 
-	if (cmpset == 0)
-		/* Controller is already resetting. */
+	if (cmpset == 0 || ctrlr->is_failed)
+		/*
+		 * Controller is already resetting or has failed.  Return
+		 * immediately since there is no need to kick off another
+		 * reset in these cases.
+		 */
 		return;
 
 	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
@@ -745,17 +788,25 @@ nvme_ctrlr_start(void *ctrlr_arg)
 
 	nvme_admin_qpair_enable(&ctrlr->adminq);
 
-	if (nvme_ctrlr_identify(ctrlr) != 0)
+	if (nvme_ctrlr_identify(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
-	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
+	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
-	if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
+	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
-	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
+	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
 	nvme_ctrlr_configure_aer(ctrlr);
 	nvme_ctrlr_configure_int_coalescing(ctrlr);
@@ -802,6 +853,8 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 		pause("nvmereset", hz / 10);
 	if (status == 0)
 		nvme_ctrlr_start(ctrlr);
+	else
+		nvme_ctrlr_fail(ctrlr);
 
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
@@ -998,12 +1051,18 @@ intx:
 
 	ctrlr->cdev->si_drv1 = (void *)ctrlr;
 
-	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
 	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
 	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");
 
 	ctrlr->is_resetting = 0;
+	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
+
+	TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
+	mtx_init(&ctrlr->fail_req_lock, "nvme ctrlr fail req lock", NULL,
+	    MTX_DEF);
+	STAILQ_INIT(&ctrlr->fail_req);
+	ctrlr->is_failed = FALSE;
 
 	return (0);
 }
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 0dfa2f9e26ef..352316095780 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -85,7 +85,7 @@ MALLOC_DECLARE(M_NVME);
  */
 #define NVME_IO_ENTRIES		(256)
 #define NVME_IO_TRACKERS	(128)
-#define NVME_MIN_IO_TRACKERS	(16)
+#define NVME_MIN_IO_TRACKERS	(4)
 #define NVME_MAX_IO_TRACKERS	(1024)
 
 /*
@@ -119,6 +119,7 @@ extern int32_t		nvme_retry_count;
 
 struct nvme_request {
 	struct nvme_command		cmd;
+	struct nvme_qpair		*qpair;
 	void				*payload;
 	uint32_t			payload_size;
 	boolean_t			timeout;
@@ -250,7 +251,9 @@ struct nvme_controller {
 	struct intr_config_hook	config_hook;
 	uint32_t		ns_identified;
 	uint32_t		queues_created;
+
 	struct task		reset_task;
+	struct task		fail_req_task;
 	struct taskqueue	*taskqueue;
 
 	/* For shared legacy interrupt. */
@@ -293,6 +296,10 @@ struct nvme_controller {
 
 	uint32_t		is_resetting;
 
+	struct mtx		fail_req_lock;
+	boolean_t		is_failed;
+	STAILQ_HEAD(, nvme_request)	fail_req;
+
 #ifdef CHATHAM2
 	uint64_t		chatham_size;
 	uint64_t		chatham_lbas;
@@ -403,6 +410,8 @@ void	nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
 					struct nvme_request *req);
 void	nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
 				     struct nvme_request *req);
+void	nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
+				       struct nvme_request *req);
 
 void	nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
 			     uint16_t vector, uint32_t num_entries,
@@ -414,6 +423,11 @@ void	nvme_qpair_process_completions(struct nvme_qpair *qpair);
 void	nvme_qpair_submit_request(struct nvme_qpair *qpair,
 				  struct nvme_request *req);
 void	nvme_qpair_reset(struct nvme_qpair *qpair);
+void	nvme_qpair_fail(struct nvme_qpair *qpair);
+void	nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
+					   struct nvme_request *req,
+					   uint32_t sct, uint32_t sc,
+					   boolean_t print_on_error);
 
 void	nvme_admin_qpair_enable(struct nvme_qpair *qpair);
 void	nvme_admin_qpair_disable(struct nvme_qpair *qpair);
@@ -484,5 +498,6 @@ void	nvme_notify_async_consumers(struct nvme_controller *ctrlr,
 				    const struct nvme_completion *async_cpl,
 				    uint32_t log_page_id, void *log_page_buffer,
 				    uint32_t log_page_size);
+void	nvme_notify_fail_consumers(struct nvme_controller *ctrlr);
 
 #endif /* __NVME_PRIVATE_H__ */
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index a508f9e7c2e3..8ac61b141f66 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -50,7 +50,6 @@ nvme_completion_is_retry(const struct nvme_completion *cpl)
 	case NVME_SCT_GENERIC:
 		switch (cpl->status.sc) {
 		case NVME_SC_ABORTED_BY_REQUEST:
-			return (1);
 		case NVME_SC_NAMESPACE_NOT_READY:
 			if (cpl->status.dnr)
 				return (0);
@@ -155,7 +154,7 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
 
 static void
 nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
-    struct nvme_tracker *tr, uint32_t sct, uint32_t sc,
+    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
     boolean_t print_on_error)
 {
 	struct nvme_completion	cpl;
@@ -165,9 +164,36 @@ nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
 	cpl.cid = tr->cid;
 	cpl.status.sct = sct;
 	cpl.status.sc = sc;
+	cpl.status.dnr = dnr;
 	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
 }
 
+void
+nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
+    struct nvme_request *req, uint32_t sct, uint32_t sc,
+    boolean_t print_on_error)
+{
+	struct nvme_completion	cpl;
+	boolean_t		error;
+
+	memset(&cpl, 0, sizeof(cpl));
+	cpl.sqid = qpair->id;
+	cpl.status.sct = sct;
+	cpl.status.sc = sc;
+
+	error = nvme_completion_is_error(&cpl);
+
+	if (error && print_on_error) {
+		nvme_dump_completion(&cpl);
+		nvme_dump_command(&req->cmd);
+	}
+
+	if (req->cb_fn)
+		req->cb_fn(req->cb_arg, &cpl);
+
+	nvme_free_request(req);
+}
+
 void
 nvme_qpair_process_completions(struct nvme_qpair *qpair)
 {
@@ -363,7 +389,7 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
 	while (tr != NULL) {
 		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
 			nvme_qpair_manual_complete_tracker(qpair, tr,
-			    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION,
+			    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
 			    FALSE);
 			tr = TAILQ_FIRST(&qpair->outstanding_tr);
 		} else {
@@ -406,7 +432,7 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status)
 		 */
 		printf("abort command failed, aborting command manually\n");
 		nvme_qpair_manual_complete_tracker(tr->qpair, tr,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE);
+		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, TRUE);
 	}
 }
 
@@ -476,17 +502,32 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	mtx_assert(&qpair->lock, MA_OWNED);
 
 	tr = TAILQ_FIRST(&qpair->free_tr);
+	req->qpair = qpair;
 
 	if (tr == NULL || !qpair->is_enabled) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to
-		 *  an in-progress controller-level reset.
-		 *
-		 * Put the request on the qpair's request queue to be processed
-		 *  when a tracker frees up via a command completion or when
-		 *  the controller reset is completed.
+		 *  an in-progress controller-level reset or controller
+		 *  failure.
 		 */
-		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+
+		if (qpair->ctrlr->is_failed) {
+			/*
+			 * The controller has failed.  Post the request to a
+			 *  task where it will be aborted, so that we do not
+			 *  invoke the request's callback in the context
+			 *  of the submission.
+			 */
+			nvme_ctrlr_post_failed_request(qpair->ctrlr, req);
+		} else {
+			/*
+			 * Put the request on the qpair's request queue to be
+			 *  processed when a tracker frees up via a command
+			 *  completion or when the controller reset is
+			 *  completed.
+			 */
+			STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+		}
 		return;
 	}
 
@@ -574,7 +615,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 		device_printf(qpair->ctrlr->dev,
 		    "aborting outstanding i/o\n");
 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
-		    NVME_SC_ABORTED_BY_REQUEST, TRUE);
+		    NVME_SC_ABORTED_BY_REQUEST, 0, TRUE);
 	}
 
 	mtx_lock(&qpair->lock);
@@ -622,3 +663,41 @@ nvme_io_qpair_disable(struct nvme_qpair *qpair)
 
 	nvme_qpair_disable(qpair);
 }
+
+void
+nvme_qpair_fail(struct nvme_qpair *qpair)
+{
+	struct nvme_tracker	*tr;
+	struct nvme_request	*req;
+
+	mtx_lock(&qpair->lock);
+
+	while (!STAILQ_EMPTY(&qpair->queued_req)) {
+		req = STAILQ_FIRST(&qpair->queued_req);
+		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+		device_printf(qpair->ctrlr->dev,
+		    "failing queued i/o\n");
+		mtx_unlock(&qpair->lock);
+		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
+		    NVME_SC_ABORTED_BY_REQUEST, TRUE);
+		mtx_lock(&qpair->lock);
+	}
+
+	/* Manually abort each outstanding I/O. */
+	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
+		tr = TAILQ_FIRST(&qpair->outstanding_tr);
+		/*
+		 * Do not remove the tracker.  The abort_tracker path will
+		 *  do that for us.
+		 */
+		device_printf(qpair->ctrlr->dev,
+		    "failing outstanding i/o\n");
+		mtx_unlock(&qpair->lock);
+		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
+		    NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE);
+		mtx_lock(&qpair->lock);
+	}
+
+	mtx_unlock(&qpair->lock);
+}
+