Add the ability to internally mark a controller as failed, if it is unable to
start or reset.  Also add a notifier for NVMe consumers for controller fail
conditions, and plumb this notifier for nvd(4) to destroy the associated GEOM
disks when a failure occurs.

This requires a bit of work to cover the races when a consumer is sending I/O
requests to a controller that is transitioning to the failed state.  To help
cover this condition, add a task to defer completion of I/Os submitted to a
failed controller, so that the consumer will still always receive its
completions in a different context than the submission.

Sponsored by:	Intel
Reviewed by:	carl
commit 232e2edb6c
parent 3d7eb41c1b
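For context, a minimal sketch of how a consumer might hook the new failure notifier added by this change. The `example_*` names are hypothetical and only illustrate the extended nvme_register_consumer() signature; the real nvd(4) hookup (nvd_controller_fail) is in the diff below.

/*
 * Illustrative sketch only (not part of the commit): an NVMe consumer
 * registering the new controller-failure callback.
 */
#include <dev/nvme/nvme.h>

static struct nvme_consumer *example_consumer;

static void *
example_new_ns(struct nvme_namespace *ns, void *ctrlr_cookie)
{
	return (NULL);		/* per-namespace consumer cookie */
}

static void *
example_new_controller(struct nvme_controller *ctrlr)
{
	return (NULL);		/* per-controller consumer cookie */
}

static void
example_controller_fail(void *ctrlr_cookie)
{
	/*
	 * Called when the controller is marked failed; tear down any
	 * state tied to the per-controller cookie.  The driver aborts
	 * outstanding and queued I/O itself.
	 */
}

static int
example_load(void)
{
	/* fail_fn is the new fourth argument introduced by this change. */
	example_consumer = nvme_register_consumer(example_new_ns,
	    example_new_controller, NULL /* async_fn */,
	    example_controller_fail);
	return (example_consumer != NULL ? 0 : -1);
}

The fail callback receives the cookie previously returned by the consumer's controller callback (cons_cookie in the driver), which is how nvd(4) finds the GEOM disks to destroy.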
@@ -49,6 +49,7 @@ static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
 static void destroy_geom_disk(struct nvd_disk *ndisk);
 
 static void *nvd_new_controller(struct nvme_controller *ctrlr);
+static void nvd_controller_fail(void *ctrlr);
 
 static int nvd_load(void);
 static void nvd_unload(void);
@@ -118,7 +119,7 @@ nvd_load()
 	TAILQ_INIT(&disk_head);
 
 	consumer_handle = nvme_register_consumer(nvd_new_disk,
-	    nvd_new_controller, NULL);
+	    nvd_new_controller, NULL, nvd_controller_fail);
 
 	return (consumer_handle != NULL ? 0 : -1);
 }
@@ -351,3 +352,22 @@ destroy_geom_disk(struct nvd_disk *ndisk)
 
 	mtx_destroy(&ndisk->bioqlock);
 }
+
+static void
+nvd_controller_fail(void *ctrlr_arg)
+{
+	struct nvd_controller	*ctrlr = ctrlr_arg;
+	struct nvd_disk		*disk;
+
+	while (!TAILQ_EMPTY(&ctrlr->disk_head)) {
+		disk = TAILQ_FIRST(&ctrlr->disk_head);
+		TAILQ_REMOVE(&disk_head, disk, global_tailq);
+		TAILQ_REMOVE(&ctrlr->disk_head, disk, ctrlr_tailq);
+		destroy_geom_disk(disk);
+		free(disk, M_NVD);
+	}
+
+	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
+	free(ctrlr, M_NVD);
+}
@@ -44,6 +44,7 @@ struct nvme_consumer {
 	nvme_cons_ns_fn_t	ns_fn;
 	nvme_cons_ctrlr_fn_t	ctrlr_fn;
 	nvme_cons_async_fn_t	async_fn;
+	nvme_cons_fail_fn_t	fail_fn;
 };
 
 struct nvme_consumer nvme_consumer[NVME_MAX_CONSUMERS];
@@ -349,9 +350,23 @@ nvme_notify_async_consumers(struct nvme_controller *ctrlr,
 	}
 }
 
+void
+nvme_notify_fail_consumers(struct nvme_controller *ctrlr)
+{
+	struct nvme_consumer	*cons;
+	uint32_t		i;
+
+	for (i = 0; i < NVME_MAX_CONSUMERS; i++) {
+		cons = &nvme_consumer[i];
+		if (cons->id != INVALID_CONSUMER_ID && cons->fail_fn != NULL)
+			cons->fail_fn(ctrlr->cons_cookie[i]);
+	}
+}
+
 struct nvme_consumer *
 nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn,
-		       nvme_cons_async_fn_t async_fn)
+		       nvme_cons_async_fn_t async_fn,
+		       nvme_cons_fail_fn_t fail_fn)
 {
 	int i;
 
@@ -365,6 +380,7 @@ nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn,
 		nvme_consumer[i].ns_fn = ns_fn;
 		nvme_consumer[i].ctrlr_fn = ctrlr_fn;
 		nvme_consumer[i].async_fn = async_fn;
+		nvme_consumer[i].fail_fn = fail_fn;
 
 		nvme_notify_consumer(&nvme_consumer[i]);
 		return (&nvme_consumer[i]);
@@ -733,6 +733,7 @@ typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *);
 typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *);
 typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *,
 				     uint32_t, void *, uint32_t);
+typedef void (*nvme_cons_fail_fn_t)(void *);
 
 enum nvme_namespace_flags {
 	NVME_NS_DEALLOCATE_SUPPORTED	= 0x1,
@@ -769,7 +770,8 @@ int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
 /* Registration functions */
 struct nvme_consumer *	nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
 					       nvme_cons_ctrlr_fn_t ctrlr_fn,
-					       nvme_cons_async_fn_t async_fn);
+					       nvme_cons_async_fn_t async_fn,
+					       nvme_cons_fail_fn_t fail_fn);
 void	nvme_unregister_consumer(struct nvme_consumer *consumer);
 
 /* Controller helper functions */
@@ -311,6 +311,45 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 	return (0);
 }
 
+static void
+nvme_ctrlr_fail(struct nvme_controller *ctrlr)
+{
+	int i;
+
+	ctrlr->is_failed = TRUE;
+	nvme_qpair_fail(&ctrlr->adminq);
+	for (i = 0; i < ctrlr->num_io_queues; i++)
+		nvme_qpair_fail(&ctrlr->ioq[i]);
+	nvme_notify_fail_consumers(ctrlr);
+}
+
+void
+nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
+    struct nvme_request *req)
+{
+
+	mtx_lock(&ctrlr->fail_req_lock);
+	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
+	mtx_unlock(&ctrlr->fail_req_lock);
+	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+}
+
+static void
+nvme_ctrlr_fail_req_task(void *arg, int pending)
+{
+	struct nvme_controller	*ctrlr = arg;
+	struct nvme_request	*req;
+
+	mtx_lock(&ctrlr->fail_req_lock);
+	while (!STAILQ_EMPTY(&ctrlr->fail_req)) {
+		req = STAILQ_FIRST(&ctrlr->fail_req);
+		STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
+		nvme_qpair_manual_complete_request(req->qpair, req,
+		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE);
+	}
+	mtx_unlock(&ctrlr->fail_req_lock);
+}
+
 static int
 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
 {
@@ -426,8 +465,12 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 
 	cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
 
-	if (cmpset == 0)
-		/* Controller is already resetting. */
+	if (cmpset == 0 || ctrlr->is_failed)
+		/*
+		 * Controller is already resetting or has failed.  Return
+		 * immediately since there is no need to kick off another
+		 * reset in these cases.
+		 */
 		return;
 
 	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
@@ -745,17 +788,25 @@ nvme_ctrlr_start(void *ctrlr_arg)
 
 	nvme_admin_qpair_enable(&ctrlr->adminq);
 
-	if (nvme_ctrlr_identify(ctrlr) != 0)
+	if (nvme_ctrlr_identify(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
-	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
+	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
-	if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
+	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
-	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
+	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
+		nvme_ctrlr_fail(ctrlr);
 		return;
+	}
 
 	nvme_ctrlr_configure_aer(ctrlr);
 	nvme_ctrlr_configure_int_coalescing(ctrlr);
@@ -802,6 +853,8 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	pause("nvmereset", hz / 10);
 	if (status == 0)
 		nvme_ctrlr_start(ctrlr);
+	else
+		nvme_ctrlr_fail(ctrlr);
 
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
@@ -998,12 +1051,18 @@ intx:
 
 	ctrlr->cdev->si_drv1 = (void *)ctrlr;
 
-	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
 	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
 	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");
 
 	ctrlr->is_resetting = 0;
+	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
+
+	TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
+	mtx_init(&ctrlr->fail_req_lock, "nvme ctrlr fail req lock", NULL,
+	    MTX_DEF);
+	STAILQ_INIT(&ctrlr->fail_req);
+	ctrlr->is_failed = FALSE;
 
 	return (0);
 }
@@ -85,7 +85,7 @@ MALLOC_DECLARE(M_NVME);
  */
 #define NVME_IO_ENTRIES		(256)
 #define NVME_IO_TRACKERS	(128)
-#define NVME_MIN_IO_TRACKERS	(16)
+#define NVME_MIN_IO_TRACKERS	(4)
 #define NVME_MAX_IO_TRACKERS	(1024)
 
 /*
@@ -119,6 +119,7 @@ extern int32_t nvme_retry_count;
 struct nvme_request {
 
 	struct nvme_command		cmd;
+	struct nvme_qpair		*qpair;
 	void				*payload;
 	uint32_t			payload_size;
 	boolean_t			timeout;
@@ -250,7 +251,9 @@ struct nvme_controller {
 	struct intr_config_hook	config_hook;
 	uint32_t		ns_identified;
 	uint32_t		queues_created;
+
 	struct task		reset_task;
+	struct task		fail_req_task;
 	struct taskqueue	*taskqueue;
 
 	/* For shared legacy interrupt. */
@@ -293,6 +296,10 @@ struct nvme_controller {
 
 	uint32_t		is_resetting;
 
+	struct mtx		fail_req_lock;
+	boolean_t		is_failed;
+	STAILQ_HEAD(, nvme_request)	fail_req;
+
 #ifdef CHATHAM2
 	uint64_t		chatham_size;
 	uint64_t		chatham_lbas;
@@ -403,6 +410,8 @@ void	nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
 					struct nvme_request *req);
 void	nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
 					struct nvme_request *req);
+void	nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
+					struct nvme_request *req);
 
 void	nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
 				uint16_t vector, uint32_t num_entries,
@@ -414,6 +423,11 @@ void	nvme_qpair_process_completions(struct nvme_qpair *qpair);
 void	nvme_qpair_submit_request(struct nvme_qpair *qpair,
 				  struct nvme_request *req);
 void	nvme_qpair_reset(struct nvme_qpair *qpair);
+void	nvme_qpair_fail(struct nvme_qpair *qpair);
+void	nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
+					   struct nvme_request *req,
+					   uint32_t sct, uint32_t sc,
+					   boolean_t print_on_error);
 
 void	nvme_admin_qpair_enable(struct nvme_qpair *qpair);
 void	nvme_admin_qpair_disable(struct nvme_qpair *qpair);
@@ -484,5 +498,6 @@ void	nvme_notify_async_consumers(struct nvme_controller *ctrlr,
 				    const struct nvme_completion *async_cpl,
 				    uint32_t log_page_id, void *log_page_buffer,
 				    uint32_t log_page_size);
+void	nvme_notify_fail_consumers(struct nvme_controller *ctrlr);
 
 #endif /* __NVME_PRIVATE_H__ */
@@ -50,7 +50,6 @@ nvme_completion_is_retry(const struct nvme_completion *cpl)
 	case NVME_SCT_GENERIC:
 		switch (cpl->status.sc) {
 		case NVME_SC_ABORTED_BY_REQUEST:
-			return (1);
 		case NVME_SC_NAMESPACE_NOT_READY:
 			if (cpl->status.dnr)
 				return (0);
@@ -155,7 +154,7 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr,
 
 static void
 nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
-    struct nvme_tracker *tr, uint32_t sct, uint32_t sc,
+    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
     boolean_t print_on_error)
 {
 	struct nvme_completion	cpl;
@@ -165,9 +164,36 @@ nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
 	cpl.cid = tr->cid;
 	cpl.status.sct = sct;
 	cpl.status.sc = sc;
+	cpl.status.dnr = dnr;
 	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
 }
 
+void
+nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
+    struct nvme_request *req, uint32_t sct, uint32_t sc,
+    boolean_t print_on_error)
+{
+	struct nvme_completion	cpl;
+	boolean_t		error;
+
+	memset(&cpl, 0, sizeof(cpl));
+	cpl.sqid = qpair->id;
+	cpl.status.sct = sct;
+	cpl.status.sc = sc;
+
+	error = nvme_completion_is_error(&cpl);
+
+	if (error && print_on_error) {
+		nvme_dump_completion(&cpl);
+		nvme_dump_command(&req->cmd);
+	}
+
+	if (req->cb_fn)
+		req->cb_fn(req->cb_arg, &cpl);
+
+	nvme_free_request(req);
+}
+
 void
 nvme_qpair_process_completions(struct nvme_qpair *qpair)
 {
@@ -363,7 +389,7 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
 	while (tr != NULL) {
 		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
 			nvme_qpair_manual_complete_tracker(qpair, tr,
-			    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION,
+			    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
 			    FALSE);
 			tr = TAILQ_FIRST(&qpair->outstanding_tr);
 		} else {
@@ -406,7 +432,7 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status)
 		 */
 		printf("abort command failed, aborting command manually\n");
 		nvme_qpair_manual_complete_tracker(tr->qpair, tr,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE);
+		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, TRUE);
 	}
 }
 
@@ -476,17 +502,32 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	mtx_assert(&qpair->lock, MA_OWNED);
 
 	tr = TAILQ_FIRST(&qpair->free_tr);
+	req->qpair = qpair;
 
 	if (tr == NULL || !qpair->is_enabled) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to
-		 * an in-progress controller-level reset.
-		 *
-		 * Put the request on the qpair's request queue to be processed
-		 * when a tracker frees up via a command completion or when
-		 * the controller reset is completed.
+		 * an in-progress controller-level reset or controller
+		 * failure.
 		 */
-		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+
+		if (qpair->ctrlr->is_failed) {
+			/*
+			 * The controller has failed.  Post the request to a
+			 * task where it will be aborted, so that we do not
+			 * invoke the request's callback in the context
+			 * of the submission.
+			 */
+			nvme_ctrlr_post_failed_request(qpair->ctrlr, req);
+		} else {
+			/*
+			 * Put the request on the qpair's request queue to be
+			 * processed when a tracker frees up via a command
+			 * completion or when the controller reset is
+			 * completed.
+			 */
+			STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+		}
 		return;
 	}
 
@@ -574,7 +615,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 		device_printf(qpair->ctrlr->dev,
 		    "aborting outstanding i/o\n");
 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
-		    NVME_SC_ABORTED_BY_REQUEST, TRUE);
+		    NVME_SC_ABORTED_BY_REQUEST, 0, TRUE);
 	}
 
 	mtx_lock(&qpair->lock);
@@ -622,3 +663,41 @@ nvme_io_qpair_disable(struct nvme_qpair *qpair)
 
 	nvme_qpair_disable(qpair);
 }
+
+void
+nvme_qpair_fail(struct nvme_qpair *qpair)
+{
+	struct nvme_tracker	*tr;
+	struct nvme_request	*req;
+
+	mtx_lock(&qpair->lock);
+
+	while (!STAILQ_EMPTY(&qpair->queued_req)) {
+		req = STAILQ_FIRST(&qpair->queued_req);
+		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+		device_printf(qpair->ctrlr->dev,
+		    "failing queued i/o\n");
+		mtx_unlock(&qpair->lock);
+		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
+		    NVME_SC_ABORTED_BY_REQUEST, TRUE);
+		mtx_lock(&qpair->lock);
+	}
+
+	/* Manually abort each outstanding I/O. */
+	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
+		tr = TAILQ_FIRST(&qpair->outstanding_tr);
+		/*
+		 * Do not remove the tracker.  The abort_tracker path will
+		 * do that for us.
+		 */
+		device_printf(qpair->ctrlr->dev,
+		    "failing outstanding i/o\n");
+		mtx_unlock(&qpair->lock);
+		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
+		    NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE);
+		mtx_lock(&qpair->lock);
+	}
+
+	mtx_unlock(&qpair->lock);
+}