From ef409194a1589a14c16f4e14d17dc01dcbdeacfe Mon Sep 17 00:00:00 2001
From: Shuhei Matsumoto
Date: Fri, 8 Oct 2021 07:11:17 +0900
Subject: [PATCH] bdev/nvme: Retry I/O a second later if any I/O path may
 become available

If the ANA state is inaccessible or the qpair is disconnected, I/O cannot
be submitted. However, while the qpair stays connected an inaccessible ANA
state may become accessible again, and a disconnected qpair may become
connected again via resetting.

Hence, even if find_io_path() returned NULL, queue the I/O and retry it one
second later as long as the qpair is connected or the ctrlr is resetting.

Sort retried I/Os by their expiration time in ticks, and register a timed
poller per nvme_bdev_channel only while there is at least one retried I/O.
The poller function bdev_nvme_retry_ios() always returns BUSY, because the
poller running earlier than the closest retried I/O, or running when there
is no retried I/O, would indicate a bug in the framework.

Signed-off-by: Shuhei Matsumoto
Change-Id: Id28110a0d63ebc1c5772814e2ff8a47934df1644
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/9830
Community-CI: Broadcom CI
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins
Reviewed-by: Ben Walker
Reviewed-by: Aleksey Marchuk
---
 module/bdev/nvme/bdev_nvme.c                  | 146 +++++++++++++-
 module/bdev/nvme/bdev_nvme.h                  |   4 +-
 .../lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c  | 189 ++++++++++++++++++
 3 files changed, 334 insertions(+), 5 deletions(-)

diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c
index 5ad4a91656..f22d4aa8be 100644
--- a/module/bdev/nvme/bdev_nvme.c
+++ b/module/bdev/nvme/bdev_nvme.c
@@ -104,6 +104,9 @@ struct nvme_bdev_io {
 	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
 	uint64_t handled_zones;
+
+	/** Expiration value in ticks to retry the current I/O. */
+	uint64_t retry_ticks;
 };
 
 struct nvme_probe_ctx {
@@ -156,6 +159,8 @@ static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
 				struct nvme_async_probe_ctx *ctx);
 static int bdev_nvme_library_init(void);
 static void bdev_nvme_library_fini(void);
+static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
+				     struct spdk_bdev_io *bdev_io);
 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
 			   void *md, uint64_t lba_count, uint64_t lba,
 			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
@@ -605,6 +610,7 @@ bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
 	int rc;
 
 	STAILQ_INIT(&nbdev_ch->io_path_list);
+	TAILQ_INIT(&nbdev_ch->retry_io_list);
 
 	pthread_mutex_lock(&nbdev->mutex);
 	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
@@ -621,11 +627,25 @@ bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
 	return 0;
 }
 
+static void
+bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
+{
+	struct spdk_bdev_io *bdev_io, *tmp_io;
+
+	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
+		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
+	}
+
+	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
+}
+
 static void
 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
 {
 	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
 
+	bdev_nvme_abort_retry_ios(nbdev_ch);
 	_bdev_nvme_delete_io_paths(nbdev_ch);
 }
 
@@ -678,6 +698,16 @@ nvme_io_path_is_available(struct nvme_io_path *io_path)
 	return true;
 }
 
+static bool
+nvme_io_path_is_failed(struct nvme_io_path *io_path)
+{
+	struct nvme_ctrlr *nvme_ctrlr;
+
+	nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch);
+
+	return spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr);
+}
+
 static inline struct nvme_io_path *
 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
 {
@@ -705,6 +735,98 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
 	return non_optimized;
 }
 
+/* Return true if there is any io_path whose qpair is active or whose ctrlr is not
+ * failed, or false otherwise.
+ *
+ * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
+ * is likely to be inaccessible now but may become accessible.
+ *
+ * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
+ * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
+ * when its reset starts and is set to failed only if the reset fails. Hence, if a
+ * ctrlr is unfailed, it is likely that it is working fine or is resetting.
+ */
+static bool
+any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
+{
+	struct nvme_io_path *io_path;
+
+	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
+		if (nvme_io_path_is_connected(io_path) ||
+		    !nvme_io_path_is_failed(io_path)) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static int
+bdev_nvme_retry_ios(void *arg)
+{
+	struct nvme_bdev_channel *nbdev_ch = arg;
+	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
+	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
+	struct nvme_bdev_io *bio;
+	uint64_t now, delay_us;
+
+	now = spdk_get_ticks();
+
+	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
+		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
+		if (bio->retry_ticks > now) {
+			break;
+		}
+
+		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
+
+		bdev_nvme_submit_request(ch, bdev_io);
+	}
+
+	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
+
+	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
+	if (bdev_io != NULL) {
+		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
+
+		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
+
+		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
+				delay_us);
+	}
+
+	return SPDK_POLLER_BUSY;
+}
+
+static void
+bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
+			 struct nvme_bdev_io *bio, uint64_t delay_ms)
+{
+	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+	struct spdk_bdev_io *tmp_bdev_io;
+	struct nvme_bdev_io *tmp_bio;
+
+	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
+
+	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
+		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;
+
+		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
+			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
+					   module_link);
+			return;
+		}
+	}
+
+	/* No earlier I/Os were found. This I/O must be the new head. */
+	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);
+
+	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
+
+	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
+			delay_ms * 1000ULL);
+}
+
 static inline void
 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
 				  const struct spdk_nvme_cpl *cpl)
@@ -716,17 +838,33 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
 static inline void
 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
 {
+	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+	struct nvme_bdev_channel *nbdev_ch;
 	enum spdk_bdev_io_status io_status;
 
-	if (rc == 0) {
+	switch (rc) {
+	case 0:
 		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
-	} else if (rc == -ENOMEM) {
+		break;
+	case -ENOMEM:
 		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
-	} else {
+		break;
+	case -ENXIO:
+		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
+
+		if (!bdev_nvme_io_type_is_admin(bdev_io->type) &&
+		    any_io_path_may_become_available(nbdev_ch)) {
+			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
+			return;
+		}
+
+		/* fallthrough */
+	default:
 		io_status = SPDK_BDEV_IO_STATUS_FAILED;
+		break;
 	}
 
-	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
+	spdk_bdev_io_complete(bdev_io, io_status);
 }
 
 static struct nvme_ctrlr_channel *
diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h
index ae263d0f3e..b024b51174 100644
--- a/module/bdev/nvme/bdev_nvme.h
+++ b/module/bdev/nvme/bdev_nvme.h
@@ -172,7 +172,9 @@ struct nvme_io_path {
 };
 
 struct nvme_bdev_channel {
-	STAILQ_HEAD(, nvme_io_path) io_path_list;
+	STAILQ_HEAD(, nvme_io_path) io_path_list;
+	TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list;
+	struct spdk_poller *retry_io_poller;
 };
 
 struct nvme_poll_group {
diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
index e23069dfc1..b6a4014457 100644
--- a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
+++ b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
@@ -3788,6 +3788,194 @@ test_find_io_path(void)
 	CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path1);
 }
 
+static void
+test_retry_io_if_ctrlr_is_resetting(void)
+{
+	struct nvme_path_id path = {};
+	struct spdk_nvme_ctrlr *ctrlr;
+	struct nvme_bdev_ctrlr *nbdev_ctrlr;
+	struct nvme_ctrlr *nvme_ctrlr;
+	const int STRING_SIZE = 32;
+	const char *attached_names[STRING_SIZE];
+	struct nvme_bdev *bdev;
+	struct nvme_ns *nvme_ns;
+	struct spdk_bdev_io *bdev_io1, *bdev_io2;
+	struct spdk_io_channel *ch;
+	struct nvme_bdev_channel *nbdev_ch;
+	struct nvme_io_path *io_path;
+	struct nvme_ctrlr_channel *ctrlr_ch;
+	int rc;
+
+	memset(attached_names, 0, sizeof(char *) * STRING_SIZE);
+	ut_init_trid(&path.trid);
+
+	set_thread(0);
+
+	ctrlr = ut_attach_ctrlr(&path.trid, 1, false, false);
+	SPDK_CU_ASSERT_FATAL(ctrlr != NULL);
+
+	g_ut_attach_ctrlr_status = 0;
+	g_ut_attach_bdev_count = 1;
+
+	rc = bdev_nvme_create(&path.trid, "nvme0", attached_names, STRING_SIZE, 0,
+			      attach_ctrlr_done, NULL, NULL, false);
+	CU_ASSERT(rc == 0);
+
+	spdk_delay_us(1000);
+	poll_threads();
+
+	nbdev_ctrlr = nvme_bdev_ctrlr_get("nvme0");
+	SPDK_CU_ASSERT_FATAL(nbdev_ctrlr != NULL);
+
+	nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, &path.trid);
+	CU_ASSERT(nvme_ctrlr != NULL);
+
+	bdev = nvme_bdev_ctrlr_get_bdev(nbdev_ctrlr, 1);
+	CU_ASSERT(bdev != NULL);
+
+	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
+	CU_ASSERT(nvme_ns != NULL);
+
+	bdev_io1 = ut_alloc_bdev_io(SPDK_BDEV_IO_TYPE_WRITE, bdev, NULL);
+	ut_bdev_io_set_buf(bdev_io1);
+
+	bdev_io2 = ut_alloc_bdev_io(SPDK_BDEV_IO_TYPE_WRITE, bdev, NULL);
+	ut_bdev_io_set_buf(bdev_io2);
+
+	ch = spdk_get_io_channel(bdev);
+	SPDK_CU_ASSERT_FATAL(ch != NULL);
+
+	nbdev_ch = spdk_io_channel_get_ctx(ch);
+
+	io_path = ut_get_io_path_by_ctrlr(nbdev_ch, nvme_ctrlr);
+	SPDK_CU_ASSERT_FATAL(io_path != NULL);
+
+	ctrlr_ch = io_path->ctrlr_ch;
+	SPDK_CU_ASSERT_FATAL(ctrlr_ch != NULL);
+	SPDK_CU_ASSERT_FATAL(ctrlr_ch->qpair != NULL);
+
+	bdev_io1->internal.ch = (struct spdk_bdev_channel *)ch;
+	bdev_io2->internal.ch = (struct spdk_bdev_channel *)ch;
+
+	/* If qpair is connected, I/O should succeed. */
+	bdev_io1->internal.in_submit_request = true;
+
+	bdev_nvme_submit_request(ch, bdev_io1);
+	CU_ASSERT(bdev_io1->internal.in_submit_request == true);
+
+	poll_threads();
+	CU_ASSERT(bdev_io1->internal.in_submit_request == false);
+	CU_ASSERT(bdev_io1->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
+
+	/* If qpair is disconnected, it is freed and then reconnected via resetting
+	 * the corresponding nvme_ctrlr. I/O should be queued if it is submitted
+	 * while resetting the nvme_ctrlr.
+	 */
+	ctrlr_ch->qpair->is_connected = false;
+	ctrlr->is_failed = true;
+
+	poll_thread_times(0, 3);
+
+	CU_ASSERT(ctrlr_ch->qpair == NULL);
+	CU_ASSERT(nvme_ctrlr->resetting == true);
+	CU_ASSERT(ctrlr->is_failed == false);
+
+	bdev_io1->internal.in_submit_request = true;
+
+	bdev_nvme_submit_request(ch, bdev_io1);
+
+	spdk_delay_us(1);
+
+	bdev_io2->internal.in_submit_request = true;
+
+	bdev_nvme_submit_request(ch, bdev_io2);
+
+	CU_ASSERT(bdev_io1->internal.in_submit_request == true);
+	CU_ASSERT(bdev_io2->internal.in_submit_request == true);
+	CU_ASSERT(bdev_io1 == TAILQ_FIRST(&nbdev_ch->retry_io_list));
+	CU_ASSERT(bdev_io2 == TAILQ_NEXT(bdev_io1, module_link));
+
+	poll_threads();
+
+	CU_ASSERT(ctrlr_ch->qpair != NULL);
+	CU_ASSERT(nvme_ctrlr->resetting == false);
+
+	spdk_delay_us(999999);
+
+	poll_thread_times(0, 1);
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 1);
+	CU_ASSERT(bdev_io1->internal.in_submit_request == true);
+	CU_ASSERT(bdev_io2->internal.in_submit_request == true);
+	CU_ASSERT(bdev_io2 == TAILQ_FIRST(&nbdev_ch->retry_io_list));
+
+	poll_threads();
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 0);
+	CU_ASSERT(bdev_io1->internal.in_submit_request == false);
+	CU_ASSERT(bdev_io1->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
+	CU_ASSERT(bdev_io2->internal.in_submit_request == true);
+	CU_ASSERT(bdev_io2 == TAILQ_FIRST(&nbdev_ch->retry_io_list));
+
+	spdk_delay_us(1);
+
+	poll_thread_times(0, 1);
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 1);
+	CU_ASSERT(bdev_io2->internal.in_submit_request == true);
+	CU_ASSERT(TAILQ_EMPTY(&nbdev_ch->retry_io_list));
+
+	poll_threads();
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 0);
+	CU_ASSERT(bdev_io2->internal.in_submit_request == false);
+	CU_ASSERT(bdev_io2->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
+
+	/* If ANA state of namespace is inaccessible, I/O should be queued. */
+	nvme_ns->ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
+
+	bdev_io1->internal.in_submit_request = true;
+
+	bdev_nvme_submit_request(ch, bdev_io1);
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 0);
+	CU_ASSERT(bdev_io1->internal.in_submit_request == true);
+	CU_ASSERT(bdev_io1 == TAILQ_FIRST(&nbdev_ch->retry_io_list));
+
+	/* ANA state became accessible while I/O was queued. */
+	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
+
+	spdk_delay_us(1000000);
+
+	poll_thread_times(0, 1);
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 1);
+	CU_ASSERT(bdev_io1->internal.in_submit_request == true);
+	CU_ASSERT(TAILQ_EMPTY(&nbdev_ch->retry_io_list));
+
+	poll_threads();
+
+	CU_ASSERT(ctrlr_ch->qpair->num_outstanding_reqs == 0);
+	CU_ASSERT(bdev_io1->internal.in_submit_request == false);
+	CU_ASSERT(bdev_io1->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
+
+	free(bdev_io1);
+	free(bdev_io2);
+
+	spdk_put_io_channel(ch);
+
+	poll_threads();
+
+	rc = bdev_nvme_delete("nvme0", &g_any_path);
+	CU_ASSERT(rc == 0);
+
+	poll_threads();
+	spdk_delay_us(1000);
+	poll_threads();
+
+	CU_ASSERT(nvme_bdev_ctrlr_get("nvme0") == NULL);
+}
+
 int
 main(int argc, const char **argv)
 {
@@ -3821,6 +4009,7 @@ main(int argc, const char **argv)
 	CU_ADD_TEST(suite, test_admin_path);
 	CU_ADD_TEST(suite, test_reset_bdev_ctrlr);
 	CU_ADD_TEST(suite, test_find_io_path);
+	CU_ADD_TEST(suite, test_retry_io_if_ctrlr_is_resetting);
 
 	CU_basic_set_mode(CU_BRM_VERBOSE);
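
For reference, the retry scheduling above converts a millisecond delay into a deadline in CPU ticks (bdev_nvme_queue_retry_io) and converts the remaining ticks back into a poller period in microseconds (bdev_nvme_retry_ios). The standalone sketch below illustrates only that arithmetic; it is not part of the patch, and the 1 GHz tick rate, the SEC_TO_USEC constant, and the helper names are illustrative stand-ins for spdk_get_ticks(), spdk_get_ticks_hz(), and SPDK_SEC_TO_USEC.

/*
 * Standalone sketch of the tick arithmetic used by bdev_nvme_queue_retry_io()
 * and bdev_nvme_retry_ios(). SEC_TO_USEC and the 1 GHz tick rate are
 * illustrative stand-ins for SPDK_SEC_TO_USEC and spdk_get_ticks_hz().
 */
#include <stdint.h>
#include <stdio.h>

#define SEC_TO_USEC 1000000ULL

/* Deadline in ticks for an I/O queued delay_ms from now. */
static uint64_t
retry_deadline_ticks(uint64_t now_ticks, uint64_t delay_ms, uint64_t ticks_hz)
{
	return now_ticks + delay_ms * ticks_hz / 1000ULL;
}

/* Poller period in microseconds until the earliest queued deadline expires. */
static uint64_t
poller_delay_us(uint64_t deadline_ticks, uint64_t now_ticks, uint64_t ticks_hz)
{
	return (deadline_ticks - now_ticks) * SEC_TO_USEC / ticks_hz;
}

int
main(void)
{
	uint64_t ticks_hz = 1000000000ULL;	/* assumed 1 GHz tick source */
	uint64_t now = 5000000000ULL;
	uint64_t deadline = retry_deadline_ticks(now, 1000, ticks_hz);

	/* Immediately after queueing, the poller is armed for ~1 second (1000000 us). */
	printf("poller delay: %llu us\n",
	       (unsigned long long)poller_delay_us(deadline, now, ticks_hz));
	return 0;
}

With these values an I/O queued with a 1000 ms delay yields a poller period of 1000000 us, i.e. the retry poller fires roughly one second later, and it is only re-armed while the retry list is non-empty.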