bdev/nvme: Reset the nvme_ctrlr if an I/O qpair is disconnected

Previously, if an I/O qpair is disconnected, we tried reconnecting
the qpair. However, this reconnect operation was very likely to fail
and will not match the upcoming asynchronous connect/reconnect
operation. We need an extra callback to make this reconnect operation
asynchronous, but we do not want to have it.

Hence if an I/O qpair is disconnected, we free the I/O qpair and then
reset the corresponding nvme_ctrlr immediately. If the admin qpair is
also disconnected, the nvme_ctrlr is reset immediately. However this
event may never happen. So we do not wait for the error of the admin
qpair.

The NVMf host may disconnect connections by itself intentionally.
In this case, resetting the nvme_ctrlr will surely fail. But resetting
the nvme_ctrlr frees all I/O qpairs of the nvme_ctrlr and these I/O
qpairs are not created again until resetting the nvme_ctrlr succeeds.
Resetting the nvme_ctrlr once at most is more efficient than repeating
reconnecting the I/O qpair. So this change is valuable even for such
intentional disconnection. However, it is helpful to know the event that
I/O qpair is disconnected. Hence change DEBUGLOG to NOTICELOG in the
disconnected callback. The disconnected callback is not repeated, and
we do not need to worry about NOTICELOG flooding.

Refine the unit test case to verify this change.

Signed-off-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Change-Id: I376b749c2f55d010692bf916370e8bb4249b795f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/9515
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
This commit is contained in:
Shuhei Matsumoto 2021-09-15 13:01:49 +09:00 committed by Tomasz Zawadzki
parent c9ab7ae4b3
commit 14535253f1
2 changed files with 164 additions and 74 deletions

View File

@ -193,6 +193,7 @@ static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qp
static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
@ -503,19 +504,47 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
}
static struct nvme_ctrlr_channel *
nvme_poll_group_get_ctrlr_channel(struct nvme_poll_group *group,
struct spdk_nvme_qpair *qpair)
{
struct nvme_ctrlr_channel *ctrlr_ch;
TAILQ_FOREACH(ctrlr_ch, &group->ctrlr_ch_list, tailq) {
if (ctrlr_ch->qpair == qpair) {
break;
}
}
return ctrlr_ch;
}
static void
bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
if (ctrlr_ch->qpair != NULL) {
spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
ctrlr_ch->qpair = NULL;
}
}
static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
int rc;
struct nvme_poll_group *group = poll_group_ctx;
struct nvme_ctrlr_channel *ctrlr_ch;
struct nvme_ctrlr *nvme_ctrlr;
SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
SPDK_NOTICELOG("qpair %p is disconnected, free the qpair and reset controller.\n", qpair);
/*
* Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
* reconnect a qpair and we will stop getting a callback for this one.
* Free the I/O qpair and reset the nvme_ctrlr.
*/
rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
if (rc != 0) {
SPDK_DEBUGLOG(bdev_nvme, "Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
ctrlr_ch = nvme_poll_group_get_ctrlr_channel(group, qpair);
if (ctrlr_ch != NULL) {
bdev_nvme_destroy_qpair(ctrlr_ch);
nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch);
bdev_nvme_reset(nvme_ctrlr);
}
}
@ -652,15 +681,6 @@ err:
return rc;
}
static void
bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
if (ctrlr_ch->qpair != NULL) {
spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
ctrlr_ch->qpair = NULL;
}
}
static void
_bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
{

View File

@ -1767,63 +1767,6 @@ test_attach_ctrlr(void)
g_ut_register_bdev_status = 0;
}
static void
test_reconnect_qpair(void)
{
struct spdk_nvme_transport_id trid = {};
struct spdk_nvme_ctrlr ctrlr = {};
struct nvme_ctrlr *nvme_ctrlr = NULL;
struct spdk_io_channel *ch;
struct nvme_ctrlr_channel *ctrlr_ch;
int rc;
set_thread(0);
ut_init_trid(&trid);
TAILQ_INIT(&ctrlr.active_io_qpairs);
rc = nvme_ctrlr_create(&ctrlr, "nvme0", &trid, 0, NULL);
CU_ASSERT(rc == 0);
nvme_ctrlr = nvme_ctrlr_get_by_name("nvme0");
SPDK_CU_ASSERT_FATAL(nvme_ctrlr != NULL);
ch = spdk_get_io_channel(nvme_ctrlr);
SPDK_CU_ASSERT_FATAL(ch != NULL);
ctrlr_ch = spdk_io_channel_get_ctx(ch);
CU_ASSERT(ctrlr_ch->qpair != NULL);
CU_ASSERT(ctrlr_ch->group != NULL);
CU_ASSERT(ctrlr_ch->group->group != NULL);
CU_ASSERT(ctrlr_ch->group->poller != NULL);
/* Test if the disconnected qpair is reconnected. */
ctrlr_ch->qpair->is_connected = false;
poll_threads();
CU_ASSERT(ctrlr_ch->qpair->is_connected == true);
/* If the ctrlr is failed, reconnecting qpair should fail too. */
ctrlr_ch->qpair->is_connected = false;
ctrlr.is_failed = true;
poll_threads();
CU_ASSERT(ctrlr_ch->qpair->is_connected == false);
spdk_put_io_channel(ch);
poll_threads();
rc = bdev_nvme_delete("nvme0", NULL);
CU_ASSERT(rc == 0);
poll_threads();
CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}
static void
test_aer_cb(void)
{
@ -2710,6 +2653,133 @@ test_get_memory_domains(void)
MOCK_CLEAR(spdk_nvme_ctrlr_get_memory_domain);
}
static void
test_reconnect_qpair(void)
{
struct spdk_nvme_transport_id trid = {};
struct spdk_nvme_ctrlr *ctrlr;
struct nvme_ctrlr *nvme_ctrlr;
const int STRING_SIZE = 32;
const char *attached_names[STRING_SIZE];
struct nvme_bdev *bdev;
struct spdk_io_channel *ch1, *ch2;
struct nvme_bdev_channel *nbdev_ch1, *nbdev_ch2;
struct nvme_ctrlr_channel *ctrlr_ch1, *ctrlr_ch2;
int rc;
memset(attached_names, 0, sizeof(char *) * STRING_SIZE);
ut_init_trid(&trid);
set_thread(0);
ctrlr = ut_attach_ctrlr(&trid, 1, false);
SPDK_CU_ASSERT_FATAL(ctrlr != NULL);
g_ut_attach_ctrlr_status = 0;
g_ut_attach_bdev_count = 1;
rc = bdev_nvme_create(&trid, "nvme0", attached_names, STRING_SIZE, 0,
attach_ctrlr_done, NULL, NULL);
CU_ASSERT(rc == 0);
spdk_delay_us(1000);
poll_threads();
nvme_ctrlr = nvme_ctrlr_get_by_name("nvme0");
SPDK_CU_ASSERT_FATAL(nvme_ctrlr != NULL);
bdev = nvme_ctrlr_get_ns(nvme_ctrlr, 1)->bdev;
SPDK_CU_ASSERT_FATAL(bdev != NULL);
ch1 = spdk_get_io_channel(bdev);
SPDK_CU_ASSERT_FATAL(ch1 != NULL);
nbdev_ch1 = spdk_io_channel_get_ctx(ch1);
ctrlr_ch1 = nbdev_ch1->ctrlr_ch;
SPDK_CU_ASSERT_FATAL(ctrlr_ch1 != NULL);
set_thread(1);
ch2 = spdk_get_io_channel(bdev);
SPDK_CU_ASSERT_FATAL(ch2 != NULL);
nbdev_ch2 = spdk_io_channel_get_ctx(ch2);
ctrlr_ch2 = nbdev_ch2->ctrlr_ch;
SPDK_CU_ASSERT_FATAL(ctrlr_ch2 != NULL);
/* If a qpair is disconnected, it is freed and then reconnected via
* resetting the corresponding nvme_ctrlr.
*/
ctrlr_ch2->qpair->is_connected = false;
ctrlr->is_failed = true;
poll_thread_times(1, 1);
CU_ASSERT(ctrlr_ch1->qpair != NULL);
CU_ASSERT(ctrlr_ch2->qpair == NULL);
CU_ASSERT(nvme_ctrlr->resetting == true);
poll_thread_times(0, 1);
poll_thread_times(1, 1);
CU_ASSERT(ctrlr_ch1->qpair == NULL);
CU_ASSERT(ctrlr_ch2->qpair == NULL);
CU_ASSERT(ctrlr->is_failed == true);
poll_thread_times(1, 1);
CU_ASSERT(ctrlr->is_failed == false);
poll_thread_times(0, 1);
poll_thread_times(1, 1);
CU_ASSERT(ctrlr_ch1->qpair != NULL);
CU_ASSERT(ctrlr_ch2->qpair != NULL);
CU_ASSERT(nvme_ctrlr->resetting == true);
poll_thread_times(1, 1);
CU_ASSERT(nvme_ctrlr->resetting == false);
poll_threads();
/* If a qpair is disconnected and resetting the corresponding nvme_ctrlr
* fails, the qpair is just freed.
*/
ctrlr_ch2->qpair->is_connected = false;
ctrlr->is_failed = true;
ctrlr->fail_reset = true;
poll_thread_times(1, 1);
CU_ASSERT(ctrlr_ch1->qpair != NULL);
CU_ASSERT(ctrlr_ch2->qpair == NULL);
CU_ASSERT(nvme_ctrlr->resetting == true);
poll_thread_times(0, 1);
poll_thread_times(1, 1);
CU_ASSERT(ctrlr_ch1->qpair == NULL);
CU_ASSERT(ctrlr_ch2->qpair == NULL);
CU_ASSERT(ctrlr->is_failed == true);
poll_thread_times(1, 1);
CU_ASSERT(ctrlr->is_failed == true);
CU_ASSERT(nvme_ctrlr->resetting == false);
CU_ASSERT(ctrlr_ch1->qpair == NULL);
CU_ASSERT(ctrlr_ch2->qpair == NULL);
poll_threads();
spdk_put_io_channel(ch2);
set_thread(0);
spdk_put_io_channel(ch1);
poll_threads();
rc = bdev_nvme_delete("nvme0", NULL);
CU_ASSERT(rc == 0);
poll_threads();
CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}
int
main(int argc, const char **argv)
{
@ -2727,7 +2797,6 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_failover_ctrlr);
CU_ADD_TEST(suite, test_pending_reset);
CU_ADD_TEST(suite, test_attach_ctrlr);
CU_ADD_TEST(suite, test_reconnect_qpair);
CU_ADD_TEST(suite, test_aer_cb);
CU_ADD_TEST(suite, test_submit_nvme_cmd);
CU_ADD_TEST(suite, test_add_remove_trid);
@ -2737,6 +2806,7 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_compare_ns);
CU_ADD_TEST(suite, test_init_ana_log_page);
CU_ADD_TEST(suite, test_get_memory_domains);
CU_ADD_TEST(suite, test_reconnect_qpair);
CU_basic_set_mode(CU_BRM_VERBOSE);