bdev: share nomem_io data between bdevs built on the same device

When there are two bdevs built on the same io_device,
it is possible that one bdev entirely saturates the
underlying queue, not letting the second bdev issue
a single I/O. The second bdev will then silently fail
each subsequent I/O and append it to its nomem_io list.
However, since we resend I/O only from the I/O completion
callback and there is no outstanding I/O for that bdev
(io_outstanding == 0), the I/O will never be resent and
will be stuck in nomem_io forever.
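
For illustration, here is a condensed sketch of the pre-patch
completion path (the io_complete() wrapper name is invented for
this sketch; the fields, TAILQ macros and _spdk_bdev_ch_retry_io()
are the ones visible in the diff below):

    /* Condensed sketch of the old per-channel retry trigger. */
    static void
    io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
    {
            struct spdk_bdev_channel *bdev_ch = bdev_io->ch;

            bdev_ch->io_outstanding--;
            if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
                    /* Park the I/O on this channel's private queue. */
                    TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
                    return;
            }
            if (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
                    /* Retries are driven only by completions on the same
                     * channel. A bdev whose every submission hit NOMEM has
                     * io_outstanding == 0, so it never reaches this point
                     * and its queued I/O stays stuck. */
                    _spdk_bdev_ch_retry_io(bdev_ch);
            }
    }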

This patch makes the nomem_io list shared between
bdevs built on the same device, so an I/O completion
callback from one bdev can now retry sending queued
I/O from another bdev.

The shared bdev data is kept in the thread-local
bdev_mgmt_channel, so it doesn't need any external
synchronization.
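
The lookup of that shared data, condensed below from
spdk_bdev_channel_create() in the diff (declarations kept, error
paths and unrelated fields omitted), shows why no lock is taken:
mgmt_ch is this thread's own channel context, so module_channels
is only ever touched from one thread.

    struct spdk_bdev_mgmt_channel *mgmt_ch;
    struct spdk_bdev_module_channel *shared_ch;

    /* Condensed from spdk_bdev_channel_create(); error handling omitted. */
    mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);   /* thread-local */
    TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
            if (shared_ch->module_ch == ch->channel) {
                    /* Another bdev on the same io_device already created it. */
                    shared_ch->ref++;
                    break;
            }
    }
    if (shared_ch == NULL) {
            shared_ch = calloc(1, sizeof(*shared_ch));
            TAILQ_INIT(&shared_ch->nomem_io);
            shared_ch->module_ch = ch->channel;
            shared_ch->ref = 1;
            TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
    }
    ch->module_ch = shared_ch;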

Change-Id: Ia5ac3a1627ce3de4087e43907c329aa7d07ed7c7
Signed-off-by: Dariusz Stojaczyk <dariuszx.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/394658
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ziye Yang <optimistyzy@gmail.com>

@@ -113,6 +113,8 @@ struct spdk_bdev_mgmt_channel {
*/
bdev_io_stailq_t per_thread_cache;
uint32_t per_thread_cache_count;
TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
};
struct spdk_bdev_desc {
@@ -136,14 +138,33 @@ struct spdk_bdev_channel {
struct spdk_bdev_io_stat stat;
bdev_io_tailq_t queued_resets;
uint32_t flags;
/* Per-device channel */
struct spdk_bdev_module_channel *module_ch;
#ifdef SPDK_CONFIG_VTUNE
uint64_t start_tsc;
uint64_t interval_tsc;
__itt_string_handle *handle;
#endif
};
/*
* Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
* will queue their IO awaiting retry here. It makes it possible to retry sending
* IO to one bdev after IO from another bdev completes.
*/
struct spdk_bdev_module_channel {
/*
* Count of I/O submitted to bdev module and waiting for completion.
* Incremented before submit_request() is called on an spdk_bdev_io.
*/
uint64_t io_outstanding;
bdev_io_tailq_t queued_resets;
/*
* Queue of IO awaiting retry because of a previous NOMEM status returned
* on this channel.
@@ -155,14 +176,12 @@ struct spdk_bdev_channel {
*/
uint64_t nomem_threshold;
uint32_t flags;
/* I/O channel allocated by a bdev module */
struct spdk_io_channel *module_ch;
#ifdef SPDK_CONFIG_VTUNE
uint64_t start_tsc;
uint64_t interval_tsc;
__itt_string_handle *handle;
#endif
uint32_t ref;
TAILQ_ENTRY(spdk_bdev_module_channel) link;
};
static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
@@ -379,6 +398,8 @@ spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
STAILQ_INIT(&ch->per_thread_cache);
ch->per_thread_cache_count = 0;
TAILQ_INIT(&ch->module_channels);
return 0;
}
@@ -782,17 +803,18 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
struct spdk_bdev *bdev = bdev_io->bdev;
struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
struct spdk_io_channel *ch = bdev_ch->channel;
struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
bdev_ch->io_outstanding++;
shared_ch->io_outstanding++;
bdev_io->in_submit_request = true;
if (spdk_likely(bdev_ch->flags == 0)) {
if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) {
bdev->fn_table->submit_request(ch, bdev_io);
} else {
bdev_ch->io_outstanding--;
TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
shared_ch->io_outstanding--;
TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link);
}
} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
@@ -851,6 +873,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
struct spdk_bdev *bdev = io_device;
struct spdk_bdev_channel *ch = ctx_buf;
struct spdk_bdev_mgmt_channel *mgmt_ch;
struct spdk_bdev_module_channel *shared_ch;
ch->bdev = io_device;
ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
@@ -864,12 +888,34 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
return -1;
}
mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);
TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
if (shared_ch->module_ch == ch->channel) {
shared_ch->ref++;
break;
}
}
if (shared_ch == NULL) {
shared_ch = calloc(1, sizeof(*shared_ch));
if (!shared_ch) {
spdk_put_io_channel(ch->channel);
spdk_put_io_channel(ch->mgmt_channel);
return -1;
}
shared_ch->io_outstanding = 0;
TAILQ_INIT(&shared_ch->nomem_io);
shared_ch->nomem_threshold = 0;
shared_ch->module_ch = ch->channel;
shared_ch->ref = 1;
TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
}
memset(&ch->stat, 0, sizeof(ch->stat));
ch->io_outstanding = 0;
TAILQ_INIT(&ch->queued_resets);
TAILQ_INIT(&ch->nomem_io);
ch->nomem_threshold = 0;
ch->flags = 0;
ch->module_ch = shared_ch;
#ifdef SPDK_CONFIG_VTUNE
{
@@ -935,7 +981,7 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
* that spdk_bdev_io_complete() will do.
*/
if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
ch->io_outstanding++;
ch->module_ch->io_outstanding++;
}
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
}
@@ -947,17 +993,24 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
struct spdk_bdev_channel *ch = ctx_buf;
struct spdk_bdev_mgmt_channel *mgmt_channel;
struct spdk_bdev_module_channel *shared_ch = ch->module_ch;
mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);
assert(shared_ch->ref > 0);
shared_ch->ref--;
if (shared_ch->ref == 0) {
assert(shared_ch->io_outstanding == 0);
TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link);
free(shared_ch);
}
spdk_put_io_channel(ch->channel);
spdk_put_io_channel(ch->mgmt_channel);
assert(ch->io_outstanding == 0);
}
int
@@ -1502,14 +1555,16 @@ _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
struct spdk_io_channel *ch;
struct spdk_bdev_channel *channel;
struct spdk_bdev_mgmt_channel *mgmt_channel;
struct spdk_bdev_module_channel *shared_ch;
ch = spdk_io_channel_iter_get_channel(i);
channel = spdk_io_channel_get_ctx(ch);
mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
shared_ch = channel->module_ch;
channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
@@ -1724,9 +1779,10 @@ static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
struct spdk_bdev *bdev = bdev_ch->bdev;
struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
struct spdk_bdev_io *bdev_io;
if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
/*
* Allow some more I/O to complete before retrying the nomem_io queue.
* Some drivers (such as nvme) cannot immediately take a new I/O in
@@ -1738,12 +1794,12 @@ _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
return;
}
while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
bdev_ch->io_outstanding++;
while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
shared_ch->io_outstanding++;
bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
break;
}
@@ -1791,6 +1847,7 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
{
struct spdk_bdev *bdev = bdev_io->bdev;
struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
bdev_io->status = status;
@@ -1813,22 +1870,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
return;
}
} else {
assert(bdev_ch->io_outstanding > 0);
bdev_ch->io_outstanding--;
assert(shared_ch->io_outstanding > 0);
shared_ch->io_outstanding--;
if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
_spdk_bdev_ch_retry_io(bdev_ch);
}
} else {
TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
/*
* Wait for some of the outstanding I/O to complete before we
* retry any of the nomem_io. Normally we will wait for
* NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
* depth channels we will instead wait for half to complete.
*/
bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
(int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
(int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
return;
}
}

@@ -486,6 +486,7 @@ enomem(void)
{
struct spdk_io_channel *io_ch;
struct spdk_bdev_channel *bdev_ch;
struct spdk_bdev_module_channel *module_ch;
struct ut_bdev_channel *ut_ch;
const uint32_t IO_ARRAY_SIZE = 64;
const uint32_t AVAIL = 20;
@@ -499,6 +500,7 @@ enomem(void)
set_thread(0);
io_ch = spdk_bdev_get_io_channel(g_desc);
bdev_ch = spdk_io_channel_get_ctx(io_ch);
module_ch = bdev_ch->module_ch;
ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
ut_ch->avail_cnt = AVAIL;
@@ -508,7 +510,7 @@ enomem(void)
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
CU_ASSERT(rc == 0);
}
CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));
/*
* Next, submit one additional I/O. This one should fail with ENOMEM and then go onto
@@ -517,8 +519,8 @@ enomem(void)
status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
CU_ASSERT(rc == 0);
SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
first_io = TAILQ_FIRST(&module_ch->nomem_io);
/*
* Now submit a bunch more I/O. These should all fail with ENOMEM and get queued behind
@@ -531,10 +533,10 @@ enomem(void)
}
/* Assert that first_io is still at the head of the list. */
CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
CU_ASSERT(TAILQ_FIRST(&module_ch->nomem_io) == first_io);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
CU_ASSERT(module_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
/*
* Complete 1 I/O only. The key check here is bdev_io_tailq_cnt - this should not have
@@ -542,19 +544,19 @@ enomem(void)
* list.
*/
stub_complete_io(g_bdev.io_target, 1);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);
/*
* Complete enough I/O to hit the nomem_threshold. This should trigger retrying nomem_io,
* and we should see I/O get resubmitted to the test bdev module.
*/
stub_complete_io(g_bdev.io_target, NOMEM_THRESHOLD_COUNT - 1);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) < nomem_cnt);
nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
/* Complete 1 I/O only. This should not trigger retrying the queued nomem_io. */
stub_complete_io(g_bdev.io_target, 1);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);
/*
* Send a reset and confirm that all I/O are completed, including the ones that
@@ -567,14 +569,86 @@ enomem(void)
/* This will complete the reset. */
stub_complete_io(g_bdev.io_target, 0);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
CU_ASSERT(bdev_ch->io_outstanding == 0);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == 0);
CU_ASSERT(module_ch->io_outstanding == 0);
spdk_put_io_channel(io_ch);
poll_threads();
teardown_test();
}
static void
enomem_multi_bdev(void)
{
struct spdk_io_channel *io_ch;
struct spdk_bdev_channel *bdev_ch;
struct spdk_bdev_module_channel *module_ch;
struct ut_bdev_channel *ut_ch;
const uint32_t IO_ARRAY_SIZE = 64;
const uint32_t AVAIL = 20;
enum spdk_bdev_io_status status[IO_ARRAY_SIZE];
uint32_t i;
struct ut_bdev *second_bdev;
struct spdk_bdev_desc *second_desc;
struct spdk_bdev_channel *second_bdev_ch;
struct spdk_io_channel *second_ch;
int rc;
setup_test();
/* Register second bdev with the same io_target */
second_bdev = calloc(1, sizeof(*second_bdev));
SPDK_CU_ASSERT_FATAL(second_bdev != NULL);
register_bdev(second_bdev, "ut_bdev2", g_bdev.io_target);
spdk_bdev_open(&second_bdev->bdev, true, NULL, NULL, &second_desc);
set_thread(0);
io_ch = spdk_bdev_get_io_channel(g_desc);
bdev_ch = spdk_io_channel_get_ctx(io_ch);
module_ch = bdev_ch->module_ch;
ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
ut_ch->avail_cnt = AVAIL;
second_ch = spdk_bdev_get_io_channel(second_desc);
second_bdev_ch = spdk_io_channel_get_ctx(second_ch);
SPDK_CU_ASSERT_FATAL(module_ch == second_bdev_ch->module_ch);
/* Saturate io_target through bdev A. */
for (i = 0; i < AVAIL; i++) {
status[i] = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
CU_ASSERT(rc == 0);
}
CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));
/*
* Now submit I/O through the second bdev. This should fail with ENOMEM
* and then go onto the nomem_io list.
*/
status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(second_desc, second_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
CU_ASSERT(rc == 0);
SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
/* Complete first bdev's I/O. This should retry sending second bdev's nomem_io */
stub_complete_io(g_bdev.io_target, AVAIL);
SPDK_CU_ASSERT_FATAL(TAILQ_EMPTY(&module_ch->nomem_io));
CU_ASSERT(module_ch->io_outstanding == 1);
/* Now complete our retried I/O */
stub_complete_io(g_bdev.io_target, 1);
SPDK_CU_ASSERT_FATAL(module_ch->io_outstanding == 0);
spdk_put_io_channel(io_ch);
spdk_put_io_channel(second_ch);
spdk_bdev_close(second_desc);
unregister_bdev(second_bdev);
free(second_bdev);
poll_threads();
teardown_test();
}
int
main(int argc, char **argv)
{
@@ -596,7 +670,8 @@ main(int argc, char **argv)
CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
CU_add_test(suite, "enomem", enomem) == NULL
CU_add_test(suite, "enomem", enomem) == NULL ||
CU_add_test(suite, "enomem_multi_bdev", enomem_multi_bdev) == NULL
) {
CU_cleanup_registry();
return CU_get_error();