bdev: share nomem_io data between bdevs built on the same device

When there are two bdevs built on the same io_device,
it is possible that one bdev entirely saturates the
underlying queue, not letting the second bdev issue
a single I/O. The second bdev will then silently fail
each subsequent I/O and append it to its nomem_io list.
However, since we resend I/O only from the I/O completion
callback and there is no outstanding I/O for that bdev
(io_outstanding == 0), the I/O will never be resent and
will be stuck in nomem_io forever.
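
For illustration, here is a condensed sketch of the pre-patch
completion path (the io_complete() wrapper name is invented for
this sketch; the fields, TAILQ macros and _spdk_bdev_ch_retry_io()
are the ones visible in the diff below):

    /* Condensed sketch of the old per-channel retry trigger. */
    static void
    io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
    {
            struct spdk_bdev_channel *bdev_ch = bdev_io->ch;

            bdev_ch->io_outstanding--;
            if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
                    /* Park the I/O on this channel's private queue. */
                    TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
                    return;
            }
            if (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
                    /* Retries are driven only by completions on the same
                     * channel. A bdev whose every submission hit NOMEM has
                     * io_outstanding == 0, so it never reaches this point
                     * and its queued I/O stays stuck. */
                    _spdk_bdev_ch_retry_io(bdev_ch);
            }
    }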

This patch makes the nomem_io list shared between
bdevs built on the same device, so an I/O completion
callback from one bdev can now retry sending queued
I/O from another bdev.

The shared bdev data is kept in the thread-local
bdev_mgmt_channel, so it doesn't need any external
synchronization.
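
The lookup of that shared data, condensed below from
spdk_bdev_channel_create() in the diff (declarations kept, error
paths and unrelated fields omitted), shows why no lock is taken:
mgmt_ch is this thread's own channel context, so module_channels
is only ever touched from one thread.

    struct spdk_bdev_mgmt_channel *mgmt_ch;
    struct spdk_bdev_module_channel *shared_ch;

    /* Condensed from spdk_bdev_channel_create(); error handling omitted. */
    mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);   /* thread-local */
    TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
            if (shared_ch->module_ch == ch->channel) {
                    /* Another bdev on the same io_device already created it. */
                    shared_ch->ref++;
                    break;
            }
    }
    if (shared_ch == NULL) {
            shared_ch = calloc(1, sizeof(*shared_ch));
            TAILQ_INIT(&shared_ch->nomem_io);
            shared_ch->module_ch = ch->channel;
            shared_ch->ref = 1;
            TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
    }
    ch->module_ch = shared_ch;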

Change-Id: Ia5ac3a1627ce3de4087e43907c329aa7d07ed7c7
Signed-off-by: Dariusz Stojaczyk <dariuszx.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/394658
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ziye Yang <optimistyzy@gmail.com>

@@ -113,6 +113,8 @@ struct spdk_bdev_mgmt_channel {
*/
bdev_io_stailq_t per_thread_cache;
uint32_t per_thread_cache_count;
TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
};
struct spdk_bdev_desc {
@@ -136,14 +138,33 @@ struct spdk_bdev_channel {
struct spdk_bdev_io_stat stat;
bdev_io_tailq_t queued_resets;
uint32_t flags;
/* Per-device channel */
struct spdk_bdev_module_channel *module_ch;
#ifdef SPDK_CONFIG_VTUNE
uint64_t start_tsc;
uint64_t interval_tsc;
__itt_string_handle *handle;
#endif
};
/*
* Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
* will queue their IO awaiting retry here. It makes it possible to retry sending
* IO to one bdev after IO from another bdev completes.
*/
struct spdk_bdev_module_channel {
/*
* Count of I/O submitted to bdev module and waiting for completion.
* Incremented before submit_request() is called on an spdk_bdev_io.
*/
uint64_t io_outstanding;
bdev_io_tailq_t queued_resets;
/*
* Queue of IO awaiting retry because of a previous NOMEM status returned
* on this channel.
@@ -155,14 +176,12 @@ struct spdk_bdev_channel {
*/
uint64_t nomem_threshold;
uint32_t flags;
/* I/O channel allocated by a bdev module */
struct spdk_io_channel *module_ch;
#ifdef SPDK_CONFIG_VTUNE
uint64_t start_tsc;
uint64_t interval_tsc;
__itt_string_handle *handle;
#endif
uint32_t ref;
TAILQ_ENTRY(spdk_bdev_module_channel) link;
};
static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
@@ -379,6 +398,8 @@ spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
STAILQ_INIT(&ch->per_thread_cache);
ch->per_thread_cache_count = 0;
TAILQ_INIT(&ch->module_channels);
return 0;
}
@@ -782,17 +803,18 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
struct spdk_bdev *bdev = bdev_io->bdev;
struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
struct spdk_io_channel *ch = bdev_ch->channel;
struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
bdev_ch->io_outstanding++;
shared_ch->io_outstanding++;
bdev_io->in_submit_request = true;
if (spdk_likely(bdev_ch->flags == 0)) {
if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) {
bdev->fn_table->submit_request(ch, bdev_io);
} else {
bdev_ch->io_outstanding--;
TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
shared_ch->io_outstanding--;
TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link);
}
} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
@@ -851,6 +873,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
struct spdk_bdev *bdev = io_device;
struct spdk_bdev_channel *ch = ctx_buf;
struct spdk_bdev_mgmt_channel *mgmt_ch;
struct spdk_bdev_module_channel *shared_ch;
ch->bdev = io_device;
ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
@@ -864,12 +888,34 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
return -1;
}
mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);
TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
if (shared_ch->module_ch == ch->channel) {
shared_ch->ref++;
break;
}
}
if (shared_ch == NULL) {
shared_ch = calloc(1, sizeof(*shared_ch));
if (!shared_ch) {
spdk_put_io_channel(ch->channel);
spdk_put_io_channel(ch->mgmt_channel);
return -1;
}
shared_ch->io_outstanding = 0;
TAILQ_INIT(&shared_ch->nomem_io);
shared_ch->nomem_threshold = 0;
shared_ch->module_ch = ch->channel;
shared_ch->ref = 1;
TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
}
memset(&ch->stat, 0, sizeof(ch->stat));
ch->io_outstanding = 0;
TAILQ_INIT(&ch->queued_resets);
TAILQ_INIT(&ch->nomem_io);
ch->nomem_threshold = 0;
ch->flags = 0;
ch->module_ch = shared_ch;
#ifdef SPDK_CONFIG_VTUNE
{
@@ -935,7 +981,7 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
* that spdk_bdev_io_complete() will do.
*/
if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
ch->io_outstanding++;
ch->module_ch->io_outstanding++;
}
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
}
@@ -947,17 +993,24 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
struct spdk_bdev_channel *ch = ctx_buf;
struct spdk_bdev_mgmt_channel *mgmt_channel;
struct spdk_bdev_module_channel *shared_ch = ch->module_ch;
mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);
assert(shared_ch->ref > 0);
shared_ch->ref--;
if (shared_ch->ref == 0) {
assert(shared_ch->io_outstanding == 0);
TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link);
free(shared_ch);
}
spdk_put_io_channel(ch->channel);
spdk_put_io_channel(ch->mgmt_channel);
assert(ch->io_outstanding == 0);
}
int
@@ -1502,14 +1555,16 @@ _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
struct spdk_io_channel *ch;
struct spdk_bdev_channel *channel;
struct spdk_bdev_mgmt_channel *mgmt_channel;
struct spdk_bdev_module_channel *shared_ch;
ch = spdk_io_channel_iter_get_channel(i);
channel = spdk_io_channel_get_ctx(ch);
mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
shared_ch = channel->module_ch;
channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
@@ -1724,9 +1779,10 @@ static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
struct spdk_bdev *bdev = bdev_ch->bdev;
struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
struct spdk_bdev_io *bdev_io;
if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
/*
* Allow some more I/O to complete before retrying the nomem_io queue.
* Some drivers (such as nvme) cannot immediately take a new I/O in
@@ -1738,12 +1794,12 @@ _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
return;
}
while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
bdev_ch->io_outstanding++;
while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
shared_ch->io_outstanding++;
bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
break;
}
@@ -1791,6 +1847,7 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
{
struct spdk_bdev *bdev = bdev_io->bdev;
struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
bdev_io->status = status;
@@ -1813,22 +1870,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
return;
}
} else {
assert(bdev_ch->io_outstanding > 0);
bdev_ch->io_outstanding--;
assert(shared_ch->io_outstanding > 0);
shared_ch->io_outstanding--;
if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
_spdk_bdev_ch_retry_io(bdev_ch);
}
} else {
TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
/*
* Wait for some of the outstanding I/O to complete before we
* retry any of the nomem_io. Normally we will wait for
* NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
* depth channels we will instead wait for half to complete.
*/
bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
(int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
(int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
return;
}
}

@@ -486,6 +486,7 @@ enomem(void)
{
struct spdk_io_channel *io_ch;
struct spdk_bdev_channel *bdev_ch;
struct spdk_bdev_module_channel *module_ch;
struct ut_bdev_channel *ut_ch;
const uint32_t IO_ARRAY_SIZE = 64;
const uint32_t AVAIL = 20;
@@ -499,6 +500,7 @@ enomem(void)
set_thread(0);
io_ch = spdk_bdev_get_io_channel(g_desc);
bdev_ch = spdk_io_channel_get_ctx(io_ch);
module_ch = bdev_ch->module_ch;
ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
ut_ch->avail_cnt = AVAIL;
@@ -508,7 +510,7 @@ enomem(void)
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
CU_ASSERT(rc == 0);
}
CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));
/*
* Next, submit one additional I/O. This one should fail with ENOMEM and then go onto
@@ -517,8 +519,8 @@ enomem(void)
status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
CU_ASSERT(rc == 0);
SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
first_io = TAILQ_FIRST(&module_ch->nomem_io);
/*
* Now submit a bunch more I/O. These should all fail with ENOMEM and get queued behind
@@ -531,10 +533,10 @@ enomem(void)
}
/* Assert that first_io is still at the head of the list. */
CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
CU_ASSERT(TAILQ_FIRST(&module_ch->nomem_io) == first_io);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
CU_ASSERT(module_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
/*
* Complete 1 I/O only. The key check here is bdev_io_tailq_cnt - this should not have
@@ -542,19 +544,19 @@ enomem(void)
* list.
*/
stub_complete_io(g_bdev.io_target, 1);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);
/*
* Complete enough I/O to hit the nomem_threshold. This should trigger retrying nomem_io,
* and we should see I/O get resubmitted to the test bdev module.
*/
stub_complete_io(g_bdev.io_target, NOMEM_THRESHOLD_COUNT - 1);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) < nomem_cnt);
nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
/* Complete 1 I/O only. This should not trigger retrying the queued nomem_io. */
stub_complete_io(g_bdev.io_target, 1);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);
/*
* Send a reset and confirm that all I/O are completed, including the ones that
@@ -567,14 +569,86 @@ enomem(void)
/* This will complete the reset. */
stub_complete_io(g_bdev.io_target, 0);
CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
CU_ASSERT(bdev_ch->io_outstanding == 0);
CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == 0);
CU_ASSERT(module_ch->io_outstanding == 0);
spdk_put_io_channel(io_ch);
poll_threads();
teardown_test();
}
static void
enomem_multi_bdev(void)
{
struct spdk_io_channel *io_ch;
struct spdk_bdev_channel *bdev_ch;
struct spdk_bdev_module_channel *module_ch;
struct ut_bdev_channel *ut_ch;
const uint32_t IO_ARRAY_SIZE = 64;
const uint32_t AVAIL = 20;
enum spdk_bdev_io_status status[IO_ARRAY_SIZE];
uint32_t i;
struct ut_bdev *second_bdev;
struct spdk_bdev_desc *second_desc;
struct spdk_bdev_channel *second_bdev_ch;
struct spdk_io_channel *second_ch;
int rc;
setup_test();
/* Register second bdev with the same io_target */
second_bdev = calloc(1, sizeof(*second_bdev));
SPDK_CU_ASSERT_FATAL(second_bdev != NULL);
register_bdev(second_bdev, "ut_bdev2", g_bdev.io_target);
spdk_bdev_open(&second_bdev->bdev, true, NULL, NULL, &second_desc);
set_thread(0);
io_ch = spdk_bdev_get_io_channel(g_desc);
bdev_ch = spdk_io_channel_get_ctx(io_ch);
module_ch = bdev_ch->module_ch;
ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
ut_ch->avail_cnt = AVAIL;
second_ch = spdk_bdev_get_io_channel(second_desc);
second_bdev_ch = spdk_io_channel_get_ctx(second_ch);
SPDK_CU_ASSERT_FATAL(module_ch == second_bdev_ch->module_ch);
/* Saturate io_target through bdev A. */
for (i = 0; i < AVAIL; i++) {
status[i] = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
CU_ASSERT(rc == 0);
}
CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));
/*
* Now submit I/O through the second bdev. This should fail with ENOMEM
* and then go onto the nomem_io list.
*/
status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(second_desc, second_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
CU_ASSERT(rc == 0);
SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
/* Complete first bdev's I/O. This should retry sending second bdev's nomem_io */
stub_complete_io(g_bdev.io_target, AVAIL);
SPDK_CU_ASSERT_FATAL(TAILQ_EMPTY(&module_ch->nomem_io));
CU_ASSERT(module_ch->io_outstanding == 1);
/* Now complete our retried I/O */
stub_complete_io(g_bdev.io_target, 1);
SPDK_CU_ASSERT_FATAL(module_ch->io_outstanding == 0);
spdk_put_io_channel(io_ch);
spdk_put_io_channel(second_ch);
spdk_bdev_close(second_desc);
unregister_bdev(second_bdev);
free(second_bdev);
poll_threads();
teardown_test();
}
int
main(int argc, char **argv)
{
@@ -596,7 +670,8 @@ main(int argc, char **argv)
CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
CU_add_test(suite, "enomem", enomem) == NULL
CU_add_test(suite, "enomem", enomem) == NULL ||
CU_add_test(suite, "enomem_multi_bdev", enomem_multi_bdev) == NULL
) {
CU_cleanup_registry();
return CU_get_error();