bdev: Change split IOV submission from sequential to batch
Large read I/O will be typical in some use cases such as web stream services. On the other hand, large write I/O may not be typical but will be sufficiently probable. Currently, when a large I/O is submitted to the RAID bdev, the I/O is divided by its strip size and the resulting divided I/Os are submitted sequentially. This patch tries to improve the performance of the RAID bdev for large I/Os. Besides, when the RAID bdev supports higher levels of RAID (such as RAID5), it should issue multiple I/Os to multiple base bdevs in a batched fashion for the parity update. Having experience with batched I/O will be helpful in that future case too. In this patch, split I/Os are submitted as a batch until either all child IOVs are consumed or all data is submitted. If all child IOVs are consumed before all data is submitted, wait until all batched split I/Os complete and then submit again. Test code is added in this patch too. Change-Id: If6cd81cc0c306e3875a93c39dbe4288723b78937 Signed-off-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> Reviewed-on: https://review.gerrithub.io/424770 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com> Reviewed-by: Changpeng Liu <changpeng.liu@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
parent
08ed55bb1d
commit
5616c1ed9c
@ -395,6 +395,9 @@ struct spdk_bdev_io {
|
||||
|
||||
/** current offset of the split I/O in the bdev */
|
||||
uint64_t split_current_offset_blocks;
|
||||
|
||||
/** count of outstanding batched split I/Os */
|
||||
uint32_t split_outstanding;
|
||||
} bdev;
|
||||
struct {
|
||||
/** Channel reference held while messages for this reset are in progress. */
|
||||
|
143
lib/bdev/bdev.c
143
lib/bdev/bdev.c
@ -1238,9 +1238,9 @@ _spdk_bdev_io_split_with_payload(void *_bdev_io)
|
||||
struct spdk_bdev_io *bdev_io = _bdev_io;
|
||||
uint64_t current_offset, remaining;
|
||||
uint32_t blocklen, to_next_boundary, to_next_boundary_bytes;
|
||||
struct iovec *parent_iov;
|
||||
uint64_t parent_iov_offset, child_iov_len;
|
||||
uint32_t parent_iovpos, parent_iovcnt, child_iovcnt;
|
||||
struct iovec *parent_iov, *iov;
|
||||
uint64_t parent_iov_offset, iov_len;
|
||||
uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt;
|
||||
int rc;
|
||||
|
||||
remaining = bdev_io->u.bdev.split_remaining_num_blocks;
|
||||
@ -1257,59 +1257,85 @@ _spdk_bdev_io_split_with_payload(void *_bdev_io)
|
||||
parent_iov_offset -= parent_iov->iov_len;
|
||||
}
|
||||
|
||||
to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary);
|
||||
to_next_boundary = spdk_min(remaining, to_next_boundary);
|
||||
to_next_boundary_bytes = to_next_boundary * blocklen;
|
||||
child_iovcnt = 0;
|
||||
while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
|
||||
child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
|
||||
parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
|
||||
child_iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset);
|
||||
to_next_boundary_bytes -= child_iov_len;
|
||||
while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
|
||||
to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary);
|
||||
to_next_boundary = spdk_min(remaining, to_next_boundary);
|
||||
to_next_boundary_bytes = to_next_boundary * blocklen;
|
||||
iov = &bdev_io->child_iov[child_iovcnt];
|
||||
iovcnt = 0;
|
||||
while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
|
||||
child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
|
||||
parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
|
||||
iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset);
|
||||
to_next_boundary_bytes -= iov_len;
|
||||
|
||||
bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
|
||||
bdev_io->child_iov[child_iovcnt].iov_len = child_iov_len;
|
||||
bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
|
||||
bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
|
||||
|
||||
parent_iovpos++;
|
||||
parent_iov_offset = 0;
|
||||
child_iovcnt++;
|
||||
}
|
||||
if (iov_len < parent_iov->iov_len - parent_iov_offset) {
|
||||
parent_iov_offset += iov_len;
|
||||
} else {
|
||||
parent_iovpos++;
|
||||
parent_iov_offset = 0;
|
||||
}
|
||||
child_iovcnt++;
|
||||
iovcnt++;
|
||||
}
|
||||
|
||||
if (to_next_boundary_bytes > 0) {
|
||||
/* We had to stop this child I/O early because we ran out of
|
||||
* child_iov space. Make sure the iovs collected are valid and
|
||||
* then adjust to_next_boundary before starting the child I/O.
|
||||
*/
|
||||
if ((to_next_boundary_bytes % blocklen) != 0) {
|
||||
SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n",
|
||||
to_next_boundary_bytes, blocklen);
|
||||
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
|
||||
if (bdev_io->u.bdev.split_outstanding == 0) {
|
||||
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
|
||||
}
|
||||
return;
|
||||
}
|
||||
to_next_boundary -= to_next_boundary_bytes / blocklen;
|
||||
}
|
||||
|
||||
bdev_io->u.bdev.split_outstanding++;
|
||||
|
||||
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
|
||||
rc = spdk_bdev_readv_blocks(bdev_io->internal.desc,
|
||||
spdk_io_channel_from_ctx(bdev_io->internal.ch),
|
||||
iov, iovcnt, current_offset, to_next_boundary,
|
||||
_spdk_bdev_io_split_done, bdev_io);
|
||||
} else {
|
||||
rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
|
||||
spdk_io_channel_from_ctx(bdev_io->internal.ch),
|
||||
iov, iovcnt, current_offset, to_next_boundary,
|
||||
_spdk_bdev_io_split_done, bdev_io);
|
||||
}
|
||||
|
||||
if (rc == 0) {
|
||||
current_offset += to_next_boundary;
|
||||
remaining -= to_next_boundary;
|
||||
bdev_io->u.bdev.split_current_offset_blocks = current_offset;
|
||||
bdev_io->u.bdev.split_remaining_num_blocks = remaining;
|
||||
} else {
|
||||
bdev_io->u.bdev.split_outstanding--;
|
||||
if (rc == -ENOMEM) {
|
||||
if (bdev_io->u.bdev.split_outstanding == 0) {
|
||||
/* No I/O is outstanding. Hence we should wait here. */
|
||||
_spdk_bdev_queue_io_wait_with_cb(bdev_io,
|
||||
_spdk_bdev_io_split_with_payload);
|
||||
}
|
||||
} else {
|
||||
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
|
||||
if (bdev_io->u.bdev.split_outstanding == 0) {
|
||||
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (to_next_boundary_bytes > 0) {
|
||||
/* We had to stop this child I/O early because we ran out of
|
||||
* child_iov space. Make sure the iovs collected are valid and
|
||||
* then adjust to_next_boundary before starting the child I/O.
|
||||
*/
|
||||
if ((to_next_boundary_bytes % blocklen) != 0) {
|
||||
SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n",
|
||||
to_next_boundary_bytes, blocklen);
|
||||
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
|
||||
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
|
||||
return;
|
||||
}
|
||||
to_next_boundary -= to_next_boundary_bytes / blocklen;
|
||||
}
|
||||
|
||||
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
|
||||
rc = spdk_bdev_readv_blocks(bdev_io->internal.desc,
|
||||
spdk_io_channel_from_ctx(bdev_io->internal.ch),
|
||||
bdev_io->child_iov, child_iovcnt, current_offset, to_next_boundary,
|
||||
_spdk_bdev_io_split_done, bdev_io);
|
||||
} else {
|
||||
rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
|
||||
spdk_io_channel_from_ctx(bdev_io->internal.ch),
|
||||
bdev_io->child_iov, child_iovcnt, current_offset, to_next_boundary,
|
||||
_spdk_bdev_io_split_done, bdev_io);
|
||||
}
|
||||
|
||||
if (rc == 0) {
|
||||
bdev_io->u.bdev.split_current_offset_blocks += to_next_boundary;
|
||||
bdev_io->u.bdev.split_remaining_num_blocks -= to_next_boundary;
|
||||
} else if (rc == -ENOMEM) {
|
||||
_spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_io_split_with_payload);
|
||||
} else {
|
||||
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
|
||||
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1322,13 +1348,20 @@ _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_ar
|
||||
|
||||
if (!success) {
|
||||
parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
|
||||
parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
|
||||
}
|
||||
parent_io->u.bdev.split_outstanding--;
|
||||
if (parent_io->u.bdev.split_outstanding != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
|
||||
parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
|
||||
parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
|
||||
/*
|
||||
* Parent I/O finishes when all blocks are consumed or there is any failure of
|
||||
* child I/O and no outstanding child I/O.
|
||||
*/
|
||||
if (parent_io->u.bdev.split_remaining_num_blocks == 0 ||
|
||||
parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) {
|
||||
parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
|
||||
parent_io->internal.caller_ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1346,6 +1379,8 @@ _spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
|
||||
|
||||
bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
|
||||
bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
|
||||
bdev_io->u.bdev.split_outstanding = 0;
|
||||
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
|
||||
|
||||
_spdk_bdev_io_split_with_payload(bdev_io);
|
||||
}
|
||||
|
@ -300,7 +300,7 @@ allocate_bdev(char *name)
|
||||
bdev->name = name;
|
||||
bdev->fn_table = &fn_table;
|
||||
bdev->module = &bdev_ut_if;
|
||||
bdev->blockcnt = 256;
|
||||
bdev->blockcnt = 1024;
|
||||
bdev->blocklen = 512;
|
||||
|
||||
rc = spdk_bdev_register(bdev);
|
||||
@ -891,19 +891,10 @@ bdev_io_split(void)
|
||||
CU_ASSERT(rc == 0);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
/* The second child will get submitted once the first child is completed by
|
||||
* stub_complete_io().
|
||||
*/
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
/* Complete the second child I/O. This should result in our callback getting
|
||||
* invoked since the parent I/O is now complete.
|
||||
*/
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
|
||||
stub_complete_io(2);
|
||||
CU_ASSERT(g_io_done == true);
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
|
||||
|
||||
/* Now set up a more complex, multi-vector command that needs to be split,
|
||||
* including splitting iovecs.
|
||||
@ -934,16 +925,8 @@ bdev_io_split(void)
|
||||
CU_ASSERT(rc == 0);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 3);
|
||||
stub_complete_io(3);
|
||||
CU_ASSERT(g_io_done == true);
|
||||
|
||||
/* Test multi vector command that needs to be split by strip and then needs to be
|
||||
@ -983,6 +966,7 @@ bdev_io_split(void)
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == true);
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
|
||||
|
||||
/* Test multi vector command that needs to be split by strip and then needs to be
|
||||
* split further due to the capacity of child iovs, but fails to split. The cause
|
||||
@ -1066,6 +1050,7 @@ bdev_io_split_with_io_wait(void)
|
||||
.bdev_io_pool_size = 2,
|
||||
.bdev_io_cache_size = 1,
|
||||
};
|
||||
struct iovec iov[3];
|
||||
struct ut_expected_io *expected_io;
|
||||
int rc;
|
||||
|
||||
@ -1094,7 +1079,6 @@ bdev_io_split_with_io_wait(void)
|
||||
* Child - Offset 14, length 2, payload 0xF000
|
||||
* Child - Offset 16, length 6, payload 0xF000 + 2 * 512
|
||||
*
|
||||
* This I/O will consume three spdk_bdev_io.
|
||||
* Set up the expected values before calling spdk_bdev_read_blocks
|
||||
*/
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 14, 2, 1);
|
||||
@ -1105,6 +1089,10 @@ bdev_io_split_with_io_wait(void)
|
||||
ut_expected_io_set_iov(expected_io, 0, (void *)(0xF000 + 2 * 512), 6 * 512);
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
|
||||
/* The following children will be submitted sequentially due to the capacity of
|
||||
* spdk_bdev_io.
|
||||
*/
|
||||
|
||||
/* The first child I/O will be queued to wait until an spdk_bdev_io becomes available */
|
||||
rc = spdk_bdev_read_blocks(desc, io_ch, (void *)0xF000, 14, 8, io_done, NULL);
|
||||
CU_ASSERT(rc == 0);
|
||||
@ -1126,6 +1114,56 @@ bdev_io_split_with_io_wait(void)
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
|
||||
|
||||
/* Now set up a more complex, multi-vector command that needs to be split,
|
||||
* including splitting iovecs.
|
||||
*/
|
||||
iov[0].iov_base = (void *)0x10000;
|
||||
iov[0].iov_len = 512;
|
||||
iov[1].iov_base = (void *)0x20000;
|
||||
iov[1].iov_len = 20 * 512;
|
||||
iov[2].iov_base = (void *)0x30000;
|
||||
iov[2].iov_len = 11 * 512;
|
||||
|
||||
g_io_done = false;
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 14, 2, 2);
|
||||
ut_expected_io_set_iov(expected_io, 0, (void *)0x10000, 512);
|
||||
ut_expected_io_set_iov(expected_io, 1, (void *)0x20000, 512);
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 16, 16, 1);
|
||||
ut_expected_io_set_iov(expected_io, 0, (void *)(0x20000 + 512), 16 * 512);
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 32, 14, 2);
|
||||
ut_expected_io_set_iov(expected_io, 0, (void *)(0x20000 + 17 * 512), 3 * 512);
|
||||
ut_expected_io_set_iov(expected_io, 1, (void *)0x30000, 11 * 512);
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
|
||||
rc = spdk_bdev_writev_blocks(desc, io_ch, iov, 3, 14, 32, io_done, NULL);
|
||||
CU_ASSERT(rc == 0);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
/* The following children will be submitted sequentially due to the capacity of
|
||||
* spdk_bdev_io.
|
||||
*/
|
||||
|
||||
/* Completing the first child will submit the second child */
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
/* Completing the second child will submit the third child */
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
/* Completing the third child will result in our callback getting invoked
|
||||
* since the parent I/O is now complete.
|
||||
*/
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == true);
|
||||
|
||||
CU_ASSERT(TAILQ_EMPTY(&g_bdev_ut_channel->expected_io));
|
||||
|
||||
spdk_put_io_channel(io_ch);
|
||||
|
Loading…
x
Reference in New Issue
Block a user