bdev/raid: use split_on_optimal_io_boundary

Set bdev->optimal_io_boundary to the strip size, and set
split_on_optimal_io_boundary = true.  This ensures that no I/O
submitted to the raid module crosses a strip boundary, so it never
needs to be split across multiple member disks.

This is a step towards removing the iovcnt == 1
limitation.  Further improvements and simplifications
will be made in future patches before removing this
restriction.

The unit tests are adjusted here so that the I/O they generate does
not span strip boundaries either.
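
For example, the per-I/O generation in the write/read tests now looks
like this (taken from the diff below; g_strip_size is the strip size
in blocks, and lba is assumed to start on a strip boundary):

    /* io_len is at most one strip, and lba advances by a full strip each
     * iteration, so no generated I/O crosses a strip boundary. */
    io_len = (rand() % g_strip_size) + 1;
    bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
    lba += g_strip_size;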

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I08943805def673288f552a1b7662a4fbe16f25eb

Reviewed-on: https://review.gerrithub.io/423323
Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Author: Jim Harris <james.r.harris@intel.com>
Date: 2018-08-23 12:50:53 -07:00
Commit: 2e6aac525c (parent: 4f860d7e40)
2 changed files with 68 additions and 81 deletions


@@ -327,75 +327,58 @@ raid_bdev_submit_children(struct spdk_bdev_io *bdev_io,
uint64_t pd_lba;
uint64_t pd_blocks;
uint32_t pd_idx;
int ret;
int ret = 0;
for (uint64_t strip = cur_strip; strip <= end_strip; strip++) {
if (start_strip != end_strip) {
SPDK_ERRLOG("I/O spans strip boundary\n");
assert(false);
}
pd_strip = start_strip / raid_bdev->num_base_bdevs;
pd_idx = start_strip % raid_bdev->num_base_bdevs;
offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
pd_blocks = bdev_io->u.bdev.num_blocks;
raid_io->splits_comp_outstanding++;
assert(raid_io->splits_pending);
raid_io->splits_pending--;
if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
assert(0);
}
/*
* Submit child io to bdev layer with using base bdev descriptors, base
* bdev lba, base bdev child io length in blocks, buffer, completion
* function and function callback context
*/
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else {
SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
assert(0);
}
if (ret != 0) {
/*
* For each strip of parent bdev io, process for each strip and submit
* child io to bdev layer. Calculate base bdev level start lba, length
* and buffer for this child io
* If failed to submit child io to bdev layer then queue the parent
* bdev io with current active split information in the wait queue
* for that core. This will get resume from this point only. Assume
* if 4 splits are required and 2 childs are submitted, then parent
* io is queued to io waitq of this core and it will get resumed and
* try to submit the remaining 3 and 4 childs
*/
pd_strip = strip / raid_bdev->num_base_bdevs;
pd_idx = strip % raid_bdev->num_base_bdevs;
if (strip == start_strip) {
offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
if (strip == end_strip) {
pd_blocks = bdev_io->u.bdev.num_blocks;
} else {
pd_blocks = raid_bdev->strip_size - offset_in_strip;
}
} else if (strip == end_strip) {
pd_lba = pd_strip << raid_bdev->strip_size_shift;
pd_blocks = ((bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) &
(raid_bdev->strip_size - 1)) + 1;
} else {
pd_lba = pd_strip << raid_bdev->strip_size_shift;
pd_blocks = raid_bdev->strip_size;
}
raid_io->splits_comp_outstanding++;
assert(raid_io->splits_pending);
raid_io->splits_pending--;
if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
assert(0);
}
/*
* Submit child io to bdev layer with using base bdev descriptors, base
* bdev lba, base bdev child io length in blocks, buffer, completion
* function and function callback context
*/
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else {
SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
assert(0);
}
if (ret != 0) {
/*
* If failed to submit child io to bdev layer then queue the parent
* bdev io with current active split information in the wait queue
* for that core. This will get resume from this point only. Assume
* if 4 splits are required and 2 childs are submitted, then parent
* io is queued to io waitq of this core and it will get resumed and
* try to submit the remaining 3 and 4 childs
*/
raid_io->buf = buf;
raid_io->splits_comp_outstanding--;
raid_io->splits_pending++;
return ret;
}
buf += (pd_blocks << raid_bdev->blocklen_shift);
raid_io->buf = buf;
raid_io->splits_comp_outstanding--;
raid_io->splits_pending++;
return ret;
}
return 0;
@@ -1257,11 +1240,14 @@ raid_bdev_configure(struct raid_bdev *raid_bdev)
raid_bdev_gen = &raid_bdev->bdev;
raid_bdev_gen->write_cache = 0;
raid_bdev_gen->blocklen = blocklen;
raid_bdev_gen->optimal_io_boundary = 0;
raid_bdev_gen->ctxt = raid_bdev;
raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
raid_bdev_gen->module = &g_raid_if;
raid_bdev->strip_size = (raid_bdev->strip_size * 1024) / blocklen;
raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
raid_bdev_gen->split_on_optimal_io_boundary = true;
/*
* RAID bdev logic is for striping so take the minimum block count based


@@ -863,7 +863,8 @@ verify_raid_bdev(struct rpc_construct_raid_bdev *r, bool presence, uint32_t raid
CU_ASSERT(strcmp(pbdev->bdev.product_name, "Pooled Device") == 0);
CU_ASSERT(pbdev->bdev.write_cache == 0);
CU_ASSERT(pbdev->bdev.blocklen == g_block_len);
CU_ASSERT(pbdev->bdev.optimal_io_boundary == 0);
CU_ASSERT(pbdev->bdev.optimal_io_boundary == pbdev->strip_size);
CU_ASSERT(pbdev->bdev.split_on_optimal_io_boundary == true);
CU_ASSERT(pbdev->bdev.ctxt == pbdev);
CU_ASSERT(pbdev->bdev.fn_table == &g_raid_bdev_fn_table);
CU_ASSERT(pbdev->bdev.module == &g_raid_if);
@@ -1375,9 +1376,9 @@ test_write_io(void)
for (count = 0; count < g_max_qd; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1454,9 +1455,9 @@ test_read_io(void)
for (count = 0; count < g_max_qd; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_READ);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1533,9 +1534,9 @@ test_io_failure(void)
for (count = 0; count < 1; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_INVALID);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1551,9 +1552,9 @@ test_io_failure(void)
for (count = 0; count < 1; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1635,10 +1636,10 @@ test_io_waitq(void)
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
TAILQ_INSERT_TAIL(&head_io, bdev_io, module_link);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
g_bdev_io_submit_status = -ENOMEM;
lba += io_len;
lba += g_strip_size;
raid_bdev_submit_request(ch, bdev_io);
}
@@ -1869,7 +1870,7 @@ test_multi_raid_with_io(void)
for (count = 0; count < g_max_qd; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
iotype = (rand() % 2) ? SPDK_BDEV_IO_TYPE_WRITE : SPDK_BDEV_IO_TYPE_READ;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
@@ -1882,7 +1883,7 @@ test_multi_raid_with_io(void)
}
}
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, iotype);
lba += io_len;
lba += g_strip_size;
CU_ASSERT(pbdev != NULL);
raid_bdev_submit_request(ch_random, bdev_io);
verify_io(bdev_io, g_max_base_drives, ch_ctx_random, pbdev,