bdev/raid: use split_on_optimal_io_boundary
Set bdev->optimal_io_boundary to the strip size, and set
split_on_optimal_io_boundary = true. This ensures that no I/O submitted to
the raid module crosses a strip boundary, meaning it never needs to be split
across multiple member disks. This is a step towards removing the
iovcnt == 1 limitation. Further improvements and simplifications will be
made in future patches before removing this restriction.

The unit tests are adjusted here as well, so that the I/O they generate does
not span strip boundaries either.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I08943805def673288f552a1b7662a4fbe16f25eb
Reviewed-on: https://review.gerrithub.io/423323
Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
commit 2e6aac525c (parent 4f860d7e40)
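Before the diff itself, a minimal standalone sketch of the invariant the commit message describes. The helper below is hypothetical (it is not part of SPDK) and assumes the strip size, in blocks, is a power of two, as the raid module requires:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* strip_size is in blocks and assumed to be a power of two. */
static bool
io_within_one_strip(uint64_t offset_blocks, uint64_t num_blocks, uint32_t strip_size)
{
	uint64_t start_strip = offset_blocks / strip_size;
	uint64_t end_strip = (offset_blocks + num_blocks - 1) / strip_size;

	return start_strip == end_strip;
}

int
main(void)
{
	/* With strip_size = 8 blocks, an 8-block I/O at offset 4 spans strips
	 * 0 and 1, so the generic bdev layer would split it before the raid
	 * module ever sees it; the same I/O at offset 8 fits in one strip. */
	assert(!io_within_one_strip(4, 8, 8));
	assert(io_within_one_strip(8, 8, 8));
	return 0;
}
```

With optimal_io_boundary set to the strip size and split_on_optimal_io_boundary = true, the generic bdev layer performs this split before raid_bdev_submit_children() runs, which is what lets the first hunk below replace the per-strip loop with a single-strip assertion.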
@@ -327,75 +327,58 @@ raid_bdev_submit_children(struct spdk_bdev_io *bdev_io,
 	uint64_t		pd_lba;
 	uint64_t		pd_blocks;
 	uint32_t		pd_idx;
-	int			ret;
+	int			ret = 0;
 
-	for (uint64_t strip = cur_strip; strip <= end_strip; strip++) {
-		/*
-		 * For each strip of parent bdev io, process for each strip and submit
-		 * child io to bdev layer. Calculate base bdev level start lba, length
-		 * and buffer for this child io
-		 */
-		pd_strip = strip / raid_bdev->num_base_bdevs;
-		pd_idx = strip % raid_bdev->num_base_bdevs;
-		if (strip == start_strip) {
-			offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
-			pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
-			if (strip == end_strip) {
-				pd_blocks = bdev_io->u.bdev.num_blocks;
-			} else {
-				pd_blocks = raid_bdev->strip_size - offset_in_strip;
-			}
-		} else if (strip == end_strip) {
-			pd_lba = pd_strip << raid_bdev->strip_size_shift;
-			pd_blocks = ((bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) &
-				     (raid_bdev->strip_size - 1)) + 1;
-		} else {
-			pd_lba = pd_strip << raid_bdev->strip_size_shift;
-			pd_blocks = raid_bdev->strip_size;
-		}
-		raid_io->splits_comp_outstanding++;
-		assert(raid_io->splits_pending);
-		raid_io->splits_pending--;
-		if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
-			SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
-			assert(0);
-		}
-
-		/*
-		 * Submit child io to bdev layer with using base bdev descriptors, base
-		 * bdev lba, base bdev child io length in blocks, buffer, completion
-		 * function and function callback context
-		 */
-		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
-			ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
-						    raid_ch->base_channel[pd_idx],
-						    buf, pd_lba, pd_blocks, raid_bdev_io_completion,
-						    bdev_io);
-
-		} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
-			ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
-						     raid_ch->base_channel[pd_idx],
-						     buf, pd_lba, pd_blocks, raid_bdev_io_completion,
-						     bdev_io);
-		} else {
-			SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
-			assert(0);
-		}
-		if (ret != 0) {
-			/*
-			 * If failed to submit child io to bdev layer then queue the parent
-			 * bdev io with current active split information in the wait queue
-			 * for that core. This will get resume from this point only. Assume
-			 * if 4 splits are required and 2 childs are submitted, then parent
-			 * io is queued to io waitq of this core and it will get resumed and
-			 * try to submit the remaining 3 and 4 childs
-			 */
-			raid_io->buf = buf;
-			raid_io->splits_comp_outstanding--;
-			raid_io->splits_pending++;
-			return ret;
-		}
-		buf += (pd_blocks << raid_bdev->blocklen_shift);
+	if (start_strip != end_strip) {
+		SPDK_ERRLOG("I/O spans strip boundary\n");
+		assert(false);
+	}
+
+	pd_strip = start_strip / raid_bdev->num_base_bdevs;
+	pd_idx = start_strip % raid_bdev->num_base_bdevs;
+	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
+	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
+	pd_blocks = bdev_io->u.bdev.num_blocks;
+	raid_io->splits_comp_outstanding++;
+	assert(raid_io->splits_pending);
+	raid_io->splits_pending--;
+	if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
+		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
+		assert(0);
+	}
+
+	/*
+	 * Submit child io to bdev layer with using base bdev descriptors, base
+	 * bdev lba, base bdev child io length in blocks, buffer, completion
+	 * function and function callback context
+	 */
+	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+		ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
+					    raid_ch->base_channel[pd_idx],
+					    buf, pd_lba, pd_blocks, raid_bdev_io_completion,
+					    bdev_io);
+	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+		ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
+					     raid_ch->base_channel[pd_idx],
+					     buf, pd_lba, pd_blocks, raid_bdev_io_completion,
+					     bdev_io);
+	} else {
+		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
+		assert(0);
+	}
+	if (ret != 0) {
+		/*
+		 * If failed to submit child io to bdev layer then queue the parent
+		 * bdev io with current active split information in the wait queue
+		 * for that core. This will get resume from this point only. Assume
+		 * if 4 splits are required and 2 childs are submitted, then parent
+		 * io is queued to io waitq of this core and it will get resumed and
+		 * try to submit the remaining 3 and 4 childs
+		 */
+		raid_io->buf = buf;
+		raid_io->splits_comp_outstanding--;
+		raid_io->splits_pending++;
+		return ret;
 	}
 
 	return 0;
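A worked example of the member-disk mapping used above, with illustrative numbers. The standalone program is a sketch, not SPDK code, and assumes strip_size is a power of two so that the mask and shift forms agree:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t num_base_bdevs = 4;     /* member disks */
	uint32_t strip_size = 8;         /* blocks per strip (power of two) */
	uint32_t strip_size_shift = 3;   /* log2(strip_size) */
	uint64_t offset_blocks = 100;    /* parent I/O start, contained in one strip */

	/* Strips are distributed round-robin across the member disks. */
	uint64_t start_strip = offset_blocks >> strip_size_shift;           /* 12 */
	uint64_t pd_strip = start_strip / num_base_bdevs;                   /* 3  */
	uint32_t pd_idx = start_strip % num_base_bdevs;                     /* 0  */
	uint64_t offset_in_strip = offset_blocks & (strip_size - 1);        /* 4  */
	uint64_t pd_lba = (pd_strip << strip_size_shift) + offset_in_strip; /* 28 */

	printf("strip %" PRIu64 " -> member disk %u, lba %" PRIu64 "\n",
	       start_strip, pd_idx, pd_lba);
	return 0;
}
```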
@@ -1257,11 +1240,14 @@ raid_bdev_configure(struct raid_bdev *raid_bdev)
 	raid_bdev_gen = &raid_bdev->bdev;
 	raid_bdev_gen->write_cache = 0;
 	raid_bdev_gen->blocklen = blocklen;
-	raid_bdev_gen->optimal_io_boundary = 0;
+
 	raid_bdev_gen->ctxt = raid_bdev;
 	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
 	raid_bdev_gen->module = &g_raid_if;
 	raid_bdev->strip_size = (raid_bdev->strip_size * 1024) / blocklen;
 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
+	raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
+	raid_bdev_gen->split_on_optimal_io_boundary = true;
+
 	/*
 	 * RAID bdev logic is for striping so take the minimum block count based
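The configure-time arithmetic in this hunk, restated as a hypothetical standalone check. Values are illustrative, and the shift loop is a plain stand-in for SPDK's spdk_u32log2() on a power-of-two value:

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t strip_size_kb = 64;   /* strip size as configured, in KiB */
	uint32_t blocklen = 512;       /* bytes per block */

	/* raid_bdev->strip_size = (raid_bdev->strip_size * 1024) / blocklen; */
	uint32_t strip_size = (strip_size_kb * 1024) / blocklen;   /* 128 blocks */

	/* stand-in for spdk_u32log2() */
	uint32_t strip_size_shift = 0;
	while ((1u << strip_size_shift) < strip_size) {
		strip_size_shift++;
	}

	assert(strip_size == 128);
	assert(strip_size_shift == 7);

	/* The same block count then becomes the split boundary:
	 * raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
	 * raid_bdev_gen->split_on_optimal_io_boundary = true;
	 */
	return 0;
}
```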
@@ -863,7 +863,8 @@ verify_raid_bdev(struct rpc_construct_raid_bdev *r, bool presence, uint32_t raid
 	CU_ASSERT(strcmp(pbdev->bdev.product_name, "Pooled Device") == 0);
 	CU_ASSERT(pbdev->bdev.write_cache == 0);
 	CU_ASSERT(pbdev->bdev.blocklen == g_block_len);
-	CU_ASSERT(pbdev->bdev.optimal_io_boundary == 0);
+	CU_ASSERT(pbdev->bdev.optimal_io_boundary == pbdev->strip_size);
+	CU_ASSERT(pbdev->bdev.split_on_optimal_io_boundary == true);
 	CU_ASSERT(pbdev->bdev.ctxt == pbdev);
 	CU_ASSERT(pbdev->bdev.fn_table == &g_raid_bdev_fn_table);
 	CU_ASSERT(pbdev->bdev.module == &g_raid_if);
@@ -1375,9 +1376,9 @@ test_write_io(void)
 	for (count = 0; count < g_max_qd; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
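The same two-line adjustment recurs in the read, failure, waitq, and multi-raid tests below. A small sketch (hypothetical, not part of the unit test) of why it keeps every generated I/O inside one strip: each I/O now starts on a strip boundary and is at most one strip long:

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int
main(void)
{
	uint64_t strip_size = 64;   /* stands in for g_strip_size, in blocks */
	uint64_t lba = 0;
	int count;

	for (count = 0; count < 1000; count++) {
		/* io_len = (rand() % g_strip_size) + 1; */
		uint64_t io_len = ((uint64_t)rand() % strip_size) + 1;

		/* first and last block of the I/O fall in the same strip */
		assert(lba / strip_size == (lba + io_len - 1) / strip_size);

		/* lba += g_strip_size; advance to the next strip boundary */
		lba += strip_size;
	}
	return 0;
}
```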
@@ -1454,9 +1455,9 @@ test_read_io(void)
 	for (count = 0; count < g_max_qd; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_READ);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1533,9 +1534,9 @@ test_io_failure(void)
 	for (count = 0; count < 1; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_INVALID);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1551,9 +1552,9 @@
 	for (count = 0; count < 1; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1635,10 +1636,10 @@ test_io_waitq(void)
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
 		TAILQ_INSERT_TAIL(&head_io, bdev_io, module_link);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
 		g_bdev_io_submit_status = -ENOMEM;
-		lba += io_len;
+		lba += g_strip_size;
 		raid_bdev_submit_request(ch, bdev_io);
 	}
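test_io_waitq drives the -ENOMEM path described in the raid_bdev_submit_children() comment above. At this point in its history the raid module keeps its own per-core wait queue; the sketch below shows the equivalent retry pattern using the generic spdk_bdev_queue_io_wait() API instead, with a hypothetical context struct and callback:

```c
#include <errno.h>
#include "spdk/bdev.h"

/* Hypothetical per-I/O context; the real module tracks this state in
 * struct raid_bdev_io. */
struct retry_ctx {
	struct spdk_bdev		*bdev;
	struct spdk_io_channel		*ch;
	struct spdk_bdev_io_wait_entry	wait_entry;
};

static void submit_child_io(struct retry_ctx *ctx);

/* Invoked by the bdev layer once resources free up; resume submission
 * from where it left off. */
static void
resubmit_cb(void *arg)
{
	submit_child_io(arg);
}

static void
submit_child_io(struct retry_ctx *ctx)
{
	int rc = -ENOMEM;   /* stand-in for a spdk_bdev_read/write_blocks() result */

	if (rc == -ENOMEM) {
		ctx->wait_entry.bdev = ctx->bdev;
		ctx->wait_entry.cb_fn = resubmit_cb;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->bdev, ctx->ch, &ctx->wait_entry);
	}
}
```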
@@ -1869,7 +1870,7 @@ test_multi_raid_with_io(void)
 	for (count = 0; count < g_max_qd; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		iotype = (rand() % 2) ? SPDK_BDEV_IO_TYPE_WRITE : SPDK_BDEV_IO_TYPE_READ;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
@@ -1882,7 +1883,7 @@ test_multi_raid_with_io(void)
 			}
 		}
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, iotype);
-		lba += io_len;
+		lba += g_strip_size;
 		CU_ASSERT(pbdev != NULL);
 		raid_bdev_submit_request(ch_random, bdev_io);
 		verify_io(bdev_io, g_max_base_drives, ch_ctx_random, pbdev,