bdev/raid: use split_on_optimal_io_boundary

Set bdev->optimal_io_boundary to the strip size, and set
split_on_optimal_io_boundary = true.  This ensures that no I/O
submitted to the raid module crosses a strip boundary, so it never
needs to be split across multiple member disks.

This is a step towards removing the iovcnt == 1
limitation.  Further improvements and simplifications
will be made in future patches before removing this
restriction.

The unit tests are adjusted here so that the I/O they generate does
not span strip boundaries either.
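
For example, the per-I/O generation in the write/read tests now looks
like this (taken from the diff below; g_strip_size is the strip size
in blocks, and lba is assumed to start on a strip boundary):

    /* io_len is at most one strip, and lba advances by a full strip each
     * iteration, so no generated I/O crosses a strip boundary. */
    io_len = (rand() % g_strip_size) + 1;
    bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
    lba += g_strip_size;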

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I08943805def673288f552a1b7662a4fbe16f25eb

Reviewed-on: https://review.gerrithub.io/423323
Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Author: Jim Harris <james.r.harris@intel.com>
Date: 2018-08-23 12:50:53 -07:00
Commit: 2e6aac525c (parent: 4f860d7e40)
2 changed files with 68 additions and 81 deletions


@@ -327,75 +327,58 @@ raid_bdev_submit_children(struct spdk_bdev_io *bdev_io,
uint64_t pd_lba;
uint64_t pd_blocks;
uint32_t pd_idx;
int ret;
int ret = 0;
for (uint64_t strip = cur_strip; strip <= end_strip; strip++) {
if (start_strip != end_strip) {
SPDK_ERRLOG("I/O spans strip boundary\n");
assert(false);
}
pd_strip = start_strip / raid_bdev->num_base_bdevs;
pd_idx = start_strip % raid_bdev->num_base_bdevs;
offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
pd_blocks = bdev_io->u.bdev.num_blocks;
raid_io->splits_comp_outstanding++;
assert(raid_io->splits_pending);
raid_io->splits_pending--;
if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
assert(0);
}
/*
* Submit child io to bdev layer with using base bdev descriptors, base
* bdev lba, base bdev child io length in blocks, buffer, completion
* function and function callback context
*/
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else {
SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
assert(0);
}
if (ret != 0) {
/*
* For each strip of parent bdev io, process for each strip and submit
* child io to bdev layer. Calculate base bdev level start lba, length
* and buffer for this child io
* If failed to submit child io to bdev layer then queue the parent
* bdev io with current active split information in the wait queue
* for that core. This will get resume from this point only. Assume
* if 4 splits are required and 2 childs are submitted, then parent
* io is queued to io waitq of this core and it will get resumed and
* try to submit the remaining 3 and 4 childs
*/
pd_strip = strip / raid_bdev->num_base_bdevs;
pd_idx = strip % raid_bdev->num_base_bdevs;
if (strip == start_strip) {
offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
if (strip == end_strip) {
pd_blocks = bdev_io->u.bdev.num_blocks;
} else {
pd_blocks = raid_bdev->strip_size - offset_in_strip;
}
} else if (strip == end_strip) {
pd_lba = pd_strip << raid_bdev->strip_size_shift;
pd_blocks = ((bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) &
(raid_bdev->strip_size - 1)) + 1;
} else {
pd_lba = pd_strip << raid_bdev->strip_size_shift;
pd_blocks = raid_bdev->strip_size;
}
raid_io->splits_comp_outstanding++;
assert(raid_io->splits_pending);
raid_io->splits_pending--;
if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
assert(0);
}
/*
* Submit child io to bdev layer with using base bdev descriptors, base
* bdev lba, base bdev child io length in blocks, buffer, completion
* function and function callback context
*/
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
raid_ch->base_channel[pd_idx],
buf, pd_lba, pd_blocks, raid_bdev_io_completion,
bdev_io);
} else {
SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
assert(0);
}
if (ret != 0) {
/*
* If failed to submit child io to bdev layer then queue the parent
* bdev io with current active split information in the wait queue
* for that core. This will get resume from this point only. Assume
* if 4 splits are required and 2 childs are submitted, then parent
* io is queued to io waitq of this core and it will get resumed and
* try to submit the remaining 3 and 4 childs
*/
raid_io->buf = buf;
raid_io->splits_comp_outstanding--;
raid_io->splits_pending++;
return ret;
}
buf += (pd_blocks << raid_bdev->blocklen_shift);
raid_io->buf = buf;
raid_io->splits_comp_outstanding--;
raid_io->splits_pending++;
return ret;
}
return 0;
@@ -1257,11 +1240,14 @@ raid_bdev_configure(struct raid_bdev *raid_bdev)
raid_bdev_gen = &raid_bdev->bdev;
raid_bdev_gen->write_cache = 0;
raid_bdev_gen->blocklen = blocklen;
raid_bdev_gen->optimal_io_boundary = 0;
raid_bdev_gen->ctxt = raid_bdev;
raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
raid_bdev_gen->module = &g_raid_if;
raid_bdev->strip_size = (raid_bdev->strip_size * 1024) / blocklen;
raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
raid_bdev_gen->split_on_optimal_io_boundary = true;
/*
* RAID bdev logic is for striping so take the minimum block count based


@@ -863,7 +863,8 @@ verify_raid_bdev(struct rpc_construct_raid_bdev *r, bool presence, uint32_t raid
CU_ASSERT(strcmp(pbdev->bdev.product_name, "Pooled Device") == 0);
CU_ASSERT(pbdev->bdev.write_cache == 0);
CU_ASSERT(pbdev->bdev.blocklen == g_block_len);
CU_ASSERT(pbdev->bdev.optimal_io_boundary == 0);
CU_ASSERT(pbdev->bdev.optimal_io_boundary == pbdev->strip_size);
CU_ASSERT(pbdev->bdev.split_on_optimal_io_boundary == true);
CU_ASSERT(pbdev->bdev.ctxt == pbdev);
CU_ASSERT(pbdev->bdev.fn_table == &g_raid_bdev_fn_table);
CU_ASSERT(pbdev->bdev.module == &g_raid_if);
@@ -1375,9 +1376,9 @@ test_write_io(void)
for (count = 0; count < g_max_qd; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1454,9 +1455,9 @@ test_read_io(void)
for (count = 0; count < g_max_qd; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_READ);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1533,9 +1534,9 @@ test_io_failure(void)
for (count = 0; count < 1; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_INVALID);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1551,9 +1552,9 @@ test_io_failure(void)
for (count = 0; count < 1; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
lba += io_len;
lba += g_strip_size;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
raid_bdev_submit_request(ch, bdev_io);
@@ -1635,10 +1636,10 @@ test_io_waitq(void)
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
TAILQ_INSERT_TAIL(&head_io, bdev_io, module_link);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
g_bdev_io_submit_status = -ENOMEM;
lba += io_len;
lba += g_strip_size;
raid_bdev_submit_request(ch, bdev_io);
}
@@ -1869,7 +1870,7 @@ test_multi_raid_with_io(void)
for (count = 0; count < g_max_qd; count++) {
bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
io_len = (rand() % g_max_io_size) + 1;
io_len = (rand() % g_strip_size) + 1;
iotype = (rand() % 2) ? SPDK_BDEV_IO_TYPE_WRITE : SPDK_BDEV_IO_TYPE_READ;
memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
g_io_output_index = 0;
@@ -1882,7 +1883,7 @@ test_multi_raid_with_io(void)
}
}
bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, iotype);
lba += io_len;
lba += g_strip_size;
CU_ASSERT(pbdev != NULL);
raid_bdev_submit_request(ch_random, bdev_io);
verify_io(bdev_io, g_max_base_drives, ch_ctx_random, pbdev,