diff --git a/include/spdk_internal/bdev.h b/include/spdk_internal/bdev.h index 40fb9a3365..8db683e5b8 100644 --- a/include/spdk_internal/bdev.h +++ b/include/spdk_internal/bdev.h @@ -271,6 +271,12 @@ struct spdk_bdev_io { /** Status for the IO */ int16_t status; + /** number of blocks remaining in a split i/o */ + uint64_t split_remaining_num_blocks; + + /** current offset of the split I/O in the bdev */ + uint64_t split_current_offset_blocks; + /** * Set to true while the bdev module submit_request function is in progress. * @@ -337,6 +343,9 @@ struct spdk_bdev_io { /** User function that will be called when this completes */ spdk_bdev_io_completion_cb cb; + /** stored user callback in case we split the I/O and use a temporary callback */ + spdk_bdev_io_completion_cb stored_user_cb; + /** Context that will be passed to the completion callback */ void *caller_ctx; diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c index de9b72b841..97b3a6cebb 100644 --- a/lib/bdev/bdev.c +++ b/lib/bdev/bdev.c @@ -58,6 +58,7 @@ int __itt_init_ittlib(const char *, __itt_group_id); #define BUF_SMALL_POOL_SIZE 8192 #define BUF_LARGE_POOL_SIZE 1024 #define NOMEM_THRESHOLD_COUNT 8 +#define ZERO_BUFFER_SIZE 0x100000 typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t; @@ -67,6 +68,8 @@ struct spdk_bdev_mgr { struct spdk_mempool *buf_small_pool; struct spdk_mempool *buf_large_pool; + void *zero_buffer; + TAILQ_HEAD(, spdk_bdev_module_if) bdev_modules; TAILQ_HEAD(, spdk_bdev) bdevs; @@ -150,6 +153,8 @@ struct spdk_bdev_channel { }; +static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + struct spdk_bdev * spdk_bdev_first(void) { @@ -527,6 +532,14 @@ spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg, return; } + g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, + NULL); + if (!g_bdev_mgr.zero_buffer) { + SPDK_ERRLOG("create bdev zero buffer failed\n"); + spdk_bdev_init_complete(-1); + return; + } + #ifdef 
SPDK_CONFIG_VTUNE g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); #endif @@ -579,6 +592,7 @@ spdk_bdev_finish(void) spdk_mempool_free(g_bdev_mgr.bdev_io_pool); spdk_mempool_free(g_bdev_mgr.buf_small_pool); spdk_mempool_free(g_bdev_mgr.buf_large_pool); + spdk_dma_free(g_bdev_mgr.zero_buffer); spdk_io_device_unregister(&g_bdev_mgr, NULL); } @@ -1088,26 +1102,60 @@ spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channe struct spdk_bdev *bdev = desc->bdev; struct spdk_bdev_io *bdev_io; struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + uint64_t len; + bool split_request = false; + + if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) { + SPDK_ERRLOG("length argument out of range in write_zeroes\n"); + return -ERANGE; + } if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { return -EINVAL; } bdev_io = spdk_bdev_get_io(); + if (!bdev_io) { SPDK_ERRLOG("bdev_io memory allocation failed duing write_zeroes\n"); return -ENOMEM; } bdev_io->ch = channel; - bdev_io->u.bdev.iovs = NULL; - bdev_io->u.bdev.iovcnt = 0; - bdev_io->u.bdev.num_blocks = num_blocks; bdev_io->u.bdev.offset_blocks = offset_blocks; - bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; - spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.iovs = NULL; + bdev_io->u.bdev.iovcnt = 0; + } else { + assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); + + len = spdk_bdev_get_block_size(bdev) * num_blocks; + + if (len > ZERO_BUFFER_SIZE) { + split_request = true; + len = ZERO_BUFFER_SIZE; + } + + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer; + bdev_io->u.bdev.iov.iov_len = len; + bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.num_blocks = len / 
spdk_bdev_get_block_size(bdev); + bdev_io->split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks; + bdev_io->split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks; + } + + if (split_request) { + bdev_io->stored_user_cb = cb; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split); + } else { + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + } spdk_bdev_io_submit(bdev_io); return 0; } @@ -1948,6 +1996,44 @@ spdk_bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_ spdk_bdev_free_io(bdev_io); } +/* + * Completion callback for every intermediate write of a write_zeroes request + * that is emulated with writes from the zero buffer. On success it reprograms + * the same bdev_io for the next chunk (at most ZERO_BUFFER_SIZE bytes) and + * resubmits it; on failure it reports the error through the stored user callback. + */ +static void +spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + uint64_t len; + + if (!success) { + /* This function is the registered completion callback, so completing the + * same I/O again would re-enter it; notify the caller directly instead. */ + bdev_io->cb = bdev_io->stored_user_cb; + bdev_io->cb(bdev_io, false, cb_arg); + return; + } + + /* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. */ + len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->split_remaining_num_blocks, + ZERO_BUFFER_SIZE); + + bdev_io->u.bdev.offset_blocks = bdev_io->split_current_offset_blocks; + bdev_io->u.bdev.iov.iov_len = len; + bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev); + bdev_io->split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks; + bdev_io->split_current_offset_blocks += bdev_io->u.bdev.num_blocks; + + /* if this round completes the i/o, change the callback to be the original user callback */ + if (bdev_io->split_remaining_num_blocks == 0) { + spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->stored_user_cb); + } else { + spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split); + } + spdk_bdev_io_submit(bdev_io); +} + void spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io) { diff --git a/test/lib/bdev/bdevio/bdevio.c b/test/lib/bdev/bdevio/bdevio.c index 75cb6e66aa..1fb632de01 100644 --- a/test/lib/bdev/bdevio/bdevio.c +++ b/test/lib/bdev/bdevio/bdevio.c 
@@ -209,6 +209,21 @@ __blockdev_write(void *arg1, void *arg2) } } +static void +__blockdev_write_zeroes(void *arg1, void *arg2) +{ + struct bdevio_request *req = arg1; + struct io_target *target = req->target; + int rc; + + rc = spdk_bdev_write_zeroes(target->bdev_desc, target->ch, req->offset, + req->data_len, quick_test_complete, NULL); + if (rc) { + g_completion_success = false; + wake_ut_thread(); + } +} + static void sgl_chop_buffer(struct bdevio_request *req, int iov_len) { @@ -250,6 +265,22 @@ blockdev_write(struct io_target *target, char *tx_buf, execute_spdk_function(__blockdev_write, &req, NULL); } +static void +blockdev_write_zeroes(struct io_target *target, char *tx_buf, + uint64_t offset, int data_len) +{ + struct bdevio_request req; + + req.target = target; + req.buf = tx_buf; + req.data_len = data_len; + req.offset = offset; + + g_completion_success = false; + + execute_spdk_function(__blockdev_write_zeroes, &req, NULL); +} + static void __blockdev_read(void *arg1, void *arg2) { @@ -303,7 +334,7 @@ blockdev_write_read_data_match(char *rx_buf, char *tx_buf, int data_length) static void blockdev_write_read(uint32_t data_length, uint32_t iov_len, int pattern, uint64_t offset, - int expected_rc) + int expected_rc, bool write_zeroes) { struct io_target *target; char *tx_buf = NULL; @@ -312,22 +343,30 @@ blockdev_write_read(uint32_t data_length, uint32_t iov_len, int pattern, uint64_ target = g_io_targets; while (target != NULL) { - if (data_length < spdk_bdev_get_block_size(target->bdev)) { + if (data_length < spdk_bdev_get_block_size(target->bdev) || + data_length / spdk_bdev_get_block_size(target->bdev) > spdk_bdev_get_num_blocks(target->bdev)) { target = target->next; continue; } - initialize_buffer(&tx_buf, pattern, data_length); - initialize_buffer(&rx_buf, 0, data_length); + if (!write_zeroes) { + initialize_buffer(&tx_buf, pattern, data_length); + initialize_buffer(&rx_buf, 0, data_length); + + blockdev_write(target, tx_buf, offset, data_length, 
iov_len); + } else { + initialize_buffer(&tx_buf, 0, data_length); + initialize_buffer(&rx_buf, pattern, data_length); + + blockdev_write_zeroes(target, tx_buf, offset, data_length); + } - blockdev_write(target, tx_buf, offset, data_length, iov_len); if (expected_rc == 0) { CU_ASSERT_EQUAL(g_completion_success, true); } else { CU_ASSERT_EQUAL(g_completion_success, false); } - blockdev_read(target, rx_buf, offset, data_length, iov_len); if (expected_rc == 0) { @@ -364,7 +403,96 @@ blockdev_write_read_4k(void) * of write and read for all blockdevs is 0. */ expected_rc = 0; - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); +} + +static void +blockdev_write_zeroes_read_4k(void) +{ + uint32_t data_length; + uint64_t offset; + int pattern; + int expected_rc; + + /* Data size = 4K */ + data_length = 4096; + offset = 0; + pattern = 0xA3; + /* Params are valid, hence the expected return value + * of write_zeroes and read for all blockdevs is 0. */ + expected_rc = 0; + + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 1); +} + +/* + * This i/o will not have to split at the bdev layer. + */ +static void +blockdev_write_zeroes_read_1m(void) +{ + uint32_t data_length; + uint64_t offset; + int pattern; + int expected_rc; + + /* Data size = 1M */ + data_length = 1048576; + offset = 0; + pattern = 0xA3; + /* Params are valid, hence the expected return value + * of write_zeroes and read for all blockdevs is 0. */ + expected_rc = 0; + + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 1); +} + +/* + * This i/o will have to split at the bdev layer if + * write-zeroes is not supported by the bdev. 
+ */ +static void +blockdev_write_zeroes_read_3m(void) +{ + uint32_t data_length; + uint64_t offset; + int pattern; + int expected_rc; + + /* Data size = 3M */ + data_length = 3145728; + offset = 0; + pattern = 0xA3; + /* Params are valid, hence the expected return value + * of write_zeroes and read for all blockdevs is 0. */ + expected_rc = 0; + + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 1); +} + +/* + * This i/o will have to split at the bdev layer if + * write-zeroes is not supported by the bdev. It also + * tests a write size that is not an even multiple of + * the bdev layer zero buffer size. + */ +static void +blockdev_write_zeroes_read_3m_500k(void) +{ + uint32_t data_length; + uint64_t offset; + int pattern; + int expected_rc; + + /* Data size = 3.5M */ + data_length = 3670016; + offset = 0; + pattern = 0xA3; + /* Params are valid, hence the expected return value + * of write_zeroes and read for all blockdevs is 0. */ + expected_rc = 0; + + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 1); } static void @@ -385,7 +513,7 @@ blockdev_writev_readv_4k(void) * of write and read for all blockdevs is 0. */ expected_rc = 0; - blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc); + blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0); } static void @@ -406,7 +534,7 @@ blockdev_writev_readv_30x4k(void) * of write and read for all blockdevs is 0. */ expected_rc = 0; - blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc); + blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0); } static void @@ -426,7 +554,7 @@ blockdev_write_read_512Bytes(void) * of write and read for all blockdevs is 0. 
*/ expected_rc = 0; - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); } static void @@ -447,7 +575,7 @@ blockdev_writev_readv_512Bytes(void) * of write and read for all blockdevs is 0. */ expected_rc = 0; - blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc); + blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0); } static void @@ -467,7 +595,7 @@ blockdev_write_read_size_gt_128k(void) * of write and read for all blockdevs is 0. */ expected_rc = 0; - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); } static void @@ -488,7 +616,7 @@ blockdev_writev_readv_size_gt_128k(void) * of write and read for all blockdevs is 0. */ expected_rc = 0; - blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc); + blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0); } static void @@ -509,7 +637,7 @@ blockdev_writev_readv_size_gt_128k_two_iov(void) * of write and read for all blockdevs is 0. 
*/ expected_rc = 0; - blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc); + blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0); } static void @@ -529,7 +657,7 @@ blockdev_write_read_invalid_size(void) * of write and read for all blockdevs is < 0 */ expected_rc = -1; - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); } static void @@ -628,7 +756,7 @@ blockdev_write_read_max_offset(void) * of write and read for all blockdevs is < 0 */ expected_rc = -1; - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); } static void @@ -649,7 +777,7 @@ blockdev_overlapped_write_read_8k(void) expected_rc = 0; /* Assert the write by comparing it with values read * from the same offset for each blockdev */ - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); /* Overwrite the pattern 0xbb of size 8K on an address offset overlapping * with the address written above and assert the new value in @@ -660,7 +788,7 @@ blockdev_overlapped_write_read_8k(void) offset = 4096; /* Assert the write by comparing it with values read * from the overlapped offset for each blockdev */ - blockdev_write_read(data_length, 0, pattern, offset, expected_rc); + blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0); } static void @@ -741,6 +869,10 @@ __run_ut_thread(void *arg1, void *arg2) if ( CU_add_test(suite, "blockdev write read 4k", blockdev_write_read_4k) == NULL + || CU_add_test(suite, "blockdev write zeroes read 4k", blockdev_write_zeroes_read_4k) == NULL + || CU_add_test(suite, "blockdev write zeroes read 1m", blockdev_write_zeroes_read_1m) == NULL + || CU_add_test(suite, "blockdev write zeroes read 3m", blockdev_write_zeroes_read_3m) == NULL + || CU_add_test(suite, "blockdev write 
zeroes read 3.5m", blockdev_write_zeroes_read_3m_500k) == NULL || CU_add_test(suite, "blockdev write read 512 bytes", blockdev_write_read_512Bytes) == NULL || CU_add_test(suite, "blockdev write read size > 128k",