diff --git a/include/spdk/bdev.h b/include/spdk/bdev.h
index f09f34c7e6..477a0e324a 100644
--- a/include/spdk/bdev.h
+++ b/include/spdk/bdev.h
@@ -138,6 +138,7 @@ enum spdk_bdev_io_type {
 	SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT,
 	SPDK_BDEV_IO_TYPE_ZONE_APPEND,
 	SPDK_BDEV_IO_TYPE_COMPARE,
+	SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE,
 	SPDK_BDEV_NUM_IO_TYPES /* Keep last */
 };
 
@@ -1125,6 +1126,46 @@ int spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_i
 				      uint64_t offset_blocks, uint64_t num_blocks,
 				      spdk_bdev_io_completion_cb cb, void *cb_arg);
 
+/**
+ * Submit an atomic compare-and-write request to the bdev on the given channel.
+ * For bdevs that do not natively support atomic compare-and-write, the bdev layer
+ * will quiesce I/O to the specified LBA range, before performing the read,
+ * compare and write operations.
+ *
+ * Currently this supports compare-and-write of only one block.
+ *
+ * The data buffers for both the compare and write operations are described in a
+ * scatter gather list. Some physical devices place memory alignment requirements on
+ * data and may not be able to directly transfer out of the buffers provided. In
+ * this case, the request may fail.
+ *
+ * \ingroup bdev_io_submit_functions
+ *
+ * \param desc Block device descriptor.
+ * \param ch I/O channel. Obtained by calling spdk_bdev_get_io_channel().
+ * \param compare_iov A scatter gather list of buffers to be compared.
+ * \param compare_iovcnt The number of elements in compare_iov.
+ * \param write_iov A scatter gather list of buffers to be written if the compare is
+ *                  successful.
+ * \param write_iovcnt The number of elements in write_iov.
+ * \param offset_blocks The offset, in blocks, from the start of the block device.
+ * \param num_blocks The number of blocks to compare-and-write.
+ * \param cb Called when the request is complete.
+ * \param cb_arg Argument passed to cb.
+ *
+ * \return 0 on success. On success, the callback will always
+ * be called (even if the request ultimately failed). Return
+ * negated errno on failure, in which case the callback will not be called.
+ *   * -EINVAL - offset_blocks and/or num_blocks are out of range
+ *   * -ENOMEM - spdk_bdev_io buffer cannot be allocated
+ *   * -EBADF - desc not open for writing
+ */
+int spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+					 struct iovec *compare_iov, int compare_iovcnt,
+					 struct iovec *write_iov, int write_iovcnt,
+					 uint64_t offset_blocks, uint64_t num_blocks,
+					 spdk_bdev_io_completion_cb cb, void *cb_arg);
+
 /**
  * Submit a request to acquire a data buffer that represents the given
  * range of blocks.  The data buffer is placed in the spdk_bdev_io structure
diff --git a/include/spdk/bdev_module.h b/include/spdk/bdev_module.h
index 8f84f0c087..9b4186bc41 100644
--- a/include/spdk/bdev_module.h
+++ b/include/spdk/bdev_module.h
@@ -476,6 +476,14 @@ struct spdk_bdev_io {
 			/** For SG buffer cases, number of iovecs in iovec array. */
 			int iovcnt;
 
+			/** For fused operations such as COMPARE_AND_WRITE, array of iovecs
+			 *  for the second operation.
+			 */
+			struct iovec *fused_iovs;
+
+			/** Number of iovecs in fused_iovs. */
+			int fused_iovcnt;
+
 			/* Metadata buffer */
 			void *md_buf;
 
diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index 82d883e803..a733ba4142 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -3565,6 +3565,144 @@ spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_cha
 					   cb, cb_arg);
 }
 
+/*
+ * Completion callback for the write half of an emulated compare-and-write.
+ * Frees the child write I/O and completes the parent with the write's result.
+ */
+static void
+bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_bdev_io *parent_io = cb_arg;
+
+	spdk_bdev_free_io(bdev_io);
+
+	if (success) {
+		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+	} else {
+		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+	}
+
+	/* The success flag handed to the caller must agree with internal.status. */
+	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
+}
+
+/*
+ * Issue the write half of an emulated compare-and-write from the parent's
+ * fused iovecs. On -ENOMEM the parent is passed to bdev_queue_io_wait_with_cb()
+ * with this function as the callback; any other error fails the parent.
+ */
+static void
+bdev_compare_and_write_do_write(void *_bdev_io)
+{
+	struct spdk_bdev_io *bdev_io = _bdev_io;
+	int rc;
+
+	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
+				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
+				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
+				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+				     bdev_compare_and_write_do_write_done, bdev_io);
+
+	if (rc == -ENOMEM) {
+		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
+	} else if (rc != 0) {
+		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+	}
+}
+
+/*
+ * Completion callback for the compare half of an emulated compare-and-write.
+ * Frees the child compare I/O. On failure the parent is completed with
+ * MISCOMPARE status; otherwise the write half is started.
+ */
+static void
+bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_bdev_io *parent_io = cb_arg;
+
+	spdk_bdev_free_io(bdev_io);
+
+	if (!success) {
+		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
+		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
+		return;
+	}
+
+	bdev_compare_and_write_do_write(parent_io);
+}
+
+/*
+ * Issue the compare half of an emulated compare-and-write from the parent's
+ * compare iovecs. On -ENOMEM the parent is passed to bdev_queue_io_wait_with_cb()
+ * with this function as the callback; any other error fails the parent.
+ */
+static void
+bdev_compare_and_write_do_compare(void *_bdev_io)
+{
+	struct spdk_bdev_io *bdev_io = _bdev_io;
+	int rc;
+
+	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
+				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
+				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+				       bdev_compare_and_write_do_compare_done, bdev_io);
+
+	if (rc == -ENOMEM) {
+		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
+	} else if (rc != 0) {
+		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+	}
+}
+
+int
+spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+				     struct iovec *compare_iov, int compare_iovcnt,
+				     struct iovec *write_iov, int write_iovcnt,
+				     uint64_t offset_blocks, uint64_t num_blocks,
+				     spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+	struct spdk_bdev_io *bdev_io;
+	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+	if (!desc->write) {
+		return -EBADF;
+	}
+
+	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+		return -EINVAL;
+	}
+
+	bdev_io = bdev_channel_get_io(channel);
+	if (!bdev_io) {
+		return -ENOMEM;
+	}
+
+	bdev_io->internal.ch = channel;
+	bdev_io->internal.desc = desc;
+	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
+	bdev_io->u.bdev.iovs = compare_iov;
+	bdev_io->u.bdev.iovcnt = compare_iovcnt;
+	bdev_io->u.bdev.fused_iovs = write_iov;
+	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
+	bdev_io->u.bdev.md_buf = NULL;
+	bdev_io->u.bdev.num_blocks = num_blocks;
+	bdev_io->u.bdev.offset_blocks = offset_blocks;
+	bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+	/* Hand the request to the module when it supports the fused operation. */
+	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
+		bdev_io_submit(bdev_io);
+		return 0;
+	}
+
+	/* Otherwise emulate it as a compare followed by a write. */
+	bdev_compare_and_write_do_compare(bdev_io);
+	return 0;
+}
+
 static void
 bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 {
diff --git a/test/unit/lib/bdev/bdev.c/bdev_ut.c b/test/unit/lib/bdev/bdev.c/bdev_ut.c
index 6191113ada..c3e4a79d4e 100644
--- a/test/unit/lib/bdev/bdev.c/bdev_ut.c
+++ b/test/unit/lib/bdev/bdev.c/bdev_ut.c
@@ -116,6 +116,10 @@ static enum spdk_bdev_io_status g_io_status;
 static enum spdk_bdev_io_status g_io_exp_status = SPDK_BDEV_IO_STATUS_SUCCESS;
 static uint32_t g_bdev_ut_io_device;
 static struct bdev_ut_channel *g_bdev_ut_channel;
+static void *g_compare_read_buf;
+static uint32_t g_compare_read_buf_len;
+static void *g_compare_write_buf;
+static uint32_t g_compare_write_buf_len;
 
 static struct ut_expected_io *
 ut_alloc_expected_io(uint8_t type, uint64_t offset, uint64_t length, int iovcnt)
@@ -150,6 +154,22 @@ stub_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
 
 	g_bdev_io = bdev_io;
 
+	if (g_compare_read_buf && bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+		uint32_t len = bdev_io->u.bdev.iovs[0].iov_len;
+
+		CU_ASSERT(bdev_io->u.bdev.iovcnt == 1);
+		CU_ASSERT(g_compare_read_buf_len == len);
+		memcpy(bdev_io->u.bdev.iovs[0].iov_base, g_compare_read_buf, len);
+	}
+
+	if (g_compare_write_buf && bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+		uint32_t len = bdev_io->u.bdev.iovs[0].iov_len;
+
+		CU_ASSERT(bdev_io->u.bdev.iovcnt == 1);
+		CU_ASSERT(g_compare_write_buf_len == len);
+		memcpy(g_compare_write_buf, bdev_io->u.bdev.iovs[0].iov_base, len);
+	}
+
 	TAILQ_INSERT_TAIL(&ch->outstanding_io, bdev_io, module_link);
 	ch->outstanding_io_count++;
 
@@ -1917,6 +1937,7 @@ bdev_io_alignment(void)
 	spdk_put_io_channel(io_ch);
 	spdk_bdev_close(desc);
 	free_bdev(bdev);
+	fn_table.submit_request = stub_submit_request;
 	spdk_bdev_finish(bdev_fini_cb, NULL);
 	poll_threads();
 
@@ -2054,6 +2075,7 @@ bdev_io_alignment_with_boundary(void)
 	spdk_put_io_channel(io_ch);
 	spdk_bdev_close(desc);
 	free_bdev(bdev);
+	fn_table.submit_request = stub_submit_request;
 	spdk_bdev_finish(bdev_fini_cb, NULL);
 	poll_threads();
 
@@ -2168,6 +2190,100 @@ bdev_histograms(void)
 	poll_threads();
 }
 
+static void
+bdev_compare_and_write(void)
+{
+	struct spdk_bdev *bdev;
+	struct spdk_bdev_desc *desc = NULL;
+	struct spdk_io_channel *ioch;
+	struct ut_expected_io *expected_io;
+	uint64_t offset, num_blocks;
+	uint32_t num_completed;
+	char aa_buf[512];
+	char bb_buf[512];
+	char cc_buf[512];
+	char write_buf[512];
+	struct iovec compare_iov;
+	struct iovec write_iov;
+	int rc;
+
+	memset(aa_buf, 0xaa, sizeof(aa_buf));
+	memset(bb_buf, 0xbb, sizeof(bb_buf));
+	memset(cc_buf, 0xcc, sizeof(cc_buf));
+
+	spdk_bdev_initialize(bdev_init_cb, NULL);
+	fn_table.submit_request = stub_submit_request_get_buf;
+	bdev = allocate_bdev("bdev");
+
+	rc = spdk_bdev_open(bdev, true, NULL, NULL, &desc);
+	CU_ASSERT_EQUAL(rc, 0);
+	SPDK_CU_ASSERT_FATAL(desc != NULL);
+	ioch = spdk_bdev_get_io_channel(desc);
+	SPDK_CU_ASSERT_FATAL(ioch != NULL);
+
+	fn_table.submit_request = stub_submit_request_get_buf;
+	g_io_exp_status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+	offset = 50;
+	num_blocks = 1;
+	compare_iov.iov_base = aa_buf;
+	compare_iov.iov_len = sizeof(aa_buf);
+	write_iov.iov_base = bb_buf;
+	write_iov.iov_len = sizeof(bb_buf);
+
+	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, offset, num_blocks, 0);
+	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
+	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, offset, num_blocks, 0);
+	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
+
+	g_io_done = false;
+	g_compare_read_buf = aa_buf;
+	g_compare_read_buf_len = sizeof(aa_buf);
+	memset(write_buf, 0, sizeof(write_buf));
+	g_compare_write_buf = write_buf;
+	g_compare_write_buf_len = sizeof(write_buf);
+	rc = spdk_bdev_comparev_and_writev_blocks(desc, ioch, &compare_iov, 1, &write_iov, 1,
+						  offset, num_blocks, io_done, NULL);
+	CU_ASSERT_EQUAL(rc, 0);
+	num_completed = stub_complete_io(1);
+	CU_ASSERT_EQUAL(num_completed, 1);
+	CU_ASSERT(g_io_done == false);
+	num_completed = stub_complete_io(1);
+	CU_ASSERT_EQUAL(num_completed, 1);
+	CU_ASSERT(g_io_done == true);
+	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS);
+	CU_ASSERT(memcmp(write_buf, bb_buf, sizeof(write_buf)) == 0);
+
+	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, offset, num_blocks, 0);
+	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
+
+	g_io_done = false;
+	g_compare_read_buf = cc_buf;
+	g_compare_read_buf_len = sizeof(cc_buf);
+	memset(write_buf, 0, sizeof(write_buf));
+	g_compare_write_buf = write_buf;
+	g_compare_write_buf_len = sizeof(write_buf);
+	rc = spdk_bdev_comparev_and_writev_blocks(desc, ioch, &compare_iov, 1, &write_iov, 1,
+						  offset, num_blocks, io_done, NULL);
+	CU_ASSERT_EQUAL(rc, 0);
+	num_completed = stub_complete_io(1);
+	CU_ASSERT_EQUAL(num_completed, 1);
+	CU_ASSERT(g_io_done == true);
+	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_MISCOMPARE);
+	num_completed = stub_complete_io(1);
+	CU_ASSERT_EQUAL(num_completed, 0);
+
+	spdk_put_io_channel(ioch);
+	spdk_bdev_close(desc);
+	free_bdev(bdev);
+	fn_table.submit_request = stub_submit_request;
+	spdk_bdev_finish(bdev_fini_cb, NULL);
+	poll_threads();
+
+	g_compare_read_buf = NULL;
+	g_compare_write_buf = NULL;
+}
+
 static void
 bdev_write_zeroes(void)
 {
@@ -2921,6 +3037,7 @@ main(int argc, char **argv)
 		CU_add_test(suite, "bdev_io_alignment", bdev_io_alignment) == NULL ||
 		CU_add_test(suite, "bdev_histograms", bdev_histograms) == NULL ||
 		CU_add_test(suite, "bdev_write_zeroes", bdev_write_zeroes) == NULL ||
+		CU_add_test(suite, "bdev_compare_and_write", bdev_compare_and_write) == NULL ||
 		CU_add_test(suite, "bdev_open_while_hotremove", bdev_open_while_hotremove) == NULL ||
 		CU_add_test(suite, "bdev_close_while_hotremove", bdev_close_while_hotremove) == NULL ||
 		CU_add_test(suite, "bdev_open_ext", bdev_open_ext) == NULL ||