bdev: add fallback from write_zeroes to writev

If write_zeroes is not supported by the block device, we can get the
same behavior by simply writing a buffer full of zeroes to the blocks
we want to erase. This change also incorporates splitting into the bdev
layer to accommodate large I/O.

Change-Id: I8fa1bfaaf22d7bfc6e3afb6e89d22fa9f7767e55
Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/373829
Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com>
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
This commit is contained in:
Seth Howell 2017-07-28 15:34:24 -07:00 committed by Daniel Verkamp
parent 5f5edbcb8c
commit 3fd7f28dc9
3 changed files with 241 additions and 23 deletions

View File

@ -271,6 +271,12 @@ struct spdk_bdev_io {
/** Status for the IO */
int16_t status;
/** number of blocks remaining in a split i/o */
uint64_t split_remaining_num_blocks;
/** current offset of the split I/O in the bdev */
uint64_t split_current_offset_blocks;
/**
* Set to true while the bdev module submit_request function is in progress.
*
@ -337,6 +343,9 @@ struct spdk_bdev_io {
/** User function that will be called when this completes */
spdk_bdev_io_completion_cb cb;
/** stored user callback in case we split the I/O and use a temporary callback */
spdk_bdev_io_completion_cb stored_user_cb;
/** Context that will be passed to the completion callback */
void *caller_ctx;

View File

@ -58,6 +58,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
#define BUF_SMALL_POOL_SIZE 8192
#define BUF_LARGE_POOL_SIZE 1024
#define NOMEM_THRESHOLD_COUNT 8
#define ZERO_BUFFER_SIZE 0x100000
typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;
@ -67,6 +68,8 @@ struct spdk_bdev_mgr {
struct spdk_mempool *buf_small_pool;
struct spdk_mempool *buf_large_pool;
void *zero_buffer;
TAILQ_HEAD(, spdk_bdev_module_if) bdev_modules;
TAILQ_HEAD(, spdk_bdev) bdevs;
@ -150,6 +153,8 @@ struct spdk_bdev_channel {
};
/*
 * Forward declaration: spdk_bdev_write_zeroes_blocks() installs this function
 * as the temporary completion callback when a write_zeroes request has to be
 * emulated with several regular writes, and it is defined further down.
 */
static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
struct spdk_bdev *
spdk_bdev_first(void)
{
@ -527,6 +532,14 @@ spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg,
return;
}
g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
NULL);
if (!g_bdev_mgr.zero_buffer) {
SPDK_ERRLOG("create bdev zero buffer failed\n");
spdk_bdev_init_complete(-1);
return;
}
#ifdef SPDK_CONFIG_VTUNE
g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif
@ -579,6 +592,7 @@ spdk_bdev_finish(void)
spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
spdk_mempool_free(g_bdev_mgr.buf_small_pool);
spdk_mempool_free(g_bdev_mgr.buf_large_pool);
spdk_dma_free(g_bdev_mgr.zero_buffer);
spdk_io_device_unregister(&g_bdev_mgr, NULL);
}
@ -1088,26 +1102,60 @@ spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channe
struct spdk_bdev *bdev = desc->bdev;
struct spdk_bdev_io *bdev_io;
struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
uint64_t len;
bool split_request = false;
if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
SPDK_ERRLOG("length argument out of range in write_zeroes\n");
return -ERANGE;
}
if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
return -EINVAL;
}
bdev_io = spdk_bdev_get_io();
if (!bdev_io) {
SPDK_ERRLOG("bdev_io memory allocation failed duing write_zeroes\n");
return -ENOMEM;
}
bdev_io->ch = channel;
bdev_io->u.bdev.iovs = NULL;
bdev_io->u.bdev.iovcnt = 0;
bdev_io->u.bdev.num_blocks = num_blocks;
bdev_io->u.bdev.offset_blocks = offset_blocks;
bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
bdev_io->u.bdev.num_blocks = num_blocks;
bdev_io->u.bdev.iovs = NULL;
bdev_io->u.bdev.iovcnt = 0;
} else {
assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);
len = spdk_bdev_get_block_size(bdev) * num_blocks;
if (len > ZERO_BUFFER_SIZE) {
split_request = true;
len = ZERO_BUFFER_SIZE;
}
bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
bdev_io->u.bdev.iov.iov_len = len;
bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
bdev_io->u.bdev.iovcnt = 1;
bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
bdev_io->split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
bdev_io->split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
}
if (split_request) {
bdev_io->stored_user_cb = cb;
spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
} else {
spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
}
spdk_bdev_io_submit(bdev_io);
return 0;
}
@ -1948,6 +1996,35 @@ spdk_bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_
spdk_bdev_free_io(bdev_io);
}
/*
 * Temporary completion callback used while a write_zeroes request is being
 * emulated with a sequence of regular writes sourced from the shared zero
 * buffer. Each invocation means the previous chunk has completed; on success
 * the next chunk (at most ZERO_BUFFER_SIZE bytes) is set up and submitted.
 *
 * bdev_io  the I/O being reused for every chunk of the split request
 * success  completion status of the chunk that just finished
 * cb_arg   the caller's context; forwarded unchanged to the next chunk and,
 *          eventually, to the caller's own completion callback
 */
static void
spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	uint64_t block_size;
	uint64_t len;

	if (!success) {
		/*
		 * This chunk has already been completed (that is why we are running),
		 * and this function is still installed as the I/O's completion
		 * callback. Completing the I/O again here would route the failure
		 * back into this function instead of to the caller, so restore the
		 * caller's callback and report the failure to it directly.
		 * NOTE(review): assumes the caller's callback releases the bdev_io,
		 * as it would for a non-split request - confirm.
		 */
		bdev_io->cb = bdev_io->stored_user_cb;
		bdev_io->cb(bdev_io, false, cb_arg);
		return;
	}

	/*
	 * No need to repeat the range checks from write_zeroes_blocks: the whole
	 * request already passed them before the first chunk was submitted.
	 */
	block_size = spdk_bdev_get_block_size(bdev_io->bdev);
	len = spdk_min(block_size * bdev_io->split_remaining_num_blocks, ZERO_BUFFER_SIZE);

	/* Describe the next chunk, starting where the previous one ended. */
	bdev_io->u.bdev.offset_blocks = bdev_io->split_current_offset_blocks;
	bdev_io->u.bdev.iov.iov_len = len;
	bdev_io->u.bdev.num_blocks = len / block_size;

	/* Advance the split bookkeeping past the chunk about to be submitted. */
	bdev_io->split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
	bdev_io->split_current_offset_blocks += bdev_io->u.bdev.num_blocks;

	/* If this round completes the I/O, hand completion back to the original user callback. */
	if (bdev_io->split_remaining_num_blocks == 0) {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->stored_user_cb);
	} else {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
	}

	spdk_bdev_io_submit(bdev_io);
}
void
spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io)
{

View File

@ -209,6 +209,21 @@ __blockdev_write(void *arg1, void *arg2)
}
}
/*
 * Issues the write_zeroes request described by arg1 (a struct bdevio_request)
 * against the request's target. arg2 is unused. If the submission itself is
 * rejected, the test is marked as failed and the unit-test thread is woken,
 * since no completion callback will arrive.
 */
static void
__blockdev_write_zeroes(void *arg1, void *arg2)
{
	struct bdevio_request *request = arg1;
	struct io_target *tgt = request->target;

	if (spdk_bdev_write_zeroes(tgt->bdev_desc, tgt->ch, request->offset,
				   request->data_len, quick_test_complete, NULL) != 0) {
		g_completion_success = false;
		wake_ut_thread();
	}
}
static void
sgl_chop_buffer(struct bdevio_request *req, int iov_len)
{
@ -250,6 +265,22 @@ blockdev_write(struct io_target *target, char *tx_buf,
execute_spdk_function(__blockdev_write, &req, NULL);
}
static void
blockdev_write_zeroes(struct io_target *target, char *tx_buf,
uint64_t offset, int data_len)
{
struct bdevio_request req;
req.target = target;
req.buf = tx_buf;
req.data_len = data_len;
req.offset = offset;
g_completion_success = false;
execute_spdk_function(__blockdev_write_zeroes, &req, NULL);
}
static void
__blockdev_read(void *arg1, void *arg2)
{
@ -303,7 +334,7 @@ blockdev_write_read_data_match(char *rx_buf, char *tx_buf, int data_length)
static void
blockdev_write_read(uint32_t data_length, uint32_t iov_len, int pattern, uint64_t offset,
int expected_rc)
int expected_rc, bool write_zeroes)
{
struct io_target *target;
char *tx_buf = NULL;
@ -312,22 +343,30 @@ blockdev_write_read(uint32_t data_length, uint32_t iov_len, int pattern, uint64_
target = g_io_targets;
while (target != NULL) {
if (data_length < spdk_bdev_get_block_size(target->bdev)) {
if (data_length < spdk_bdev_get_block_size(target->bdev) ||
data_length / spdk_bdev_get_block_size(target->bdev) > spdk_bdev_get_num_blocks(target->bdev)) {
target = target->next;
continue;
}
initialize_buffer(&tx_buf, pattern, data_length);
initialize_buffer(&rx_buf, 0, data_length);
if (!write_zeroes) {
initialize_buffer(&tx_buf, pattern, data_length);
initialize_buffer(&rx_buf, 0, data_length);
blockdev_write(target, tx_buf, offset, data_length, iov_len);
} else {
initialize_buffer(&tx_buf, 0, data_length);
initialize_buffer(&rx_buf, pattern, data_length);
blockdev_write_zeroes(target, tx_buf, offset, data_length);
}
blockdev_write(target, tx_buf, offset, data_length, iov_len);
if (expected_rc == 0) {
CU_ASSERT_EQUAL(g_completion_success, true);
} else {
CU_ASSERT_EQUAL(g_completion_success, false);
}
blockdev_read(target, rx_buf, offset, data_length, iov_len);
if (expected_rc == 0) {
@ -364,7 +403,96 @@ blockdev_write_read_4k(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}
/*
 * write_zeroes followed by a read of a single 4 KiB region at offset 0.
 */
static void
blockdev_write_zeroes_read_4k(void)
{
	/* Data size = 4K */
	const uint32_t data_length = 4 * 1024;
	const uint64_t offset = 0;
	/* In write_zeroes mode the receive buffer is pre-filled with this pattern. */
	const int pattern = 0xA3;
	/* All parameters are valid, so write_zeroes and read are
	 * expected to succeed on every blockdev. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}
/*
 * write_zeroes followed by a read of 1 MiB at offset 0. The request is not
 * larger than the bdev layer's zero buffer, so the bdev layer does not need
 * to split it.
 */
static void
blockdev_write_zeroes_read_1m(void)
{
	/* Data size = 1M */
	const uint32_t data_length = 1024 * 1024;
	const uint64_t offset = 0;
	/* In write_zeroes mode the receive buffer is pre-filled with this pattern. */
	const int pattern = 0xA3;
	/* All parameters are valid, so write_zeroes and read are
	 * expected to succeed on every blockdev. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}
/*
 * write_zeroes followed by a read of 3 MiB at offset 0. When the bdev does
 * not support write-zeroes natively, the bdev layer has to split this request
 * into several writes.
 */
static void
blockdev_write_zeroes_read_3m(void)
{
	/* Data size = 3M */
	const uint32_t data_length = 3 * 1024 * 1024;
	const uint64_t offset = 0;
	/* In write_zeroes mode the receive buffer is pre-filled with this pattern. */
	const int pattern = 0xA3;
	/* All parameters are valid, so write_zeroes and read are
	 * expected to succeed on every blockdev. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}
/*
 * write_zeroes followed by a read of 3.5 MiB at offset 0. When the bdev does
 * not support write-zeroes natively, the bdev layer has to split this request;
 * the size is deliberately not an even multiple of the bdev layer's zero
 * buffer size, so the final chunk is a partial one.
 */
static void
blockdev_write_zeroes_read_3m_500k(void)
{
	/* Data size = 3.5M */
	const uint32_t data_length = (3 * 1024 * 1024) + (512 * 1024);
	const uint64_t offset = 0;
	/* In write_zeroes mode the receive buffer is pre-filled with this pattern. */
	const int pattern = 0xA3;
	/* All parameters are valid, so write_zeroes and read are
	 * expected to succeed on every blockdev. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}
static void
@ -385,7 +513,7 @@ blockdev_writev_readv_4k(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}
static void
@ -406,7 +534,7 @@ blockdev_writev_readv_30x4k(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}
static void
@ -426,7 +554,7 @@ blockdev_write_read_512Bytes(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}
static void
@ -447,7 +575,7 @@ blockdev_writev_readv_512Bytes(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}
static void
@ -467,7 +595,7 @@ blockdev_write_read_size_gt_128k(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}
static void
@ -488,7 +616,7 @@ blockdev_writev_readv_size_gt_128k(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}
static void
@ -509,7 +637,7 @@ blockdev_writev_readv_size_gt_128k_two_iov(void)
* of write and read for all blockdevs is 0. */
expected_rc = 0;
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}
static void
@ -529,7 +657,7 @@ blockdev_write_read_invalid_size(void)
* of write and read for all blockdevs is < 0 */
expected_rc = -1;
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}
static void
@ -628,7 +756,7 @@ blockdev_write_read_max_offset(void)
* of write and read for all blockdevs is < 0 */
expected_rc = -1;
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}
static void
@ -649,7 +777,7 @@ blockdev_overlapped_write_read_8k(void)
expected_rc = 0;
/* Assert the write by comparing it with values read
* from the same offset for each blockdev */
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
/* Overwrite the pattern 0xbb of size 8K on an address offset overlapping
* with the address written above and assert the new value in
@ -660,7 +788,7 @@ blockdev_overlapped_write_read_8k(void)
offset = 4096;
/* Assert the write by comparing it with values read
* from the overlapped offset for each blockdev */
blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}
static void
@ -741,6 +869,10 @@ __run_ut_thread(void *arg1, void *arg2)
if (
CU_add_test(suite, "blockdev write read 4k", blockdev_write_read_4k) == NULL
|| CU_add_test(suite, "blockdev write zeroes read 4k", blockdev_write_zeroes_read_4k) == NULL
|| CU_add_test(suite, "blockdev write zeroes read 1m", blockdev_write_zeroes_read_1m) == NULL
|| CU_add_test(suite, "blockdev write zeroes read 3m", blockdev_write_zeroes_read_3m) == NULL
|| CU_add_test(suite, "blockdev write zeroes read 3.5m", blockdev_write_zeroes_read_3m_500k) == NULL
|| CU_add_test(suite, "blockdev write read 512 bytes",
blockdev_write_read_512Bytes) == NULL
|| CU_add_test(suite, "blockdev write read size > 128k",