rdma: Add DIF support for write operation

Update transaction length with respect to metadata size
Change buffer handling when DIF is enabled: add function nvmf_rdma_fill_buffers_with_md_interleave, which splits the SGL into several parts with metadata blocks between them so that the RDMA operation is performed at the appropriate offsets
Add DIF generation before executing bdev IO operation
Add parsing of DifInsertOrStrip config parameter.
Since there is a limitation on the number of entries in the SG list (16), the current approach limits the maximum transaction size, which depends on the data block size. E.g. if the data block size is 512 bytes, then the maximum transaction size is 512 * 16 = 8192 bytes.
In addition, the size of the IO buffer (IOUnitSize conf param) should be aligned to the extended block size (data block size + metadata size) for better performance, since metadata is treated as part of this buffer. E.g. if the initiator uses a transaction size of 4096 and the data block size on the NVMe disk is 512, then the IO buffer size should be aligned to (512 + 8), i.e. 4160. Otherwise an extra IO buffer will be consumed, which increases the number of entries in the SGL and in the iov.
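
Not part of the commit itself: a minimal sketch, in plain C, of the sizing arithmetic described above, assuming a 512-byte data block, 8 bytes of metadata per block, a 4096-byte transaction from the initiator, and the 16-entry SG list limit (SPDK_NVMF_MAX_SGL_ENTRIES in the transport code). The names and values here are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define MAX_SGL_ENTRIES 16 /* mirrors the 16-entry SG list limit mentioned above */

int main(void)
{
	uint32_t data_block_size = 512; /* data block size on the NVMe disk */
	uint32_t md_size = 8;           /* metadata (DIF) size per block */
	uint32_t io_size = 4096;        /* transaction size used by the initiator */

	/* With at most one data block per SG entry, the largest transaction is
	 * 512 * 16 = 8192 bytes. */
	uint32_t max_transaction_size = data_block_size * MAX_SGL_ENTRIES;

	/* IOUnitSize aligned to the extended block size:
	 * 4096 / 512 = 8 blocks, 8 * (512 + 8) = 4160 bytes. */
	uint32_t num_blocks = io_size / data_block_size;
	uint32_t aligned_io_unit_size = num_blocks * (data_block_size + md_size);

	printf("max transaction size: %u bytes, aligned IOUnitSize: %u bytes\n",
	       max_transaction_size, aligned_io_unit_size);
	return 0;
}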

Change-Id: I7ad2270fe9dcceb114ece34675eac44e5783a0d5
Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Signed-off-by: Sasha Kotchubievsky <sashakot@mellanox.com>
Signed-off-by: Evgenii Kochetov <evgeniik@mellanox.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/465248
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Seth Howell <seth.howell@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Alexey Marchuk 2019-07-29 15:30:32 +00:00 committed by Jim Harris
parent a77217feb8
commit 1bc5710a9e
4 changed files with 172 additions and 13 deletions


@@ -1602,12 +1602,12 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t
spdk_json_write_named_uint32(w, "max_aq_depth", opts->max_aq_depth);
spdk_json_write_named_uint32(w, "num_shared_buffers", opts->num_shared_buffers);
spdk_json_write_named_uint32(w, "buf_cache_size", opts->buf_cache_size);
spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip);
if (type == SPDK_NVME_TRANSPORT_RDMA) {
spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth);
spdk_json_write_named_bool(w, "no_srq", opts->no_srq);
} else if (type == SPDK_NVME_TRANSPORT_TCP) {
spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success);
spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip);
spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority);
}


@@ -267,6 +267,11 @@ struct spdk_nvmf_rdma_request {
uint32_t num_outstanding_data_wr;
uint64_t receive_tsc;
struct spdk_dif_ctx dif_ctx;
bool dif_insert_or_strip;
uint32_t elba_length;
uint32_t orig_length;
STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link;
};
@@ -503,6 +508,9 @@ struct spdk_nvmf_rdma_transport {
TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
};
static inline void
spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);
static inline int
spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
@@ -1495,6 +1503,93 @@ nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
return 0;
}
/*
* Fills iov and SGL, iov[i] points to buffer[i], SGE[i] is limited in length to data block size
* and points to part of buffer
*/
static int
nvmf_rdma_fill_buffers_with_md_interleave(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poll_group *rgroup,
struct spdk_nvmf_rdma_device *device,
struct spdk_nvmf_request *req,
struct ibv_send_wr *wr,
uint32_t length,
uint32_t data_block_size,
uint32_t md_size)
{
uint32_t remaining_length = length;
uint32_t remaining_io_buffer_length;
uint32_t remaining_data_block = data_block_size;
uint32_t offset = 0;
uint32_t sge_len;
uint64_t translation_len;
struct iovec *iovec;
struct ibv_sge *sg_list;
uint32_t lkey = 0;
wr->num_sge = 0;
while (remaining_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
iovec = &req->iov[req->iovcnt];
iovec->iov_base = (void *)((uintptr_t)(req->buffers[req->iovcnt] + NVMF_DATA_BUFFER_MASK)
& ~NVMF_DATA_BUFFER_MASK);
iovec->iov_len = spdk_min(remaining_length, rtransport->transport.opts.io_unit_size);
remaining_io_buffer_length = iovec->iov_len - offset;
translation_len = iovec->iov_len;
if (!g_nvmf_hooks.get_rkey) {
lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, (uint64_t)iovec->iov_base,
&translation_len))->lkey;
} else {
lkey = spdk_mem_map_translate(device->map, (uint64_t)iovec->iov_base, &translation_len);
}
/* This is a very rare case that can occur when using DPDK version < 19.05 */
if (spdk_unlikely(translation_len < iovec->iov_len)) {
SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
return -ENOMEM;
}
continue;
}
req->iovcnt++;
while (remaining_io_buffer_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
sg_list = &wr->sg_list[wr->num_sge];
sg_list->addr = (uintptr_t)((char *) iovec->iov_base + offset);
sge_len = spdk_min(remaining_io_buffer_length, remaining_data_block);
sg_list->length = sge_len;
sg_list->lkey = lkey;
remaining_io_buffer_length -= sge_len;
remaining_data_block -= sge_len;
offset += sge_len;
wr->num_sge++;
if (remaining_data_block == 0) {
/* skip metadata */
offset += md_size;
/* Metadata that does not fit into this IO buffer will be included in the next IO buffer */
remaining_io_buffer_length -= spdk_min(remaining_io_buffer_length, md_size);
remaining_data_block = data_block_size;
}
if (remaining_io_buffer_length == 0) {
/* By subtracting the size of the last IOV from the offset, we ensure that we skip
the remaining metadata bits at the beginning of the next buffer */
offset -= iovec->iov_len;
}
}
remaining_length -= iovec->iov_len;
}
if (remaining_length) {
SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
return -EINVAL;
}
return 0;
}
static int
nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poll_group *rgroup,
@@ -1544,7 +1639,8 @@ nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
static int
spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_device *device,
struct spdk_nvmf_rdma_request *rdma_req)
struct spdk_nvmf_rdma_request *rdma_req,
uint32_t length)
{
struct spdk_nvmf_rdma_qpair *rqpair;
struct spdk_nvmf_rdma_poll_group *rgroup;
@@ -1557,7 +1653,7 @@ spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
rgroup = rqpair->poller->group;
req->iovcnt = 0;
num_buffers = SPDK_CEIL_DIV(req->length, rtransport->transport.opts.io_unit_size);
num_buffers = SPDK_CEIL_DIV(length, rtransport->transport.opts.io_unit_size);
if (spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport,
num_buffers)) {
@@ -1566,8 +1662,18 @@ spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
req->iovcnt = 0;
rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, req, &rdma_req->data.wr,
req->length);
if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
rc = nvmf_rdma_fill_buffers_with_md_interleave(rtransport,
rgroup,
device,
&rdma_req->req,
&rdma_req->data.wr,
length,
rdma_req->dif_ctx.block_size - rdma_req->dif_ctx.md_size,
rdma_req->dif_ctx.md_size);
} else {
rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, req, &rdma_req->data.wr, length);
}
if (rc != 0) {
goto err_exit;
}
@@ -1695,6 +1801,7 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvme_cpl *rsp;
struct spdk_nvme_sgl_descriptor *sgl;
int rc;
uint32_t length;
cmd = &rdma_req->req.cmd->nvme_cmd;
rsp = &rdma_req->req.rsp->nvme_cpl;
@@ -1703,9 +1810,11 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
(sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
if (sgl->keyed.length > rtransport->transport.opts.max_io_size) {
length = sgl->keyed.length;
if (length > rtransport->transport.opts.max_io_size) {
SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
sgl->keyed.length, rtransport->transport.opts.max_io_size);
length, rtransport->transport.opts.max_io_size);
rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -1;
}
@@ -1719,9 +1828,15 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
#endif
/* fill request length and populate iovs */
rdma_req->req.length = sgl->keyed.length;
rdma_req->req.length = length;
if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
rdma_req->orig_length = length;
length = spdk_dif_get_length_with_md(length, &rdma_req->dif_ctx);
rdma_req->elba_length = length;
}
if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length) < 0) {
/* No available buffers. Queue this request up. */
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
return 0;
@@ -1731,7 +1846,6 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
rdma_req->req.data = rdma_req->req.iov[0].iov_base;
/* rdma wr specifics */
rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;
if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
@@ -1831,6 +1945,10 @@ nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
rdma_req->req.data = NULL;
rdma_req->rsp.wr.next = NULL;
rdma_req->data.wr.next = NULL;
rdma_req->dif_insert_or_strip = false;
rdma_req->elba_length = 0;
rdma_req->orig_length = 0;
memset(&rdma_req->dif_ctx, 0, sizeof(rdma_req->dif_ctx));
rqpair->qd--;
STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
@@ -1850,6 +1968,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
enum spdk_nvmf_rdma_request_state prev_state;
bool progress = false;
int data_posted;
uint32_t num_blocks;
rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
device = rqpair->port->device;
@@ -1895,6 +2014,10 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
break;
}
if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->dif_ctx))) {
rdma_req->dif_insert_or_strip = true;
}
/* The next state transition depends on the data transfer needs of this request. */
rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req);
@@ -1982,6 +2105,28 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
(uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
/* generate DIF for write operation */
num_blocks = SPDK_CEIL_DIV(rdma_req->elba_length, rdma_req->dif_ctx.block_size);
assert(num_blocks > 0);
rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt,
num_blocks, &rdma_req->dif_ctx);
if (rc != 0) {
SPDK_ERRLOG("DIF generation failed\n");
rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
spdk_nvmf_rdma_start_disconnect(rqpair);
break;
}
}
assert(rdma_req->elba_length >= rdma_req->req.length);
/* set extended length before IO operation */
rdma_req->req.length = rdma_req->elba_length;
}
rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
spdk_nvmf_request_exec(&rdma_req->req);
break;
@@ -2000,6 +2145,10 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
} else {
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
}
if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
/* restore the original length */
rdma_req->req.length = rdma_req->orig_length;
}
break;
case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0,
@@ -2081,7 +2230,8 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
#define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES)
#define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095
#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32
#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false;
#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
static void
spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
@@ -2095,7 +2245,8 @@ spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS;
opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE;
opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ
opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
}
const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {


@@ -598,6 +598,9 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
opts.c2h_success = bval;
}
bval = spdk_conf_section_get_boolval(ctx->sp, "DifInsertOrStrip", false);
opts.dif_insert_or_strip = bval;
transport = spdk_nvmf_transport_create(trtype, &opts);
if (transport) {
spdk_nvmf_tgt_add_transport(g_spdk_nvmf_tgt, transport, spdk_nvmf_tgt_add_transport_done, ctx);


@@ -76,6 +76,8 @@ DEFINE_STUB_V(spdk_nvmf_request_exec, (struct spdk_nvmf_request *req));
DEFINE_STUB(spdk_nvme_transport_id_compare, int, (const struct spdk_nvme_transport_id *trid1,
const struct spdk_nvme_transport_id *trid2), 0);
DEFINE_STUB_V(spdk_nvmf_ctrlr_abort_aer, (struct spdk_nvmf_ctrlr *ctrlr));
DEFINE_STUB(spdk_nvmf_request_get_dif_ctx, bool, (struct spdk_nvmf_request *req,
struct spdk_dif_ctx *dif_ctx), false);
void
spdk_nvmf_request_free_buffers(struct spdk_nvmf_request *req,
@@ -154,6 +156,9 @@ static void reset_nvmf_rdma_request(struct spdk_nvmf_rdma_request *rdma_req)
rdma_req->data.wr.num_sge = 0;
rdma_req->data.wr.wr.rdma.remote_addr = 0;
rdma_req->data.wr.wr.rdma.rkey = 0;
rdma_req->elba_length = 0;
rdma_req->orig_length = 0;
rdma_req->dif_insert_or_strip = false;
for (i = 0; i < SPDK_NVMF_MAX_SGL_ENTRIES; i++) {
rdma_req->req.iov[i].iov_base = 0;
@@ -170,7 +175,7 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
{
struct spdk_nvmf_rdma_transport rtransport;
struct spdk_nvmf_rdma_device device;
struct spdk_nvmf_rdma_request rdma_req;
struct spdk_nvmf_rdma_request rdma_req = {};
struct spdk_nvmf_rdma_recv recv;
struct spdk_nvmf_rdma_poll_group group;
struct spdk_nvmf_rdma_qpair rqpair;