rdma: Add DIF support for write operation
Update the transaction length with respect to metadata size.

Change buffer handling when DIF is enabled: add the function
nvmf_rdma_fill_buffers_with_md_interleave, which splits the SGL into
several parts with metadata blocks between them so that the RDMA
operation is performed with the appropriate offsets.

Add DIF generation before executing the bdev IO operation.

Add parsing of the DifInsertOrStrip config parameter.

Since the number of entries in the SG list is limited to 16, the
current approach limits the maximum transaction size depending on the
data block size. For example, with a 512-byte data block size the
maximum transaction size is 512 * 16 = 8192 bytes.

In addition, the IO buffer size (the IOUnitSize config parameter)
should be aligned to the metadata size for better performance, since
metadata is treated as part of this buffer. For example, if the
initiator uses a transaction size of 4096 and the data block size on
the NVMe disk is 512, then the IO buffer size should be aligned to
(512 + 8), which is 4160. Otherwise an extra IO buffer is consumed,
which increases the number of entries in the SGL and in the iov.

Change-Id: I7ad2270fe9dcceb114ece34675eac44e5783a0d5
Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Signed-off-by: Sasha Kotchubievsky <sashakot@mellanox.com>
Signed-off-by: Evgenii Kochetov <evgeniik@mellanox.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/465248
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Seth Howell <seth.howell@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
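The size arithmetic in the message above can be illustrated with a small standalone sketch. This is only an illustration of the numbers quoted in the commit message; apart from the 16-entry SG list limit (SPDK_NVMF_MAX_SGL_ENTRIES in the patch), the 512-byte block size and the 8-byte DIF metadata size, every name below is local to the example and not an SPDK identifier:

#include <stdint.h>
#include <stdio.h>

/* Illustrative example only: EXAMPLE_MAX_SGL_ENTRIES mirrors the 16-entry
 * SG list limit referred to in the commit message; all other names are
 * local to this sketch. */
#define EXAMPLE_MAX_SGL_ENTRIES 16

int main(void)
{
	uint32_t data_block_size = 512; /* data block size on the NVMe disk */
	uint32_t md_size = 8;           /* DIF metadata size per block */
	uint32_t io_size = 4096;        /* transaction size used by the initiator */

	/* Each SGE carries at most one data block, so 16 entries cap the
	 * transaction at data_block_size * 16 = 8192 bytes. */
	uint32_t max_transaction = data_block_size * EXAMPLE_MAX_SGL_ENTRIES;

	/* With interleaved metadata every block occupies
	 * (data_block_size + md_size) bytes of the IO buffer, which is why
	 * IOUnitSize should be aligned to 512 + 8 = 520: a 4096-byte IO
	 * expands to 8 * 520 = 4160 bytes. */
	uint32_t num_blocks = io_size / data_block_size;
	uint32_t extended_length = num_blocks * (data_block_size + md_size);

	printf("max transaction size: %u bytes\n", max_transaction);
	printf("extended length of a %u-byte IO: %u bytes\n", io_size, extended_length);
	return 0;
}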
commit 1bc5710a9e (parent a77217feb8)
@@ -1602,12 +1602,12 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t
 	spdk_json_write_named_uint32(w, "max_aq_depth", opts->max_aq_depth);
 	spdk_json_write_named_uint32(w, "num_shared_buffers", opts->num_shared_buffers);
 	spdk_json_write_named_uint32(w, "buf_cache_size", opts->buf_cache_size);
+	spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip);
 	if (type == SPDK_NVME_TRANSPORT_RDMA) {
 		spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth);
 		spdk_json_write_named_bool(w, "no_srq", opts->no_srq);
 	} else if (type == SPDK_NVME_TRANSPORT_TCP) {
 		spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success);
-		spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip);
 		spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority);
 	}

lib/nvmf/rdma.c (173 changed lines)
@@ -267,6 +267,11 @@ struct spdk_nvmf_rdma_request {
 	uint32_t			num_outstanding_data_wr;
 	uint64_t			receive_tsc;

+	struct spdk_dif_ctx		dif_ctx;
+	bool				dif_insert_or_strip;
+	uint32_t			elba_length;
+	uint32_t			orig_length;
+
 	STAILQ_ENTRY(spdk_nvmf_rdma_request)	state_link;
 };

@@ -503,6 +508,9 @@ struct spdk_nvmf_rdma_transport {
 	TAILQ_HEAD(, spdk_nvmf_rdma_port)	ports;
 };

+static inline void
+spdk_nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);
+
 static inline int
 spdk_nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
 {
@@ -1495,6 +1503,93 @@ nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
 	return 0;
 }

+/*
+ * Fills iov and SGL, iov[i] points to buffer[i], SGE[i] is limited in length to data block size
+ * and points to part of buffer
+ */
+static int
+nvmf_rdma_fill_buffers_with_md_interleave(struct spdk_nvmf_rdma_transport *rtransport,
+					  struct spdk_nvmf_rdma_poll_group *rgroup,
+					  struct spdk_nvmf_rdma_device *device,
+					  struct spdk_nvmf_request *req,
+					  struct ibv_send_wr *wr,
+					  uint32_t length,
+					  uint32_t data_block_size,
+					  uint32_t md_size)
+{
+	uint32_t remaining_length = length;
+	uint32_t remaining_io_buffer_length;
+	uint32_t remaining_data_block = data_block_size;
+	uint32_t offset = 0;
+	uint32_t sge_len;
+	uint64_t translation_len;
+	struct iovec *iovec;
+	struct ibv_sge *sg_list;
+	uint32_t lkey = 0;
+
+	wr->num_sge = 0;
+
+	while (remaining_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
+		iovec = &req->iov[req->iovcnt];
+		iovec->iov_base = (void *)((uintptr_t)(req->buffers[req->iovcnt] + NVMF_DATA_BUFFER_MASK)
+					   & ~NVMF_DATA_BUFFER_MASK);
+		iovec->iov_len = spdk_min(remaining_length, rtransport->transport.opts.io_unit_size);
+		remaining_io_buffer_length = iovec->iov_len - offset;
+		translation_len = iovec->iov_len;
+
+		if (!g_nvmf_hooks.get_rkey) {
+			lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, (uint64_t)iovec->iov_base,
+					&translation_len))->lkey;
+		} else {
+			lkey = spdk_mem_map_translate(device->map, (uint64_t)iovec->iov_base, &translation_len);
+		}
+		/* This is a very rare case that can occur when using DPDK version < 19.05 */
+		if (spdk_unlikely(translation_len < iovec->iov_len)) {
+			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
+			if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
+				return -ENOMEM;
+			}
+			continue;
+		}
+
+		req->iovcnt++;
+
+		while (remaining_io_buffer_length && wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES) {
+			sg_list = &wr->sg_list[wr->num_sge];
+			sg_list->addr = (uintptr_t)((char *) iovec->iov_base + offset);
+			sge_len = spdk_min(remaining_io_buffer_length, remaining_data_block);
+			sg_list->length = sge_len;
+			sg_list->lkey = lkey;
+			remaining_io_buffer_length -= sge_len;
+			remaining_data_block -= sge_len;
+			offset += sge_len;
+			wr->num_sge++;
+
+			if (remaining_data_block == 0) {
+				/* skip metadata */
+				offset += md_size;
+				/* Metadata that do not fit this IO buffer will be included in the next IO buffer */
+				remaining_io_buffer_length -= spdk_min(remaining_io_buffer_length, md_size);
+				remaining_data_block = data_block_size;
+			}
+
+			if (remaining_io_buffer_length == 0) {
+				/* By subtracting the size of the last IOV from the offset, we ensure that we skip
+				   the remaining metadata bits at the beginning of the next buffer */
+				offset -= iovec->iov_len;
+			}
+		}
+		remaining_length -= iovec->iov_len;
+	}
+
+	if (remaining_length) {
+		SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int
 nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
 		       struct spdk_nvmf_rdma_poll_group *rgroup,
@@ -1544,7 +1639,8 @@ nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
 static int
 spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
 				 struct spdk_nvmf_rdma_device *device,
-				 struct spdk_nvmf_rdma_request *rdma_req)
+				 struct spdk_nvmf_rdma_request *rdma_req,
+				 uint32_t length)
 {
 	struct spdk_nvmf_rdma_qpair *rqpair;
 	struct spdk_nvmf_rdma_poll_group *rgroup;
@@ -1557,7 +1653,7 @@ spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
 	rgroup = rqpair->poller->group;
 	req->iovcnt = 0;

-	num_buffers = SPDK_CEIL_DIV(req->length, rtransport->transport.opts.io_unit_size);
+	num_buffers = SPDK_CEIL_DIV(length, rtransport->transport.opts.io_unit_size);

 	if (spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport,
 					  num_buffers)) {
@@ -1566,8 +1662,18 @@ spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,

 	req->iovcnt = 0;

-	rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, req, &rdma_req->data.wr,
-				    req->length);
+	if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
+		rc = nvmf_rdma_fill_buffers_with_md_interleave(rtransport,
+				rgroup,
+				device,
+				&rdma_req->req,
+				&rdma_req->data.wr,
+				length,
+				rdma_req->dif_ctx.block_size - rdma_req->dif_ctx.md_size,
+				rdma_req->dif_ctx.md_size);
+	} else {
+		rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, req, &rdma_req->data.wr, length);
+	}
 	if (rc != 0) {
 		goto err_exit;
 	}
@@ -1695,6 +1801,7 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 	struct spdk_nvme_cpl *rsp;
 	struct spdk_nvme_sgl_descriptor *sgl;
 	int rc;
+	uint32_t length;

 	cmd = &rdma_req->req.cmd->nvme_cmd;
 	rsp = &rdma_req->req.rsp->nvme_cpl;
@@ -1703,9 +1810,11 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 	if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
 	    (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
 	     sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
-		if (sgl->keyed.length > rtransport->transport.opts.max_io_size) {
+
+		length = sgl->keyed.length;
+		if (length > rtransport->transport.opts.max_io_size) {
 			SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
-				    sgl->keyed.length, rtransport->transport.opts.max_io_size);
+				    length, rtransport->transport.opts.max_io_size);
 			rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
 			return -1;
 		}
@@ -1719,9 +1828,15 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 #endif

 		/* fill request length and populate iovs */
-		rdma_req->req.length = sgl->keyed.length;
+		rdma_req->req.length = length;
+
+		if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
+			rdma_req->orig_length = length;
+			length = spdk_dif_get_length_with_md(length, &rdma_req->dif_ctx);
+			rdma_req->elba_length = length;
+		}

-		if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
+		if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length) < 0) {
 			/* No available buffers. Queue this request up. */
 			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
 			return 0;
@@ -1731,7 +1846,6 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 		rdma_req->req.data = rdma_req->req.iov[0].iov_base;

 		/* rdma wr specifics */
-		rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
 		rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
 		rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;
 		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
@@ -1831,6 +1945,10 @@ nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
 	rdma_req->req.data = NULL;
 	rdma_req->rsp.wr.next = NULL;
 	rdma_req->data.wr.next = NULL;
+	rdma_req->dif_insert_or_strip = false;
+	rdma_req->elba_length = 0;
+	rdma_req->orig_length = 0;
+	memset(&rdma_req->dif_ctx, 0, sizeof(rdma_req->dif_ctx));
 	rqpair->qd--;

 	STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
@@ -1850,6 +1968,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 	enum spdk_nvmf_rdma_request_state prev_state;
 	bool progress = false;
 	int data_posted;
+	uint32_t num_blocks;

 	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
 	device = rqpair->port->device;
@@ -1895,6 +2014,10 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 			break;
 		}

+		if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->dif_ctx))) {
+			rdma_req->dif_insert_or_strip = true;
+		}
+
 		/* The next state transition depends on the data transfer needs of this request. */
 		rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req);

@@ -1982,6 +2105,28 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 	case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
 		spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
 				  (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+
+		if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
+			if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+				/* generate DIF for write operation */
+				num_blocks = SPDK_CEIL_DIV(rdma_req->elba_length, rdma_req->dif_ctx.block_size);
+				assert(num_blocks > 0);
+
+				rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt,
+						       num_blocks, &rdma_req->dif_ctx);
+				if (rc != 0) {
+					SPDK_ERRLOG("DIF generation failed\n");
+					rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+					spdk_nvmf_rdma_start_disconnect(rqpair);
+					break;
+				}
+			}
+
+			assert(rdma_req->elba_length >= rdma_req->req.length);
+			/* set extended length before IO operation */
+			rdma_req->req.length = rdma_req->elba_length;
+		}
+
 		rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
 		spdk_nvmf_request_exec(&rdma_req->req);
 		break;
@@ -2000,6 +2145,10 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 		} else {
 			rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
 		}
+		if (spdk_unlikely(rdma_req->dif_insert_or_strip)) {
+			/* restore the original length */
+			rdma_req->req.length = rdma_req->orig_length;
+		}
 		break;
 	case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
 		spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0,
@@ -2081,7 +2230,8 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 #define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES)
 #define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095
 #define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32
-#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false;
+#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
+#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false

 static void
 spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
@@ -2095,7 +2245,8 @@ spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
 	opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS;
 	opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE;
 	opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
-	opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ
+	opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
+	opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
 }

 const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {
@@ -598,6 +598,9 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
 		opts.c2h_success = bval;
 	}

+	bval = spdk_conf_section_get_boolval(ctx->sp, "DifInsertOrStrip", false);
+	opts.dif_insert_or_strip = bval;
+
 	transport = spdk_nvmf_transport_create(trtype, &opts);
 	if (transport) {
 		spdk_nvmf_tgt_add_transport(g_spdk_nvmf_tgt, transport, spdk_nvmf_tgt_add_transport_done, ctx);
@@ -76,6 +76,8 @@ DEFINE_STUB_V(spdk_nvmf_request_exec, (struct spdk_nvmf_request *req));
 DEFINE_STUB(spdk_nvme_transport_id_compare, int, (const struct spdk_nvme_transport_id *trid1,
 		const struct spdk_nvme_transport_id *trid2), 0);
 DEFINE_STUB_V(spdk_nvmf_ctrlr_abort_aer, (struct spdk_nvmf_ctrlr *ctrlr));
+DEFINE_STUB(spdk_nvmf_request_get_dif_ctx, bool, (struct spdk_nvmf_request *req,
+		struct spdk_dif_ctx *dif_ctx), false);

 void
 spdk_nvmf_request_free_buffers(struct spdk_nvmf_request *req,
@@ -154,6 +156,9 @@ static void reset_nvmf_rdma_request(struct spdk_nvmf_rdma_request *rdma_req)
 	rdma_req->data.wr.num_sge = 0;
 	rdma_req->data.wr.wr.rdma.remote_addr = 0;
 	rdma_req->data.wr.wr.rdma.rkey = 0;
+	rdma_req->elba_length = 0;
+	rdma_req->orig_length = 0;
+	rdma_req->dif_insert_or_strip = false;

 	for (i = 0; i < SPDK_NVMF_MAX_SGL_ENTRIES; i++) {
 		rdma_req->req.iov[i].iov_base = 0;
@@ -170,7 +175,7 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 {
 	struct spdk_nvmf_rdma_transport rtransport;
 	struct spdk_nvmf_rdma_device device;
-	struct spdk_nvmf_rdma_request rdma_req;
+	struct spdk_nvmf_rdma_request rdma_req = {};
 	struct spdk_nvmf_rdma_recv recv;
 	struct spdk_nvmf_rdma_poll_group group;
 	struct spdk_nvmf_rdma_qpair rqpair;