From 62700dac2e33e7b6b0ba93b77c5c42030086b3b0 Mon Sep 17 00:00:00 2001
From: Seth Howell
Date: Wed, 10 Oct 2018 14:24:17 -0700
Subject: [PATCH] nvmf/rdma: Add support for multiple sgl descriptors to sgl
 parser

Enable parsing an nvmf request that contains an inline
nvme_sgl_last_segment_descriptor element. This is the next step
towards NVMe-oF SGL support in the NVMe-oF target.

Change-Id: Ia2f1f7054e0de8a9e2bfe4dabe6af4085e3f12c4
Signed-off-by: Seth Howell
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/428745
Tested-by: SPDK CI Jenkins
Reviewed-by: Jim Harris
Reviewed-by: Ben Walker
---
 lib/nvmf/rdma.c                     | 178 +++++++++++++++++++++++++++-
 test/unit/lib/nvmf/rdma.c/rdma_ut.c |   2 +
 2 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c
index 30a3826288..8326f7c4b4 100644
--- a/lib/nvmf/rdma.c
+++ b/lib/nvmf/rdma.c
@@ -584,6 +584,46 @@ spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair,
 	return 0;
 }
 
+static void
+nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
+			    struct spdk_nvmf_rdma_transport *rtransport)
+{
+	struct spdk_nvmf_rdma_request_data *current_data_wr = NULL, *next_data_wr = NULL;
+	struct ibv_send_wr *send_wr;
+	int i;
+
+	rdma_req->num_outstanding_data_wr = 0;
+	current_data_wr = &rdma_req->data;
+	for (i = 0; i < current_data_wr->wr.num_sge; i++) {
+		current_data_wr->wr.sg_list[i].addr = 0;
+		current_data_wr->wr.sg_list[i].length = 0;
+		current_data_wr->wr.sg_list[i].lkey = 0;
+	}
+	current_data_wr->wr.num_sge = 0;
+
+	send_wr = current_data_wr->wr.next;
+	if (send_wr != NULL && send_wr != &rdma_req->rsp.wr) {
+		next_data_wr = SPDK_CONTAINEROF(send_wr, struct spdk_nvmf_rdma_request_data, wr);
+	}
+	while (next_data_wr) {
+		current_data_wr = next_data_wr;
+		send_wr = current_data_wr->wr.next;
+		if (send_wr != NULL && send_wr != &rdma_req->rsp.wr) {
+			next_data_wr = SPDK_CONTAINEROF(send_wr, struct spdk_nvmf_rdma_request_data, wr);
+		} else {
+			next_data_wr = NULL;
+		}
+
+		for (i = 0; i < current_data_wr->wr.num_sge; i++) {
+			current_data_wr->wr.sg_list[i].addr = 0;
+			current_data_wr->wr.sg_list[i].length = 0;
+			current_data_wr->wr.sg_list[i].lkey = 0;
+		}
+		current_data_wr->wr.num_sge = 0;
+		spdk_mempool_put(rtransport->data_wr_pool, current_data_wr);
+	}
+}
+
 static void
 nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
 {
@@ -1032,8 +1072,8 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
 	assert(rqpair->current_recv_depth > 0);
 	rqpair->current_recv_depth--;
 
-	/* Build the response which consists of an optional
-	 * RDMA WRITE to transfer data, plus an RDMA SEND
+	/* Build the response which consists of optional
+	 * RDMA WRITEs to transfer data, plus an RDMA SEND
 	 * containing the response.
 	 */
 	send_wr = &rdma_req->rsp.wr;
@@ -1326,6 +1366,46 @@ spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
 	return xfer;
 }
 
+static int
+nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
+		       struct spdk_nvmf_rdma_request *rdma_req,
+		       uint32_t num_sgl_descriptors)
+{
+	struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
+	struct spdk_nvmf_rdma_request_data *current_data_wr;
+	uint32_t i;
+
+	if (spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, num_sgl_descriptors)) {
+		return -ENOMEM;
+	}
+
+	current_data_wr = &rdma_req->data;
+
+	for (i = 0; i < num_sgl_descriptors; i++) {
+		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+			current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
+		} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+			current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
+		} else {
+			assert(false);
+		}
+		work_requests[i]->wr.send_flags = IBV_SEND_SIGNALED;
+		work_requests[i]->wr.sg_list = work_requests[i]->sgl;
+		work_requests[i]->wr.wr_id = rdma_req->data.wr.wr_id;
+		current_data_wr->wr.next = &work_requests[i]->wr;
+		current_data_wr = work_requests[i];
+	}
+
+	if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+		current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
+		current_data_wr->wr.next = &rdma_req->rsp.wr;
+	} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+		current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
+		current_data_wr->wr.next = NULL;
+	}
+	return 0;
+}
+
 static int
 nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
 		       struct spdk_nvmf_rdma_poll_group *rgroup,
@@ -1423,6 +1503,83 @@ err_exit:
 	return rc;
 }
 
+static int
+nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport,
+				      struct spdk_nvmf_rdma_device *device,
+				      struct spdk_nvmf_rdma_request *rdma_req)
+{
+	struct spdk_nvmf_rdma_qpair *rqpair;
+	struct spdk_nvmf_rdma_poll_group *rgroup;
+	struct ibv_send_wr *current_wr;
+	struct spdk_nvmf_request *req = &rdma_req->req;
+	struct spdk_nvme_sgl_descriptor *inline_segment, *desc;
+	uint32_t num_sgl_descriptors;
+	uint32_t i;
+	int rc;
+
+	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
+	rgroup = rqpair->poller->group;
+
+	inline_segment = &req->cmd->nvme_cmd.dptr.sgl1;
+	assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT);
+	assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET);
+
+	desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
+	num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor);
+	assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES);
+
+	if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) {
+		return -ENOMEM;
+	}
+
+	/* The first WR must always be the embedded data WR. This is how we unwind them later. */
+	current_wr = &rdma_req->data.wr;
+
+	req->iovcnt = 0;
+	for (i = 0; i < num_sgl_descriptors; i++) {
+		/* The descriptors must be keyed data block descriptors with an address, not an offset. */
+		if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK ||
+				  desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) {
+			rc = -EINVAL;
+			goto err_exit;
+		}
+
+		current_wr->num_sge = 0;
+		req->length += desc->keyed.length;
+
+		rc = nvmf_rdma_fill_buffers(rtransport, rgroup, device, rdma_req, current_wr,
+					    desc->keyed.length);
+		if (rc != 0) {
+			rc = -ENOMEM;
+			goto err_exit;
+		}
+
+		current_wr->wr.rdma.rkey = desc->keyed.key;
+		current_wr->wr.rdma.remote_addr = desc->address;
+		current_wr = current_wr->next;
+		desc++;
+	}
+
+#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
+	if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
+		if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
+			rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
+			rdma_req->rsp.wr.imm_data = desc->keyed.key;
+		}
+	}
+#endif
+
+	rdma_req->num_outstanding_data_wr = num_sgl_descriptors;
+	rdma_req->data_from_pool = true;
+
+	return 0;
+
+err_exit:
+	spdk_nvmf_rdma_request_free_buffers(rdma_req, &rgroup->group, &rtransport->transport);
+	nvmf_rdma_request_free_data(rdma_req, rtransport);
+	return rc;
+}
+
 static int
 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 				 struct spdk_nvmf_rdma_device *device,
@@ -1517,6 +1674,20 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 		rdma_req->req.iov[0].iov_len = rdma_req->req.length;
 		rdma_req->req.iovcnt = 1;
 
+		return 0;
+	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT &&
+		   sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
+		if (nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req) < 0) {
+			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
+			return 0;
+		}
+
+		/* backward compatible */
+		rdma_req->req.data = rdma_req->req.iov[0].iov_base;
+
+		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
+			      rdma_req->req.iovcnt);
+
 		return 0;
 	}
 
@@ -1539,10 +1710,11 @@ nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
 		spdk_nvmf_rdma_request_free_buffers(rdma_req, &rgroup->group, &rtransport->transport);
 	}
 
-	rdma_req->num_outstanding_data_wr = 0;
+	nvmf_rdma_request_free_data(rdma_req, rtransport);
 	rdma_req->req.length = 0;
 	rdma_req->req.iovcnt = 0;
 	rdma_req->req.data = NULL;
+	rdma_req->data.wr.next = NULL;
 	rqpair->qd--;
 
 	STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
diff --git a/test/unit/lib/nvmf/rdma.c/rdma_ut.c b/test/unit/lib/nvmf/rdma.c/rdma_ut.c
index 7bd79b5c60..9af654cf9a 100644
--- a/test/unit/lib/nvmf/rdma.c/rdma_ut.c
+++ b/test/unit/lib/nvmf/rdma.c/rdma_ut.c
@@ -139,6 +139,8 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	rdma_req.req.xfer = SPDK_NVME_DATA_CONTROLLER_TO_HOST;
 
 	rtransport.transport.opts = g_rdma_ut_transport_opts;
+	rtransport.data_wr_pool = NULL;
+	rtransport.transport.data_buf_pool = NULL;
 
 	device.attr.device_cap_flags = 0;
 	g_rdma_mr.lkey = 0xABCD;
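
For context, the capsule layout the new parser consumes can be shown
with a short sketch. This is a minimal illustration under assumptions,
not code from this patch: the helper build_multi_sgl_capsule() and its
parameters are hypothetical, while struct spdk_nvme_sgl_descriptor,
struct spdk_nvme_cmd, and the SPDK_NVME_SGL_* constants are the SPDK
definitions referenced in the diff above. The host places an unkeyed
last-segment descriptor in dptr.sgl1 (subtype "offset", pointing into
the in-capsule data area) and an array of keyed data block descriptors
at that offset; nvmf_rdma_request_fill_iovs_multi_sgl() then walks one
work request per descriptor.

#include <stdint.h>
#include <string.h>
#include "spdk/nvme_spec.h"

/* Hypothetical initiator-side sketch: describe two remote data regions
 * with the inline last-segment SGL that the new parser accepts. */
static void
build_multi_sgl_capsule(struct spdk_nvme_cmd *cmd, uint8_t *in_capsule_data,
			uint64_t addr[2], uint32_t len[2], uint32_t rkey[2])
{
	struct spdk_nvme_sgl_descriptor desc[2];
	int i;

	memset(desc, 0, sizeof(desc));
	for (i = 0; i < 2; i++) {
		/* Keyed data block with an address: the only type/subtype
		 * combination the parser accepts inside the segment. */
		desc[i].generic.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
		desc[i].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
		desc[i].keyed.length = len[i];
		desc[i].keyed.key = rkey[i];
		desc[i].address = addr[i];
	}
	/* The descriptor list travels as in-capsule data; offset 0 means
	 * it starts at the beginning of that area. */
	memcpy(in_capsule_data, desc, sizeof(desc));

	/* sgl1 carries an unkeyed last-segment descriptor: address is the
	 * offset of the list, length is the list size in bytes. */
	memset(&cmd->dptr.sgl1, 0, sizeof(cmd->dptr.sgl1));
	cmd->dptr.sgl1.generic.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
	cmd->dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
	cmd->dptr.sgl1.address = 0;
	cmd->dptr.sgl1.unkeyed.length = sizeof(desc);
}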
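
The work request chain that nvmf_request_alloc_wrs() builds, and that
nvmf_rdma_request_free_data() later unwinds, has this shape for a
request with N descriptors (one embedded WR plus N-1 pool WRs):

  C2H (RDMA WRITE): rdma_req->data.wr -> pool wr[0] -> ... -> pool wr[N-2] -> rdma_req->rsp.wr
  H2C (RDMA READ):  rdma_req->data.wr -> pool wr[0] -> ... -> pool wr[N-2] -> NULL

A minimal sketch of why that shape matters, under assumptions: qp is
the connection's ibv_qp, and struct spdk_nvmf_rdma_request is assumed
visible here (in the tree it is private to lib/nvmf/rdma.c). Because
the WRs are pre-linked, the whole transfer, plus the response SEND in
the controller-to-host case, can be handed to the NIC in one post:

#include <infiniband/verbs.h>

static int
post_request_chain(struct ibv_qp *qp, struct spdk_nvmf_rdma_request *rdma_req)
{
	struct ibv_send_wr *bad_wr = NULL;

	/* data.wr.next already points at the pool WRs and terminates at
	 * rsp.wr (C2H) or NULL (H2C), so one ibv_post_send() submits the
	 * entire chain. */
	return ibv_post_send(qp, &rdma_req->data.wr, &bad_wr);
}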