From 9fb6947617c3d3fe2f9edd5b9813db42c26e02b6 Mon Sep 17 00:00:00 2001
From: zkhatami88
Date: Mon, 6 Aug 2018 16:27:13 -0700
Subject: [PATCH] nvme: Add mechanism to override RDMA pd/mr behavior

Add a mechanism to modify the RDMA transport's behavior when creating
protection domains and registering memory. This is entirely optional.

Change-Id: I7cd850e76a673bf5521ca4815b779c53ab9567e8
Signed-off-by: zkhatami88
Reviewed-on: https://review.gerrithub.io/421415
Tested-by: SPDK CI Jenkins
Reviewed-by: Ben Walker
Reviewed-by: Jim Harris
---
 include/spdk/nvme.h  |  55 +++++++++++++++++++++
 lib/nvme/nvme_rdma.c | 113 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 140 insertions(+), 28 deletions(-)

diff --git a/include/spdk/nvme.h b/include/spdk/nvme.h
index 196ac954ad..625243debf 100644
--- a/include/spdk/nvme.h
+++ b/include/spdk/nvme.h
@@ -44,6 +44,7 @@
 extern "C" {
 #endif
 
+#include "spdk/config.h"
 #include "spdk/env.h"
 #include "spdk/nvme_spec.h"
 #include "spdk/nvmf_spec.h"
@@ -2038,6 +2039,60 @@ void spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr,
 		struct spdk_nvme_qpair *qpair,
 		uint8_t opc);
 
+#ifdef SPDK_CONFIG_RDMA
+struct ibv_context;
+struct ibv_pd;
+struct ibv_mr;
+
+/**
+ * RDMA Transport Hooks
+ */
+struct spdk_nvme_rdma_hooks {
+	/**
+	 * \brief Get a transport id specific context to be passed to
+	 * the other hooks.
+	 *
+	 * \param trid the transport id
+	 *
+	 * \return ctx to be passed to the other hooks
+	 */
+	void *(*get_ctx)(const struct spdk_nvme_transport_id *trid);
+
+	/**
+	 * \brief Get an InfiniBand Verbs protection domain.
+	 *
+	 * \param ctx Context returned from get_ctx.
+	 * \param verbs Infiniband verbs context
+	 *
+	 * \return pd of the nvme ctrlr
+	 */
+	struct ibv_pd *(*get_ibv_pd)(void *ctx, struct ibv_context *verbs);
+
+	/**
+	 * \brief Get an InfiniBand Verbs memory region for a buffer.
+	 *
+	 * \param ctx Context returned from get_ctx.
+	 * \param buf Memory buffer for which an rkey should be returned.
+	 * \param size size of buf
+	 *
+	 * \return Infiniband remote key (rkey) for this buf
+	 */
+	uint64_t (*get_rkey)(void *ctx, void *buf, size_t size);
+};
+
+/**
+ * \brief Set the global hooks for the RDMA transport, if necessary.
+ *
+ * This call is optional and must be performed prior to probing for
+ * any devices. By default, the RDMA transport will use the ibverbs
+ * library to create protection domains and register memory. This
+ * is a mechanism to subvert that and use an existing registration.
+ *
+ * \param hooks for initializing global hooks
+ */
+void spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks);
+
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/lib/nvme/nvme_rdma.c b/lib/nvme/nvme_rdma.c
index b356e3a16e..30560c176f 100644
--- a/lib/nvme/nvme_rdma.c
+++ b/lib/nvme/nvme_rdma.c
@@ -71,6 +71,8 @@ struct spdk_nvmf_cmd {
 	struct spdk_nvme_sgl_descriptor	sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
 };
 
+struct spdk_nvme_rdma_hooks g_nvme_hooks = {};
+
 /* Mapping from virtual address to ibv_mr pointer for a protection domain */
 struct spdk_nvme_rdma_mr_map {
 	struct ibv_pd			*pd;
@@ -82,6 +84,10 @@ struct spdk_nvme_rdma_mr_map {
 /* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
 struct nvme_rdma_ctrlr {
 	struct spdk_nvme_ctrlr		ctrlr;
+
+	struct spdk_nvme_rdma_hooks	hooks;
+	void				*hook_ctx;
+	struct ibv_pd			*pd;
 };
 
 /* NVMe RDMA qpair extensions for spdk_nvme_qpair */
@@ -241,6 +247,7 @@ nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
 {
 	int rc;
 	struct ibv_qp_init_attr	attr;
+	struct nvme_rdma_ctrlr	*rctrlr;
 
 	rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0);
 	if (!rqpair->cq) {
@@ -248,6 +255,13 @@ nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
 		return -1;
 	}
 
+	rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
+	if (rctrlr->hooks.get_ibv_pd) {
+		rctrlr->pd = rctrlr->hooks.get_ibv_pd(rctrlr->hook_ctx, rqpair->cm_id->verbs);
+	} else {
+		rctrlr->pd = NULL;
+	}
+
 	memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
 	attr.qp_type		= IBV_QPT_RC;
 	attr.send_cq		= rqpair->cq;
@@ -257,11 +271,12 @@ nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
 	attr.cap.max_send_sge	= NVME_RDMA_DEFAULT_TX_SGE;
 	attr.cap.max_recv_sge	= NVME_RDMA_DEFAULT_RX_SGE;
 
-	rc = rdma_create_qp(rqpair->cm_id, NULL, &attr);
+	rc = rdma_create_qp(rqpair->cm_id, rctrlr->pd, &attr);
 	if (rc) {
 		SPDK_ERRLOG("rdma_create_qp failed\n");
 		return -1;
 	}
+	rctrlr->pd = rqpair->cm_id->qp->pd;
 
 	rqpair->cm_id->context = &rqpair->qpair;
 
@@ -611,29 +626,38 @@ nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map,
 			enum spdk_mem_map_notify_action action,
 			void *vaddr, size_t size)
 {
-	struct ibv_pd *pd = cb_ctx;
+	struct nvme_rdma_ctrlr *rctrlr = cb_ctx;
+	struct ibv_pd *pd;
 	struct ibv_mr *mr;
 	int rc;
 
 	switch (action) {
 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
-		mr = ibv_reg_mr(pd, vaddr, size,
-				IBV_ACCESS_LOCAL_WRITE |
-				IBV_ACCESS_REMOTE_READ |
-				IBV_ACCESS_REMOTE_WRITE);
-		if (mr == NULL) {
-			SPDK_ERRLOG("ibv_reg_mr() failed\n");
-			return -EFAULT;
+		if (!rctrlr->hooks.get_rkey) {
+			pd = rctrlr->pd;
+			mr = ibv_reg_mr(pd, vaddr, size,
+					IBV_ACCESS_LOCAL_WRITE |
+					IBV_ACCESS_REMOTE_READ |
+					IBV_ACCESS_REMOTE_WRITE);
+			if (mr == NULL) {
+				SPDK_ERRLOG("ibv_reg_mr() failed\n");
+				return -EFAULT;
+			} else {
+				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
+			}
 		} else {
-			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
+			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
+							  rctrlr->hooks.get_rkey(rctrlr->hook_ctx, vaddr, size));
 		}
 		break;
 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
-		mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
-		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
-		if (mr) {
-			ibv_dereg_mr(mr);
+		if (!rctrlr->hooks.get_rkey) {
+			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
+			if (mr) {
+				ibv_dereg_mr(mr);
+			}
 		}
+		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
 		break;
 	default:
 		SPDK_UNREACHABLE();
@@ -673,7 +697,8 @@ nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair)
 
 	mr_map->ref = 1;
 	mr_map->pd = pd;
-	mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd);
+	mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops,
+					 nvme_rdma_ctrlr(rqpair->qpair.ctrlr));
 	if (mr_map->map == NULL) {
 		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
 		free(mr_map);
@@ -918,9 +943,21 @@ nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair,
 	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
 
 	requested_size = req->payload_size;
-	mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)payload,
-			&requested_size);
-	if (mr == NULL || requested_size < req->payload_size) {
+	if (!nvme_rdma_ctrlr(rqpair->qpair.ctrlr)->hooks.get_rkey) {
+
+		mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)payload,
+				&requested_size);
+		if (mr == NULL) {
+			return -1;
+		}
+		req->cmd.dptr.sgl1.keyed.key = mr->rkey;
+	} else {
+		req->cmd.dptr.sgl1.keyed.key = spdk_mem_map_translate(rqpair->mr_map->map,
+					       (uint64_t)payload,
+					       &requested_size);
+	}
+
+	if (requested_size < req->payload_size) {
 		return -1;
 	}
 
@@ -937,7 +974,6 @@ nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair,
 	req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
 	req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
 	req->cmd.dptr.sgl1.keyed.length = req->payload_size;
-	req->cmd.dptr.sgl1.keyed.key = mr->rkey;
 	req->cmd.dptr.sgl1.address = (uint64_t)payload;
 
 	return 0;
@@ -977,17 +1013,27 @@ nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair,
 
 		sge_length = spdk_min(remaining_size, sge_length);
 		mr_length = sge_length;
-		mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr,
-				&mr_length);
+		if (!nvme_rdma_ctrlr(rqpair->qpair.ctrlr)->hooks.get_rkey) {
+			mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map,
+					(uint64_t)virt_addr,
+					&mr_length);
+			if (mr == NULL) {
+				return -1;
+			}
+			cmd->sgl[num_sgl_desc].keyed.key = mr->rkey;
+		} else {
+			cmd->sgl[num_sgl_desc].keyed.key = spdk_mem_map_translate(rqpair->mr_map->map,
+							   (uint64_t)virt_addr,
+							   &mr_length);
+		}
 
-		if (mr == NULL || mr_length < sge_length) {
+		if (mr_length < sge_length) {
 			return -1;
 		}
 
 		cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
 		cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
 		cmd->sgl[num_sgl_desc].keyed.length = sge_length;
-		cmd->sgl[num_sgl_desc].keyed.key = mr->rkey;
 		cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr;
 
 		remaining_size -= sge_length;
@@ -1017,11 +1063,11 @@ nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair,
 		 * the NVMe command.
 		 */
 		rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
-		req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
-		req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
-		req->cmd.dptr.sgl1.keyed.length = req->payload_size;
-		req->cmd.dptr.sgl1.keyed.key = mr->rkey;
-		req->cmd.dptr.sgl1.address = rqpair->cmds[rdma_req->id].sgl[0].address;
+		req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type;
+		req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype;
+		req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length;
+		req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key;
+		req->cmd.dptr.sgl1.address = cmd->sgl[0].address;
 	} else {
 		/*
 		 * Otherwise, The SGL descriptor embedded in the command must point to the list of
@@ -1363,6 +1409,11 @@ struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transpo
 
 	nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs);
 
+	if (g_nvme_hooks.get_ctx) {
+		rctrlr->hooks = g_nvme_hooks;
+		rctrlr->hook_ctx = rctrlr->hooks.get_ctx(&rctrlr->ctrlr.trid);
+	}
+
 	SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n");
 	return &rctrlr->ctrlr;
 }
@@ -1632,3 +1683,9 @@ nvme_rdma_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, siz
 {
 	return 0;
 }
+
+void
+spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
+{
+	g_nvme_hooks = *hooks;
+}
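
Usage sketch: the example below shows one way an application could wire up the new hooks before
probing, assuming SPDK is built with RDMA support (SPDK_CONFIG_RDMA). Only struct
spdk_nvme_rdma_hooks and spdk_nvme_rdma_init_hooks() come from this patch; the app_rdma_ctx state,
the app_* callbacks, and the assumption that every I/O buffer lies inside a single pre-registered
ibv_mr are hypothetical and exist only to illustrate the flow.

#include <infiniband/verbs.h>

#include "spdk/nvme.h"

/* Hypothetical application state: a protection domain and a single memory
 * region that the application created and registered on its own. */
struct app_rdma_ctx {
	struct ibv_pd	*pd;
	struct ibv_mr	*mr;
};

static struct app_rdma_ctx g_app_ctx;

static void *
app_get_ctx(const struct spdk_nvme_transport_id *trid)
{
	/* A single shared context; a real application could look up
	 * per-target state keyed on trid instead. */
	(void)trid;
	return &g_app_ctx;
}

static struct ibv_pd *
app_get_ibv_pd(void *ctx, struct ibv_context *verbs)
{
	struct app_rdma_ctx *app = ctx;

	/* Hand back the application's existing protection domain instead of
	 * letting the transport create one per controller. */
	(void)verbs;
	return app->pd;
}

static uint64_t
app_get_rkey(void *ctx, void *buf, size_t size)
{
	struct app_rdma_ctx *app = ctx;

	/* Assumes buf lies entirely inside the pre-registered region. */
	(void)buf;
	(void)size;
	return app->mr->rkey;
}

static struct spdk_nvme_rdma_hooks g_app_hooks = {
	.get_ctx	= app_get_ctx,
	.get_ibv_pd	= app_get_ibv_pd,
	.get_rkey	= app_get_rkey,
};

int
main(void)
{
	/* Must run before spdk_nvme_probe()/spdk_nvme_connect(). */
	spdk_nvme_rdma_init_hooks(&g_app_hooks);

	/* ... usual spdk_env_init() and spdk_nvme_probe() flow follows ... */
	return 0;
}

With hooks installed, nvme_rdma_qpair_init() passes the pd returned by get_ibv_pd() to
rdma_create_qp(), and nvme_rdma_mr_map_notify() skips ibv_reg_mr()/ibv_dereg_mr(), storing the
value returned by get_rkey() directly in the memory map.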