numam-spdk/lib/nvmf/rdma.c
Ben Walker 4a8b3adb44 nvmf: Simplify qpair disconnect code path
This path works for disconnect events on qpairs at run time.
Disconnects in response to killing the target have
not been worked out yet.

This path does not currently wait for outstanding I/O to
complete.

Change-Id: I8e476c8444b460c18e51601fb950b9132d12f67d
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.gerrithub.io/412076
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2018-05-30 17:38:26 +00:00

1929 lines
57 KiB
C

/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "spdk/stdinc.h"
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>
#include "nvmf_internal.h"
#include "transport.h"
#include "spdk/assert.h"
#include "spdk/io_channel.h"
#include "spdk/nvmf.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/trace.h"
#include "spdk/util.h"
#include "spdk_internal/log.h"
/*
RDMA Connection Resouce Defaults
*/
#define NVMF_DEFAULT_TX_SGE 1
#define NVMF_DEFAULT_RX_SGE 2
/* The RDMA completion queue size */
#define NVMF_RDMA_CQ_SIZE 4096
/* AIO backend requires block size aligned data buffers,
* extra 4KiB aligned data buffer should work for most devices.
*/
#define SHIFT_4KB 12
#define NVMF_DATA_BUFFER_ALIGNMENT (1 << SHIFT_4KB)
#define NVMF_DATA_BUFFER_MASK (NVMF_DATA_BUFFER_ALIGNMENT - 1)
enum spdk_nvmf_rdma_request_state {
/* The request is not currently in use */
RDMA_REQUEST_STATE_FREE = 0,
/* Initial state when request first received */
RDMA_REQUEST_STATE_NEW,
/* The request is queued until a data buffer is available. */
RDMA_REQUEST_STATE_NEED_BUFFER,
/* The request is waiting on RDMA queue depth availability
* to transfer data from the host to the controller.
*/
RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER,
/* The request is currently transferring data from the host to the controller. */
RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
/* The request is ready to execute at the block device */
RDMA_REQUEST_STATE_READY_TO_EXECUTE,
/* The request is currently executing at the block device */
RDMA_REQUEST_STATE_EXECUTING,
/* The request finished executing at the block device */
RDMA_REQUEST_STATE_EXECUTED,
/* The request is waiting on RDMA queue depth availability
* to transfer data from the controller to the host.
*/
RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST,
/* The request is ready to send a completion */
RDMA_REQUEST_STATE_READY_TO_COMPLETE,
/* The request currently has a completion outstanding */
RDMA_REQUEST_STATE_COMPLETING,
/* The request completed and can be marked free. */
RDMA_REQUEST_STATE_COMPLETED,
};
/* This structure holds commands as they are received off the wire.
* It must be dynamically paired with a full request object
* (spdk_nvmf_rdma_request) to service a request. It is separate
* from the request because RDMA does not appear to order
* completions, so occasionally we'll get a new incoming
* command when there aren't any free request objects.
*/
struct spdk_nvmf_rdma_recv {
struct ibv_recv_wr wr;
struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];
struct spdk_nvmf_rdma_qpair *qpair;
/* In-capsule data buffer */
uint8_t *buf;
TAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
};
struct spdk_nvmf_rdma_request {
struct spdk_nvmf_request req;
void *data_from_pool;
enum spdk_nvmf_rdma_request_state state;
struct spdk_nvmf_rdma_recv *recv;
struct {
struct ibv_send_wr wr;
struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
} rsp;
struct {
struct ibv_send_wr wr;
struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
} data;
TAILQ_ENTRY(spdk_nvmf_rdma_request) link;
};
struct spdk_nvmf_rdma_qpair {
struct spdk_nvmf_qpair qpair;
struct spdk_nvmf_rdma_port *port;
struct spdk_nvmf_rdma_poller *poller;
struct rdma_cm_id *cm_id;
/* The maximum number of I/O outstanding on this connection at one time */
uint16_t max_queue_depth;
/* The maximum number of active RDMA READ and WRITE operations at one time */
uint16_t max_rw_depth;
/* The current number of I/O outstanding on this connection. This number
* includes all I/O from the time the capsule is first received until it is
* completed.
*/
uint16_t cur_queue_depth;
/* The number of RDMA READ and WRITE requests that are outstanding */
uint16_t cur_rdma_rw_depth;
/* Receives that are waiting for a request object */
TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;
/* Requests that are not in use */
TAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue;
/* Requests that are waiting to perform an RDMA READ or WRITE */
TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_rw_queue;
/* Array of size "max_queue_depth" containing RDMA requests. */
struct spdk_nvmf_rdma_request *reqs;
/* Array of size "max_queue_depth" containing RDMA recvs. */
struct spdk_nvmf_rdma_recv *recvs;
/* Array of size "max_queue_depth" containing 64 byte capsules
* used for receive.
*/
union nvmf_h2c_msg *cmds;
struct ibv_mr *cmds_mr;
/* Array of size "max_queue_depth" containing 16 byte completions
* to be sent back to the user.
*/
union nvmf_c2h_msg *cpls;
struct ibv_mr *cpls_mr;
/* Array of size "max_queue_depth * InCapsuleDataSize" containing
* buffers to be used for in capsule data.
*/
void *bufs;
struct ibv_mr *bufs_mr;
TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link;
TAILQ_ENTRY(spdk_nvmf_rdma_qpair) pending_link;
/* Mgmt channel */
struct spdk_io_channel *mgmt_channel;
struct spdk_nvmf_rdma_mgmt_channel *ch;
};
struct spdk_nvmf_rdma_poller {
struct spdk_nvmf_rdma_device *device;
struct spdk_nvmf_rdma_poll_group *group;
struct ibv_cq *cq;
TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs;
TAILQ_ENTRY(spdk_nvmf_rdma_poller) link;
};
struct spdk_nvmf_rdma_poll_group {
struct spdk_nvmf_transport_poll_group group;
TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
};
/* Assuming rdma_cm uses just one protection domain per ibv_context. */
struct spdk_nvmf_rdma_device {
struct ibv_device_attr attr;
struct ibv_context *context;
struct spdk_mem_map *map;
struct ibv_pd *pd;
TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
};
struct spdk_nvmf_rdma_port {
struct spdk_nvme_transport_id trid;
struct rdma_cm_id *id;
struct spdk_nvmf_rdma_device *device;
uint32_t ref;
TAILQ_ENTRY(spdk_nvmf_rdma_port) link;
};
struct spdk_nvmf_rdma_transport {
struct spdk_nvmf_transport transport;
struct rdma_event_channel *event_channel;
struct spdk_mempool *data_buf_pool;
pthread_mutex_t lock;
uint16_t max_queue_depth;
uint32_t max_io_size;
uint32_t in_capsule_data_size;
TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
};
struct spdk_nvmf_rdma_mgmt_channel {
/* Requests that are waiting to obtain a data buffer */
TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue;
};
static int
spdk_nvmf_rdma_mgmt_channel_create(void *io_device, void *ctx_buf)
{
struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf;
TAILQ_INIT(&ch->pending_data_buf_queue);
return 0;
}
static void
spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf;
if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) {
SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n");
}
}
static void
spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
if (rqpair->poller) {
TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);
}
if (rqpair->cmds_mr) {
ibv_dereg_mr(rqpair->cmds_mr);
}
if (rqpair->cpls_mr) {
ibv_dereg_mr(rqpair->cpls_mr);
}
if (rqpair->bufs_mr) {
ibv_dereg_mr(rqpair->bufs_mr);
}
if (rqpair->cm_id) {
rdma_destroy_qp(rqpair->cm_id);
rdma_destroy_id(rqpair->cm_id);
}
if (rqpair->mgmt_channel) {
spdk_put_io_channel(rqpair->mgmt_channel);
}
/* Free all memory */
spdk_dma_free(rqpair->cmds);
spdk_dma_free(rqpair->cpls);
spdk_dma_free(rqpair->bufs);
free(rqpair->reqs);
free(rqpair->recvs);
free(rqpair);
}
static int
spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_qpair *rqpair;
int rc, i;
struct ibv_qp_init_attr attr;
struct spdk_nvmf_rdma_recv *rdma_recv;
struct spdk_nvmf_rdma_request *rdma_req;
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
attr.qp_type = IBV_QPT_RC;
attr.send_cq = rqpair->poller->cq;
attr.recv_cq = rqpair->poller->cq;
attr.cap.max_send_wr = rqpair->max_queue_depth * 2; /* SEND, READ, and WRITE operations */
attr.cap.max_recv_wr = rqpair->max_queue_depth; /* RECV operations */
attr.cap.max_send_sge = NVMF_DEFAULT_TX_SGE;
attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE;
rc = rdma_create_qp(rqpair->cm_id, NULL, &attr);
if (rc) {
SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno));
rdma_destroy_id(rqpair->cm_id);
rqpair->cm_id = NULL;
spdk_nvmf_rdma_qpair_destroy(rqpair);
return -1;
}
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);
rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs));
rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs));
rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds),
0x1000, NULL);
rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls),
0x1000, NULL);
rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth * rtransport->in_capsule_data_size,
0x1000, NULL);
if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds ||
!rqpair->cpls || !rqpair->bufs) {
SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
spdk_nvmf_rdma_qpair_destroy(rqpair);
return -1;
}
rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds,
rqpair->max_queue_depth * sizeof(*rqpair->cmds),
IBV_ACCESS_LOCAL_WRITE);
rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls,
rqpair->max_queue_depth * sizeof(*rqpair->cpls),
0);
rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs,
rqpair->max_queue_depth * rtransport->in_capsule_data_size,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE);
if (!rqpair->cmds_mr || !rqpair->cpls_mr || !rqpair->bufs_mr) {
SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n");
spdk_nvmf_rdma_qpair_destroy(rqpair);
return -1;
}
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
rqpair->bufs, rqpair->max_queue_depth * rtransport->in_capsule_data_size, rqpair->bufs_mr->lkey);
for (i = 0; i < rqpair->max_queue_depth; i++) {
struct ibv_recv_wr *bad_wr = NULL;
rdma_recv = &rqpair->recvs[i];
rdma_recv->qpair = rqpair;
/* Set up memory to receive commands */
rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i * rtransport->in_capsule_data_size));
rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i];
rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]);
rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey;
rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
rdma_recv->sgl[1].length = rtransport->in_capsule_data_size;
rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey;
rdma_recv->wr.wr_id = (uintptr_t)rdma_recv;
rdma_recv->wr.sg_list = rdma_recv->sgl;
rdma_recv->wr.num_sge = SPDK_COUNTOF(rdma_recv->sgl);
rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n");
spdk_nvmf_rdma_qpair_destroy(rqpair);
return -1;
}
}
for (i = 0; i < rqpair->max_queue_depth; i++) {
rdma_req = &rqpair->reqs[i];
rdma_req->req.qpair = &rqpair->qpair;
rdma_req->req.cmd = NULL;
/* Set up memory to send responses */
rdma_req->req.rsp = &rqpair->cpls[i];
rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i];
rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]);
rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey;
rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req;
rdma_req->rsp.wr.next = NULL;
rdma_req->rsp.wr.opcode = IBV_WR_SEND;
rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);
/* Set up memory for data buffers */
rdma_req->data.wr.wr_id = (uint64_t)rdma_req;
rdma_req->data.wr.next = NULL;
rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
rdma_req->data.wr.sg_list = rdma_req->data.sgl;
rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);
TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link);
}
return 0;
}
static int
request_transfer_in(struct spdk_nvmf_request *req)
{
int rc;
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_qpair *qpair;
struct spdk_nvmf_rdma_qpair *rqpair;
struct ibv_send_wr *bad_wr = NULL;
qpair = req->qpair;
rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
rqpair->cur_rdma_rw_depth++;
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair);
spdk_trace_record(TRACE_RDMA_READ_START, 0, 0, (uintptr_t)req, 0);
rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
rdma_req->data.wr.next = NULL;
rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Unable to transfer data from host to target\n");
/* Decrement r/w counter back since data transfer
* has not started.
*/
rqpair->cur_rdma_rw_depth--;
return -1;
}
return 0;
}
static int
request_transfer_out(struct spdk_nvmf_request *req)
{
int rc;
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_qpair *qpair;
struct spdk_nvmf_rdma_qpair *rqpair;
struct spdk_nvme_cpl *rsp;
struct ibv_recv_wr *bad_recv_wr = NULL;
struct ibv_send_wr *send_wr, *bad_send_wr = NULL;
qpair = req->qpair;
rsp = &req->rsp->nvme_cpl;
rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
/* Advance our sq_head pointer */
if (qpair->sq_head == qpair->sq_head_max) {
qpair->sq_head = 0;
} else {
qpair->sq_head++;
}
rsp->sqhd = qpair->sq_head;
/* Post the capsule to the recv buffer */
assert(rdma_req->recv != NULL);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv,
rqpair);
rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr);
if (rc) {
SPDK_ERRLOG("Unable to re-post rx descriptor\n");
return rc;
}
rdma_req->recv = NULL;
/* Build the response which consists of an optional
* RDMA WRITE to transfer data, plus an RDMA SEND
* containing the response.
*/
send_wr = &rdma_req->rsp.wr;
if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair);
spdk_trace_record(TRACE_RDMA_WRITE_START, 0, 0, (uintptr_t)req, 0);
rqpair->cur_rdma_rw_depth++;
rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;
rdma_req->data.wr.next = send_wr;
send_wr = &rdma_req->data.wr;
}
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair);
spdk_trace_record(TRACE_NVMF_IO_COMPLETE, 0, 0, (uintptr_t)req, 0);
/* Send the completion */
rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr);
if (rc) {
SPDK_ERRLOG("Unable to send response capsule\n");
if (rdma_req->data.wr.opcode == IBV_WR_RDMA_WRITE) {
/* Decrement r/w counter back since data transfer
* has not started.
*/
rqpair->cur_rdma_rw_depth--;
}
}
return rc;
}
static int
spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
{
struct spdk_nvmf_rdma_accept_private_data accept_data;
struct rdma_conn_param ctrlr_event_data = {};
int rc;
accept_data.recfmt = 0;
accept_data.crqsize = rqpair->max_queue_depth;
ctrlr_event_data.private_data = &accept_data;
ctrlr_event_data.private_data_len = sizeof(accept_data);
if (id->ps == RDMA_PS_TCP) {
ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
ctrlr_event_data.initiator_depth = rqpair->max_rw_depth;
}
rc = rdma_accept(id, &ctrlr_event_data);
if (rc) {
SPDK_ERRLOG("Error %d on rdma_accept\n", errno);
} else {
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n");
}
return rc;
}
static void
spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
{
struct spdk_nvmf_rdma_reject_private_data rej_data;
rej_data.recfmt = 0;
rej_data.sts = error;
rdma_reject(id, &rej_data, sizeof(rej_data));
}
static int
nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event,
new_qpair_fn cb_fn)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_qpair *rqpair = NULL;
struct spdk_nvmf_rdma_port *port;
struct rdma_conn_param *rdma_param = NULL;
const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
uint16_t max_queue_depth;
uint16_t max_rw_depth;
rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */
rdma_param = &event->param.conn;
if (rdma_param->private_data == NULL ||
rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
SPDK_ERRLOG("connect request: no private data provided\n");
spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
return -1;
}
private_data = rdma_param->private_data;
if (private_data->recfmt != 0) {
SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
return -1;
}
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
event->id->verbs->device->name, event->id->verbs->device->dev_name);
port = event->listen_id->context;
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
event->listen_id, event->listen_id->verbs, port);
/* Figure out the supported queue depth. This is a multi-step process
* that takes into account hardware maximums, host provided values,
* and our target's internal memory limits */
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");
/* Start with the maximum queue depth allowed by the target */
max_queue_depth = rtransport->max_queue_depth;
max_rw_depth = rtransport->max_queue_depth;
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", rtransport->max_queue_depth);
/* Next check the local NIC's hardware limitations */
SPDK_DEBUGLOG(SPDK_LOG_RDMA,
"Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom);
/* Next check the remote NIC's hardware limitations */
SPDK_DEBUGLOG(SPDK_LOG_RDMA,
"Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
rdma_param->initiator_depth, rdma_param->responder_resources);
if (rdma_param->initiator_depth > 0) {
max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth);
}
/* Finally check for the host software requested values, which are
* optional. */
if (rdma_param->private_data != NULL &&
rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
}
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
max_queue_depth, max_rw_depth);
rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
if (rqpair == NULL) {
SPDK_ERRLOG("Could not allocate new connection.\n");
spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
return -1;
}
rqpair->port = port;
rqpair->max_queue_depth = max_queue_depth;
rqpair->max_rw_depth = max_rw_depth;
rqpair->cm_id = event->id;
rqpair->qpair.transport = transport;
TAILQ_INIT(&rqpair->incoming_queue);
TAILQ_INIT(&rqpair->free_queue);
TAILQ_INIT(&rqpair->pending_rdma_rw_queue);
event->id->context = &rqpair->qpair;
cb_fn(&rqpair->qpair);
return 0;
}
static int
nvmf_rdma_disconnect(struct rdma_cm_event *evt)
{
struct spdk_nvmf_qpair *qpair;
if (evt->id == NULL) {
SPDK_ERRLOG("disconnect request: missing cm_id\n");
return -1;
}
qpair = evt->id->context;
if (qpair == NULL) {
SPDK_ERRLOG("disconnect request: no active connection\n");
return -1;
}
/* ack the disconnect event before rdma_destroy_id */
rdma_ack_cm_event(evt);
spdk_nvmf_ctrlr_disconnect(qpair);
return 0;
}
#ifdef DEBUG
static const char *CM_EVENT_STR[] = {
"RDMA_CM_EVENT_ADDR_RESOLVED",
"RDMA_CM_EVENT_ADDR_ERROR",
"RDMA_CM_EVENT_ROUTE_RESOLVED",
"RDMA_CM_EVENT_ROUTE_ERROR",
"RDMA_CM_EVENT_CONNECT_REQUEST",
"RDMA_CM_EVENT_CONNECT_RESPONSE",
"RDMA_CM_EVENT_CONNECT_ERROR",
"RDMA_CM_EVENT_UNREACHABLE",
"RDMA_CM_EVENT_REJECTED",
"RDMA_CM_EVENT_ESTABLISHED",
"RDMA_CM_EVENT_DISCONNECTED",
"RDMA_CM_EVENT_DEVICE_REMOVAL",
"RDMA_CM_EVENT_MULTICAST_JOIN",
"RDMA_CM_EVENT_MULTICAST_ERROR",
"RDMA_CM_EVENT_ADDR_CHANGE",
"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};
#endif /* DEBUG */
static int
spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
enum spdk_mem_map_notify_action action,
void *vaddr, size_t size)
{
struct spdk_nvmf_rdma_device *device = cb_ctx;
struct ibv_pd *pd = device->pd;
struct ibv_mr *mr;
switch (action) {
case SPDK_MEM_MAP_NOTIFY_REGISTER:
mr = ibv_reg_mr(pd, vaddr, size,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE);
if (mr == NULL) {
SPDK_ERRLOG("ibv_reg_mr() failed\n");
return -1;
} else {
spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
}
break;
case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr);
spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
if (mr) {
ibv_dereg_mr(mr);
}
break;
}
return 0;
}
typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t;
static spdk_nvme_data_transfer_t
spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
{
enum spdk_nvme_data_transfer xfer;
struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd;
struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;
/* Figure out data transfer direction */
if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype);
} else {
xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
/* Some admin commands are special cases */
if ((rdma_req->req.qpair->qid == 0) &&
((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) ||
(cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) {
switch (cmd->cdw10 & 0xff) {
case SPDK_NVME_FEAT_LBA_RANGE_TYPE:
case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
case SPDK_NVME_FEAT_HOST_IDENTIFIER:
break;
default:
xfer = SPDK_NVME_DATA_NONE;
}
}
}
if (xfer == SPDK_NVME_DATA_NONE) {
return xfer;
}
/* Even for commands that may transfer data, they could have specified 0 length.
* We want those to show up with xfer SPDK_NVME_DATA_NONE.
*/
switch (sgl->generic.type) {
case SPDK_NVME_SGL_TYPE_DATA_BLOCK:
case SPDK_NVME_SGL_TYPE_BIT_BUCKET:
case SPDK_NVME_SGL_TYPE_SEGMENT:
case SPDK_NVME_SGL_TYPE_LAST_SEGMENT:
case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK:
if (sgl->unkeyed.length == 0) {
xfer = SPDK_NVME_DATA_NONE;
}
break;
case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK:
if (sgl->keyed.length == 0) {
xfer = SPDK_NVME_DATA_NONE;
}
break;
}
return xfer;
}
static int
spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_device *device,
struct spdk_nvmf_rdma_request *rdma_req)
{
struct spdk_nvme_cmd *cmd;
struct spdk_nvme_cpl *rsp;
struct spdk_nvme_sgl_descriptor *sgl;
cmd = &rdma_req->req.cmd->nvme_cmd;
rsp = &rdma_req->req.rsp->nvme_cpl;
sgl = &cmd->dptr.sgl1;
if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
(sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
if (sgl->keyed.length > rtransport->max_io_size) {
SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
sgl->keyed.length, rtransport->max_io_size);
rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -1;
}
rdma_req->req.length = sgl->keyed.length;
rdma_req->data_from_pool = spdk_mempool_get(rtransport->data_buf_pool);
if (!rdma_req->data_from_pool) {
/* No available buffers. Queue this request up. */
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
return 0;
}
/* AIO backend requires block size aligned data buffers,
* 4KiB aligned data buffer should work for most devices.
*/
rdma_req->req.data = (void *)((uintptr_t)(rdma_req->data_from_pool + NVMF_DATA_BUFFER_MASK)
& ~NVMF_DATA_BUFFER_MASK);
rdma_req->data.sgl[0].addr = (uintptr_t)rdma_req->req.data;
rdma_req->data.sgl[0].length = sgl->keyed.length;
rdma_req->data.sgl[0].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
(uint64_t)rdma_req->req.data))->lkey;
rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took buffer from central pool\n", rdma_req);
return 0;
} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
uint64_t offset = sgl->address;
uint32_t max_len = rtransport->in_capsule_data_size;
SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
offset, sgl->unkeyed.length);
if (offset > max_len) {
SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
offset, max_len);
rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
return -1;
}
max_len -= (uint32_t)offset;
if (sgl->unkeyed.length > max_len) {
SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
sgl->unkeyed.length, max_len);
rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -1;
}
rdma_req->req.data = rdma_req->recv->buf + offset;
rdma_req->data_from_pool = NULL;
rdma_req->req.length = sgl->unkeyed.length;
return 0;
}
SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
sgl->generic.type, sgl->generic.subtype);
rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
return -1;
}
static bool
spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_request *rdma_req)
{
struct spdk_nvmf_rdma_qpair *rqpair;
struct spdk_nvmf_rdma_device *device;
struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
int rc;
struct spdk_nvmf_rdma_recv *rdma_recv;
enum spdk_nvmf_rdma_request_state prev_state;
bool progress = false;
rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
device = rqpair->port->device;
assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);
/* The loop here is to allow for several back-to-back state changes. */
do {
prev_state = rdma_req->state;
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state);
switch (rdma_req->state) {
case RDMA_REQUEST_STATE_FREE:
/* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
* to escape this state. */
break;
case RDMA_REQUEST_STATE_NEW:
rqpair->cur_queue_depth++;
rdma_recv = rdma_req->recv;
/* The first element of the SGL is the NVMe command */
rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));
TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link);
TAILQ_REMOVE(&rqpair->free_queue, rdma_req, link);
/* The next state transition depends on the data transfer needs of this request. */
rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req);
/* If no data to transfer, ready to execute. */
if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
break;
}
rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER;
TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
break;
case RDMA_REQUEST_STATE_NEED_BUFFER:
assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);
if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) {
/* This request needs to wait in line to obtain a buffer */
break;
}
/* Try to get a data buffer */
rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
if (rc < 0) {
TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
break;
}
if (!rdma_req->req.data) {
/* No buffers available. */
break;
}
TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link);
/* If data is transferring from host to controller and the data didn't
* arrive using in capsule data, we need to do a transfer from the host.
*/
if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool != NULL) {
rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER;
TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link);
break;
}
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
break;
case RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER:
if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) {
/* This request needs to wait in line to perform RDMA */
break;
}
if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) {
TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link);
rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
rc = request_transfer_in(&rdma_req->req);
if (rc) {
rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
}
}
break;
case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
/* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
* to escape this state. */
break;
case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
spdk_nvmf_request_exec(&rdma_req->req);
break;
case RDMA_REQUEST_STATE_EXECUTING:
/* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
* to escape this state. */
break;
case RDMA_REQUEST_STATE_EXECUTED:
if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST;
TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link);
} else {
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
}
break;
case RDMA_REQUEST_STATE_TRANSFER_PENDING_CONTROLLER_TO_HOST:
if (rdma_req != TAILQ_FIRST(&rqpair->pending_rdma_rw_queue)) {
/* This request needs to wait in line to perform RDMA */
break;
}
if (rqpair->cur_rdma_rw_depth < rqpair->max_rw_depth) {
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
TAILQ_REMOVE(&rqpair->pending_rdma_rw_queue, rdma_req, link);
}
break;
case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
rdma_req->state = RDMA_REQUEST_STATE_COMPLETING;
rc = request_transfer_out(&rdma_req->req);
assert(rc == 0); /* No good way to handle this currently */
break;
case RDMA_REQUEST_STATE_COMPLETING:
/* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
* to escape this state. */
break;
case RDMA_REQUEST_STATE_COMPLETED:
assert(rqpair->cur_queue_depth > 0);
rqpair->cur_queue_depth--;
if (rdma_req->data_from_pool) {
/* Put the buffer back in the pool */
spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data_from_pool);
rdma_req->data_from_pool = NULL;
}
rdma_req->req.length = 0;
rdma_req->req.data = NULL;
rdma_req->state = RDMA_REQUEST_STATE_FREE;
TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link);
break;
}
if (rdma_req->state != prev_state) {
progress = true;
}
} while (rdma_req->state != prev_state);
return progress;
}
/* Public API callbacks begin here */
static struct spdk_nvmf_transport *
spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
{
int rc;
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_device *device, *tmp;
struct ibv_context **contexts;
uint32_t i;
int flag;
rtransport = calloc(1, sizeof(*rtransport));
if (!rtransport) {
return NULL;
}
pthread_mutex_init(&rtransport->lock, NULL);
TAILQ_INIT(&rtransport->devices);
TAILQ_INIT(&rtransport->ports);
rtransport->transport.tgt = tgt;
rtransport->transport.ops = &spdk_nvmf_transport_rdma;
SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n");
rtransport->max_queue_depth = tgt->opts.max_queue_depth;
rtransport->max_io_size = tgt->opts.max_io_size;
rtransport->in_capsule_data_size = tgt->opts.in_capsule_data_size;
rtransport->event_channel = rdma_create_event_channel();
if (rtransport->event_channel == NULL) {
SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
free(rtransport);
return NULL;
}
flag = fcntl(rtransport->event_channel->fd, F_GETFL);
if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
rtransport->event_channel->fd, spdk_strerror(errno));
free(rtransport);
return NULL;
}
rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma",
rtransport->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */
rtransport->max_io_size + NVMF_DATA_BUFFER_ALIGNMENT,
SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
SPDK_ENV_SOCKET_ID_ANY);
if (!rtransport->data_buf_pool) {
SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n");
free(rtransport);
return NULL;
}
spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create,
spdk_nvmf_rdma_mgmt_channel_destroy,
sizeof(struct spdk_nvmf_rdma_mgmt_channel));
contexts = rdma_get_devices(NULL);
i = 0;
rc = 0;
while (contexts[i] != NULL) {
device = calloc(1, sizeof(*device));
if (!device) {
SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
rc = -ENOMEM;
break;
}
device->context = contexts[i];
rc = ibv_query_device(device->context, &device->attr);
if (rc < 0) {
SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
free(device);
break;
}
device->pd = NULL;
device->map = NULL;
TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
i++;
}
if (rc < 0) {
TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
TAILQ_REMOVE(&rtransport->devices, device, link);
free(device);
}
spdk_mempool_free(rtransport->data_buf_pool);
rdma_destroy_event_channel(rtransport->event_channel);
free(rtransport);
rdma_free_devices(contexts);
return NULL;
}
rdma_free_devices(contexts);
return &rtransport->transport;
}
static int
spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_port *port, *port_tmp;
struct spdk_nvmf_rdma_device *device, *device_tmp;
rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
TAILQ_REMOVE(&rtransport->ports, port, link);
rdma_destroy_id(port->id);
free(port);
}
if (rtransport->event_channel != NULL) {
rdma_destroy_event_channel(rtransport->event_channel);
}
TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
TAILQ_REMOVE(&rtransport->devices, device, link);
if (device->map) {
spdk_mem_map_free(&device->map);
}
free(device);
}
if (spdk_mempool_count(rtransport->data_buf_pool) != (rtransport->max_queue_depth * 4)) {
SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n",
spdk_mempool_count(rtransport->data_buf_pool),
rtransport->max_queue_depth * 4);
}
spdk_mempool_free(rtransport->data_buf_pool);
spdk_io_device_unregister(rtransport, NULL);
free(rtransport);
return 0;
}
static int
spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport,
const struct spdk_nvme_transport_id *trid)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_device *device;
struct spdk_nvmf_rdma_port *port_tmp, *port;
struct addrinfo *res;
struct addrinfo hints;
int family;
int rc;
rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
port = calloc(1, sizeof(*port));
if (!port) {
return -ENOMEM;
}
/* Selectively copy the trid. Things like NQN don't matter here - that
* mapping is enforced elsewhere.
*/
port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
port->trid.adrfam = trid->adrfam;
snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr);
snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid);
pthread_mutex_lock(&rtransport->lock);
assert(rtransport->event_channel != NULL);
TAILQ_FOREACH(port_tmp, &rtransport->ports, link) {
if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) {
port_tmp->ref++;
free(port);
/* Already listening at this address */
pthread_mutex_unlock(&rtransport->lock);
return 0;
}
}
rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP);
if (rc < 0) {
SPDK_ERRLOG("rdma_create_id() failed\n");
free(port);
pthread_mutex_unlock(&rtransport->lock);
return rc;
}
switch (port->trid.adrfam) {
case SPDK_NVMF_ADRFAM_IPV4:
family = AF_INET;
break;
case SPDK_NVMF_ADRFAM_IPV6:
family = AF_INET6;
break;
default:
SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam);
free(port);
pthread_mutex_unlock(&rtransport->lock);
return -EINVAL;
}
memset(&hints, 0, sizeof(hints));
hints.ai_family = family;
hints.ai_socktype = SOCK_STREAM;
hints.ai_protocol = 0;
rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res);
if (rc) {
SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc);
free(port);
pthread_mutex_unlock(&rtransport->lock);
return -EINVAL;
}
rc = rdma_bind_addr(port->id, res->ai_addr);
freeaddrinfo(res);
if (rc < 0) {
SPDK_ERRLOG("rdma_bind_addr() failed\n");
rdma_destroy_id(port->id);
free(port);
pthread_mutex_unlock(&rtransport->lock);
return rc;
}
rc = rdma_listen(port->id, 10); /* 10 = backlog */
if (rc < 0) {
SPDK_ERRLOG("rdma_listen() failed\n");
rdma_destroy_id(port->id);
free(port);
pthread_mutex_unlock(&rtransport->lock);
return rc;
}
TAILQ_FOREACH(device, &rtransport->devices, link) {
if (device->context == port->id->verbs) {
port->device = device;
break;
}
}
if (!port->device) {
SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n",
port->id->verbs);
rdma_destroy_id(port->id);
free(port);
pthread_mutex_unlock(&rtransport->lock);
return -EINVAL;
}
if (!device->map) {
device->pd = port->id->pd;
device->map = spdk_mem_map_alloc(0, spdk_nvmf_rdma_mem_notify, device);
if (!device->map) {
SPDK_ERRLOG("Unable to allocate memory map for new poll group\n");
return -1;
}
} else {
assert(device->pd == port->id->pd);
}
SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n",
port->trid.traddr, ntohs(rdma_get_src_port(port->id)));
port->ref = 1;
TAILQ_INSERT_TAIL(&rtransport->ports, port, link);
pthread_mutex_unlock(&rtransport->lock);
return 0;
}
static int
spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
const struct spdk_nvme_transport_id *_trid)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_port *port, *tmp;
struct spdk_nvme_transport_id trid = {};
rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
/* Selectively copy the trid. Things like NQN don't matter here - that
* mapping is enforced elsewhere.
*/
trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
trid.adrfam = _trid->adrfam;
snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr);
snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid);
pthread_mutex_lock(&rtransport->lock);
TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) {
if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) {
assert(port->ref > 0);
port->ref--;
if (port->ref == 0) {
TAILQ_REMOVE(&rtransport->ports, port, link);
rdma_destroy_id(port->id);
free(port);
}
break;
}
}
pthread_mutex_unlock(&rtransport->lock);
return 0;
}
static void
spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct rdma_cm_event *event;
int rc;
rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
if (rtransport->event_channel == NULL) {
return;
}
while (1) {
rc = rdma_get_cm_event(rtransport->event_channel, &event);
if (rc == 0) {
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
case RDMA_CM_EVENT_ROUTE_ERROR:
/* No action required. The target never attempts to resolve routes. */
break;
case RDMA_CM_EVENT_CONNECT_REQUEST:
rc = nvmf_rdma_connect(transport, event, cb_fn);
if (rc < 0) {
SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
break;
}
break;
case RDMA_CM_EVENT_CONNECT_RESPONSE:
/* The target never initiates a new connection. So this will not occur. */
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
/* Can this happen? The docs say it can, but not sure what causes it. */
break;
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
/* These only occur on the client side. */
break;
case RDMA_CM_EVENT_ESTABLISHED:
/* TODO: Should we be waiting for this event anywhere? */
break;
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
rc = nvmf_rdma_disconnect(event);
if (rc < 0) {
SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
break;
}
continue;
case RDMA_CM_EVENT_MULTICAST_JOIN:
case RDMA_CM_EVENT_MULTICAST_ERROR:
/* Multicast is not used */
break;
case RDMA_CM_EVENT_ADDR_CHANGE:
/* Not utilizing this event */
break;
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
/* For now, do nothing. The target never re-uses queue pairs. */
break;
default:
SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
break;
}
rdma_ack_cm_event(event);
} else {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno));
}
break;
}
}
}
static void
spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport,
struct spdk_nvme_transport_id *trid,
struct spdk_nvmf_discovery_log_page_entry *entry)
{
entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
entry->adrfam = trid->adrfam;
entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED;
spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');
entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
}
static struct spdk_nvmf_transport_poll_group *
spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_poll_group *rgroup;
struct spdk_nvmf_rdma_poller *poller;
struct spdk_nvmf_rdma_device *device;
rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
rgroup = calloc(1, sizeof(*rgroup));
if (!rgroup) {
return NULL;
}
TAILQ_INIT(&rgroup->pollers);
pthread_mutex_lock(&rtransport->lock);
TAILQ_FOREACH(device, &rtransport->devices, link) {
if (device->map == NULL) {
/*
* The device is not in use (no listeners),
* so no protection domain has been constructed.
* Skip it.
*/
SPDK_NOTICELOG("Skipping unused RDMA device when creating poll group.\n");
continue;
}
poller = calloc(1, sizeof(*poller));
if (!poller) {
SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
free(rgroup);
pthread_mutex_unlock(&rtransport->lock);
return NULL;
}
poller->device = device;
poller->group = rgroup;
TAILQ_INIT(&poller->qpairs);
poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0);
if (!poller->cq) {
SPDK_ERRLOG("Unable to create completion queue\n");
free(poller);
free(rgroup);
pthread_mutex_unlock(&rtransport->lock);
return NULL;
}
TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
}
pthread_mutex_unlock(&rtransport->lock);
return &rgroup->group;
}
static void
spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
{
struct spdk_nvmf_rdma_poll_group *rgroup;
struct spdk_nvmf_rdma_poller *poller, *tmp;
rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
if (!rgroup) {
return;
}
TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
TAILQ_REMOVE(&rgroup->pollers, poller, link);
if (poller->cq) {
ibv_destroy_cq(poller->cq);
}
free(poller);
}
free(rgroup);
}
static int
spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_qpair *qpair)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_poll_group *rgroup;
struct spdk_nvmf_rdma_qpair *rqpair;
struct spdk_nvmf_rdma_device *device;
struct spdk_nvmf_rdma_poller *poller;
int rc;
rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
device = rqpair->port->device;
if (device->pd != rqpair->cm_id->pd) {
SPDK_ERRLOG("Mismatched protection domains\n");
return -1;
}
TAILQ_FOREACH(poller, &rgroup->pollers, link) {
if (poller->device == device) {
break;
}
}
if (!poller) {
SPDK_ERRLOG("No poller found for device.\n");
return -1;
}
TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link);
rqpair->poller = poller;
rc = spdk_nvmf_rdma_qpair_initialize(qpair);
if (rc < 0) {
SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair);
return -1;
}
rqpair->mgmt_channel = spdk_get_io_channel(rtransport);
if (!rqpair->mgmt_channel) {
spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
spdk_nvmf_rdma_qpair_destroy(rqpair);
return -1;
}
rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel);
assert(rqpair->ch != NULL);
rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair);
if (rc) {
/* Try to reject, but we probably can't */
spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
spdk_nvmf_rdma_qpair_destroy(rqpair);
return -1;
}
return 0;
}
static int
spdk_nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_qpair *qpair)
{
struct spdk_nvmf_rdma_poll_group *rgroup;
struct spdk_nvmf_rdma_qpair *rqpair;
struct spdk_nvmf_rdma_device *device;
struct spdk_nvmf_rdma_poller *poller;
struct spdk_nvmf_rdma_qpair *rq, *trq;
rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
device = rqpair->port->device;
TAILQ_FOREACH(poller, &rgroup->pollers, link) {
if (poller->device == device) {
break;
}
}
if (!poller) {
SPDK_ERRLOG("No poller found for device.\n");
return -1;
}
TAILQ_FOREACH_SAFE(rq, &poller->qpairs, link, trq) {
if (rq == rqpair) {
TAILQ_REMOVE(&poller->qpairs, rqpair, link);
break;
}
}
if (rq == NULL) {
SPDK_ERRLOG("RDMA qpair cannot be removed from group (not in group).\n");
return -1;
}
return 0;
}
static int
spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
{
struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport,
struct spdk_nvmf_rdma_transport, transport);
struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
rdma_req->state = RDMA_REQUEST_STATE_EXECUTED;
spdk_nvmf_rdma_request_process(rtransport, rdma_req);
return 0;
}
static void
spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair)
{
spdk_nvmf_rdma_qpair_destroy(SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair));
}
static void
spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_qpair *rqpair)
{
struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp;
struct spdk_nvmf_rdma_request *rdma_req, *req_tmp;
/* We process I/O in the pending_rdma_rw queue at the highest priority. */
TAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_rw_queue, link, req_tmp) {
if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
break;
}
}
/* The second highest priority is I/O waiting on memory buffers. */
TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, req_tmp) {
if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
break;
}
}
/* The lowest priority is processing newly received commands */
TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) {
rdma_req = TAILQ_FIRST(&rqpair->free_queue);
if (rdma_req == NULL) {
/* Need to wait for more SEND completions */
break;
}
rdma_req->recv = rdma_recv;
rdma_req->state = RDMA_REQUEST_STATE_NEW;
if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) {
break;
}
}
}
static struct spdk_nvmf_rdma_request *
get_rdma_req_from_wc(struct ibv_wc *wc)
{
struct spdk_nvmf_rdma_request *rdma_req;
rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id;
assert(rdma_req != NULL);
#ifdef DEBUG
struct spdk_nvmf_rdma_qpair *rqpair;
rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
assert(rdma_req - rqpair->reqs >= 0);
assert(rdma_req - rqpair->reqs < (ptrdiff_t)rqpair->max_queue_depth);
#endif
return rdma_req;
}
static struct spdk_nvmf_rdma_recv *
get_rdma_recv_from_wc(struct ibv_wc *wc)
{
struct spdk_nvmf_rdma_recv *rdma_recv;
assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd));
rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id;
assert(rdma_recv != NULL);
#ifdef DEBUG
struct spdk_nvmf_rdma_qpair *rqpair = rdma_recv->qpair;
assert(rdma_recv - rqpair->recvs >= 0);
assert(rdma_recv - rqpair->recvs < (ptrdiff_t)rqpair->max_queue_depth);
#endif
return rdma_recv;
}
static int
spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller)
{
struct ibv_wc wc[32];
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_rdma_recv *rdma_recv;
struct spdk_nvmf_rdma_qpair *rqpair;
int reaped, i;
int count = 0;
bool error = false;
/* Poll for completing operations. */
reaped = ibv_poll_cq(rpoller->cq, 32, wc);
if (reaped < 0) {
SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
errno, spdk_strerror(errno));
return -1;
}
for (i = 0; i < reaped; i++) {
if (wc[i].status) {
SPDK_ERRLOG("CQ error on CQ %p, Request 0x%lu (%d): %s\n",
rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
error = true;
continue;
}
switch (wc[i].opcode) {
case IBV_WC_SEND:
rdma_req = get_rdma_req_from_wc(&wc[i]);
rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
assert(rdma_req->state == RDMA_REQUEST_STATE_COMPLETING);
rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
spdk_nvmf_rdma_request_process(rtransport, rdma_req);
count++;
/* Try to process other queued requests */
spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
break;
case IBV_WC_RDMA_WRITE:
rdma_req = get_rdma_req_from_wc(&wc[i]);
rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
rqpair->cur_rdma_rw_depth--;
/* Try to process other queued requests */
spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
break;
case IBV_WC_RDMA_READ:
rdma_req = get_rdma_req_from_wc(&wc[i]);
rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
rqpair->cur_rdma_rw_depth--;
rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
spdk_nvmf_rdma_request_process(rtransport, rdma_req);
/* Try to process other queued requests */
spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
break;
case IBV_WC_RECV:
rdma_recv = get_rdma_recv_from_wc(&wc[i]);
rqpair = rdma_recv->qpair;
TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link);
/* Try to process other queued requests */
spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair);
break;
default:
SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
continue;
}
}
if (error == true) {
return -1;
}
return count;
}
static int
spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
struct spdk_nvmf_rdma_transport *rtransport;
struct spdk_nvmf_rdma_poll_group *rgroup;
struct spdk_nvmf_rdma_poller *rpoller;
int count, rc;
rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport);
rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
count = 0;
TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller);
if (rc < 0) {
return rc;
}
count += rc;
}
return count;
}
static bool
spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair)
{
struct spdk_nvmf_rdma_qpair *rqpair;
rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
if (rqpair->cur_queue_depth == 0 && rqpair->cur_rdma_rw_depth == 0) {
return true;
}
return false;
}
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = {
.type = SPDK_NVME_TRANSPORT_RDMA,
.create = spdk_nvmf_rdma_create,
.destroy = spdk_nvmf_rdma_destroy,
.listen = spdk_nvmf_rdma_listen,
.stop_listen = spdk_nvmf_rdma_stop_listen,
.accept = spdk_nvmf_rdma_accept,
.listener_discover = spdk_nvmf_rdma_discover,
.poll_group_create = spdk_nvmf_rdma_poll_group_create,
.poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy,
.poll_group_add = spdk_nvmf_rdma_poll_group_add,
.poll_group_remove = spdk_nvmf_rdma_poll_group_remove,
.poll_group_poll = spdk_nvmf_rdma_poll_group_poll,
.req_complete = spdk_nvmf_rdma_request_complete,
.qpair_fini = spdk_nvmf_rdma_close_qpair,
.qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle,
};
SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA)