From 4814a0a4ce8983cdae2a40a568d17f4c03baef3c Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Thu, 26 May 2016 09:49:29 +0000 Subject: [PATCH] Bring in the Mellanox implementation of iSER (iSCSI over RDMA) initiator, written by Sagi Grimberg and Max Gurtovoy . This code comes from https://github.com/sagigrimberg/iser-freebsd, branch iser-rebase-11-current-r291993. It's not connected to the build just yet; it still needs some tweaks to adapt to my changes to iSCSI infrastructure. Big thanks to Mellanox for their support for FreeBSD! Obtained from: Mellanox Technologies MFC after: 1 month Relnotes: yes --- sys/dev/iser/icl_iser.c | 582 ++++++++++++++++++++ sys/dev/iser/icl_iser.h | 547 +++++++++++++++++++ sys/dev/iser/iser_initiator.c | 539 +++++++++++++++++++ sys/dev/iser/iser_memory.c | 348 ++++++++++++ sys/dev/iser/iser_verbs.c | 965 ++++++++++++++++++++++++++++++++++ sys/modules/iser/Makefile | 32 ++ 6 files changed, 3013 insertions(+) create mode 100644 sys/dev/iser/icl_iser.c create mode 100644 sys/dev/iser/icl_iser.h create mode 100644 sys/dev/iser/iser_initiator.c create mode 100644 sys/dev/iser/iser_memory.c create mode 100644 sys/dev/iser/iser_verbs.c create mode 100644 sys/modules/iser/Makefile diff --git a/sys/dev/iser/icl_iser.c b/sys/dev/iser/icl_iser.c new file mode 100644 index 000000000000..d255a6bd269f --- /dev/null +++ b/sys/dev/iser/icl_iser.c @@ -0,0 +1,582 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "icl_iser.h" + +SYSCTL_NODE(_kern, OID_AUTO, iser, CTLFLAG_RW, 0, "iSER module"); +int iser_debug = 0; +SYSCTL_INT(_kern_iser, OID_AUTO, debug, CTLFLAG_RWTUN, + &iser_debug, 0, "Enable iser debug messages"); + +static MALLOC_DEFINE(M_ICL_ISER, "icl_iser", "iSCSI iser backend"); +static uma_zone_t icl_pdu_zone; + +static volatile u_int icl_iser_ncons; +struct iser_global ig; + +static icl_conn_new_pdu_t iser_conn_new_pdu; +static icl_conn_pdu_free_t iser_conn_pdu_free; +static icl_conn_pdu_data_segment_length_t iser_conn_pdu_data_segment_length; +static icl_conn_pdu_append_data_t iser_conn_pdu_append_data; +static icl_conn_pdu_queue_t iser_conn_pdu_queue; +static icl_conn_handoff_t iser_conn_handoff; +static icl_conn_free_t iser_conn_free; +static icl_conn_close_t iser_conn_close; +static icl_conn_release_t iser_conn_release; +static icl_conn_connect_t iser_conn_connect; +static icl_conn_connected_t iser_conn_connected; +static icl_conn_task_setup_t iser_conn_task_setup; +static icl_conn_task_done_t iser_conn_task_done; +static icl_conn_pdu_get_data_t iser_conn_pdu_get_data; + +static kobj_method_t icl_iser_methods[] = { + KOBJMETHOD(icl_conn_new_pdu, iser_conn_new_pdu), + KOBJMETHOD(icl_conn_pdu_free, iser_conn_pdu_free), + KOBJMETHOD(icl_conn_pdu_data_segment_length, iser_conn_pdu_data_segment_length), + KOBJMETHOD(icl_conn_pdu_append_data, iser_conn_pdu_append_data), + KOBJMETHOD(icl_conn_pdu_queue, iser_conn_pdu_queue), + KOBJMETHOD(icl_conn_handoff, iser_conn_handoff), + KOBJMETHOD(icl_conn_free, iser_conn_free), + KOBJMETHOD(icl_conn_close, iser_conn_close), + KOBJMETHOD(icl_conn_release, iser_conn_release), + KOBJMETHOD(icl_conn_connect, iser_conn_connect), + KOBJMETHOD(icl_conn_connected, iser_conn_connected), + KOBJMETHOD(icl_conn_task_setup, iser_conn_task_setup), + KOBJMETHOD(icl_conn_task_done, iser_conn_task_done), + KOBJMETHOD(icl_conn_pdu_get_data, iser_conn_pdu_get_data), + { 0, 0 } +}; + +DEFINE_CLASS(icl_iser, icl_iser_methods, sizeof(struct iser_conn)); + +/** + * iser_initialize_headers() - Initialize task headers + * @pdu: iser pdu + * @iser_conn: iser connection + * + * Notes: + * This routine may race with iser teardown flow for scsi + * error handling TMFs. So for TMF we should acquire the + * state mutex to avoid dereferencing the IB device which + * may have already been terminated (racing teardown sequence). 
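+ *
+ * Return: 0 on success, or -ENOMEM if DMA-mapping the header buffer
+ * (ISER_HEADERS_LEN bytes, DMA_TO_DEVICE) fails.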
+ */ +int +iser_initialize_headers(struct icl_iser_pdu *pdu, struct iser_conn *iser_conn) +{ + struct iser_tx_desc *tx_desc = &pdu->desc; + struct iser_device *device = iser_conn->ib_conn.device; + u64 dma_addr; + int ret = 0; + + dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) { + ret = -ENOMEM; + goto out; + } + + tx_desc->mapped = true; + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->mr->lkey; + +out: + + return (ret); +} + +int +iser_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request, + const void *addr, size_t len, int flags) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + + if (request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_LOGIN_REQUEST || + request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_TEXT_REQUEST) { + ISER_DBG("copy to login buff"); + memcpy(iser_conn->login_req_buf, addr, len); + request->ip_data_len = len; + } + + return (0); +} + +void +iser_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, + size_t off, void *addr, size_t len) +{ + /* If we have a receive data, copy it to upper layer buffer */ + if (ip->ip_data_mbuf) + memcpy(addr, ip->ip_data_mbuf + off, len); +} + +/* + * Allocate icl_pdu with empty BHS to fill up by the caller. + */ +struct icl_pdu * +iser_new_pdu(struct icl_conn *ic, int flags) +{ + struct icl_iser_pdu *iser_pdu; + struct icl_pdu *ip; + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + + iser_pdu = uma_zalloc(icl_pdu_zone, flags | M_ZERO); + if (iser_pdu == NULL) { + ISER_WARN("failed to allocate %zd bytes", sizeof(*iser_pdu)); + return (NULL); + } + + iser_pdu->iser_conn = iser_conn; + ip = &iser_pdu->icl_pdu; + ip->ip_conn = ic; + ip->ip_bhs = &iser_pdu->desc.iscsi_header; + + return (ip); +} + +struct icl_pdu * +iser_conn_new_pdu(struct icl_conn *ic, int flags) +{ + return (iser_new_pdu(ic, flags)); +} + +void +iser_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) +{ + struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip); + + uma_zfree(icl_pdu_zone, iser_pdu); +} + +size_t +iser_conn_pdu_data_segment_length(struct icl_conn *ic, + const struct icl_pdu *request) +{ + uint32_t len = 0; + + len += request->ip_bhs->bhs_data_segment_len[0]; + len <<= 8; + len += request->ip_bhs->bhs_data_segment_len[1]; + len <<= 8; + len += request->ip_bhs->bhs_data_segment_len[2]; + + return (len); +} + +void +iser_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) +{ + iser_pdu_free(ic, ip); +} + +static bool +is_control_opcode(uint8_t opcode) +{ + bool is_control = false; + + switch (opcode & ISCSI_OPCODE_MASK) { + case ISCSI_BHS_OPCODE_NOP_OUT: + case ISCSI_BHS_OPCODE_LOGIN_REQUEST: + case ISCSI_BHS_OPCODE_LOGOUT_REQUEST: + case ISCSI_BHS_OPCODE_TEXT_REQUEST: + is_control = true; + break; + case ISCSI_BHS_OPCODE_SCSI_COMMAND: + is_control = false; + break; + default: + ISER_ERR("unknown opcode %d", opcode); + } + + return (is_control); +} + +void +iser_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip); + int ret; + + ret = iser_initialize_headers(iser_pdu, iser_conn); + if (ret) { + ISER_ERR("Failed to map TX descriptor pdu %p", iser_pdu); + return; + } + + if (is_control_opcode(ip->ip_bhs->bhs_opcode)) { + ret = iser_send_control(iser_conn, iser_pdu); + if (unlikely(ret)) + ISER_ERR("Failed to 
send control pdu %p", iser_pdu); + } else { + ret = iser_send_command(iser_conn, iser_pdu); + if (unlikely(ret)) + ISER_ERR("Failed to send command pdu %p", iser_pdu); + } +} + +static struct icl_conn * +iser_new_conn(const char *name, struct mtx *lock) +{ + struct iser_conn *iser_conn; + struct icl_conn *ic; + + refcount_acquire(&icl_iser_ncons); + + iser_conn = (struct iser_conn *)kobj_create(&icl_iser_class, M_ICL_ISER, M_WAITOK | M_ZERO); + if (!iser_conn) { + ISER_ERR("failed to allocate iser conn"); + refcount_release(&icl_iser_ncons); + return (NULL); + } + + cv_init(&iser_conn->up_cv, "iser_cv"); + sx_init(&iser_conn->state_mutex, "iser_conn_state_mutex"); + mtx_init(&iser_conn->ib_conn.beacon.flush_lock, "flush_lock", NULL, MTX_DEF); + cv_init(&iser_conn->ib_conn.beacon.flush_cv, "flush_cv"); + mtx_init(&iser_conn->ib_conn.lock, "lock", NULL, MTX_DEF); + + ic = &iser_conn->icl_conn; + ic->ic_lock = lock; + ic->ic_name = name; + ic->ic_driver = strdup("iser", M_TEMP); + ic->ic_iser = true; + + return (ic); +} + +void +iser_conn_free(struct icl_conn *ic) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + + cv_destroy(&iser_conn->ib_conn.beacon.flush_cv); + mtx_destroy(&iser_conn->ib_conn.beacon.flush_lock); + sx_destroy(&iser_conn->state_mutex); + cv_destroy(&iser_conn->up_cv); + kobj_delete((struct kobj *)iser_conn, M_ICL_ISER); + refcount_release(&icl_iser_ncons); +} + +int +iser_conn_handoff(struct icl_conn *ic, int cmds_max) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + int error = 0; + + sx_xlock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { + error = EINVAL; + ISER_ERR("iser_conn %p state is %d, teardown started\n", + iser_conn, iser_conn->state); + goto out; + } + + /* + * In discovery session no need to allocate rx desc and posting recv + * work request + */ + if (ic->ic_session_type_discovery(ic)) + goto out; + + error = iser_alloc_rx_descriptors(iser_conn, cmds_max); + if (error) + goto out; + + error = iser_post_recvm(iser_conn, iser_conn->min_posted_rx); + if (error) + goto post_error; + + sx_xunlock(&iser_conn->state_mutex); + return (error); + +post_error: + iser_free_rx_descriptors(iser_conn); +out: + sx_xunlock(&iser_conn->state_mutex); + return (error); + +} + +/** + * Frees all conn objects + */ +void +iser_conn_release(struct icl_conn *ic) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_conn *curr, *tmp; + + mtx_lock(&ig.connlist_mutex); + /* + * Search for iser connection in global list. + * It may not be there in case of failure in connection establishment + * stage. + */ + list_for_each_entry_safe(curr, tmp, &ig.connlist, conn_list) { + if (iser_conn == curr) { + ISER_WARN("found iser_conn %p", iser_conn); + list_del(&iser_conn->conn_list); + } + } + mtx_unlock(&ig.connlist_mutex); + + /* + * In case we reconnecting or removing session, we need to + * release IB resources (which is safe to call more than once). 
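+ * The teardown runs under the state mutex; the cma_id, if one was
+ * created, is destroyed afterwards.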
+ */ + sx_xlock(&iser_conn->state_mutex); + iser_free_ib_conn_res(iser_conn, true); + sx_xunlock(&iser_conn->state_mutex); + + if (ib_conn->cma_id != NULL) { + rdma_destroy_id(ib_conn->cma_id); + ib_conn->cma_id = NULL; + } + +} + +void +iser_conn_close(struct icl_conn *ic) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + + ISER_INFO("closing conn %p", iser_conn); + + sx_xlock(&iser_conn->state_mutex); + /* + * In case iser connection is waiting on conditional variable + * (state PENDING) and we try to close it before connection establishment, + * we need to signal it to continue releasing connection properly. + */ + if (!iser_conn_terminate(iser_conn) && iser_conn->state == ISER_CONN_PENDING) + cv_signal(&iser_conn->up_cv); + sx_xunlock(&iser_conn->state_mutex); + +} + +int +iser_conn_connect(struct icl_conn *ic, int domain, int socktype, + int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + struct ib_conn *ib_conn = &iser_conn->ib_conn; + int err = 0; + + sx_xlock(&iser_conn->state_mutex); + /* the device is known only --after-- address resolution */ + ib_conn->device = NULL; + + iser_conn->state = ISER_CONN_PENDING; + + ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)iser_conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ib_conn->cma_id)) { + err = -PTR_ERR(ib_conn->cma_id); + ISER_ERR("rdma_create_id failed: %d", err); + goto id_failure; + } + + err = rdma_resolve_addr(ib_conn->cma_id, from_sa, to_sa, 1000); + if (err) { + ISER_ERR("rdma_resolve_addr failed: %d", err); + if (err < 0) + err = -err; + goto addr_failure; + } + + ISER_DBG("before cv_wait: %p", iser_conn); + cv_wait(&iser_conn->up_cv, &iser_conn->state_mutex); + ISER_DBG("after cv_wait: %p", iser_conn); + + if (iser_conn->state != ISER_CONN_UP) { + err = EIO; + goto addr_failure; + } + + err = iser_alloc_login_buf(iser_conn); + if (err) + goto addr_failure; + sx_xunlock(&iser_conn->state_mutex); + + mtx_lock(&ig.connlist_mutex); + list_add(&iser_conn->conn_list, &ig.connlist); + mtx_unlock(&ig.connlist_mutex); + + return (0); + +id_failure: + ib_conn->cma_id = NULL; +addr_failure: + sx_xunlock(&iser_conn->state_mutex); + return (err); +} + +/** + * Called with session spinlock held. + * No need to lock state mutex on an advisory check. 
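+ * Returns true only once the connection state has reached ISER_CONN_UP.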
+ **/ +bool +iser_conn_connected(struct icl_conn *ic) +{ + struct iser_conn *iser_conn = icl_to_iser_conn(ic); + + return (iser_conn->state == ISER_CONN_UP); +} + +int +iser_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio, + uint32_t *task_tagp, void **prvp, struct icl_pdu *ip) +{ + struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip); + + *prvp = ip; + iser_pdu->csio = csio; + + return (0); +} + +void +iser_conn_task_done(struct icl_conn *ic, void *prv) +{ + struct icl_pdu *ip = prv; + struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip); + struct iser_device *device = iser_pdu->iser_conn->ib_conn.device; + struct iser_tx_desc *tx_desc = &iser_pdu->desc; + + if (iser_pdu->dir[ISER_DIR_IN]) { + iser_unreg_rdma_mem(iser_pdu, ISER_DIR_IN); + iser_dma_unmap_task_data(iser_pdu, + &iser_pdu->data[ISER_DIR_IN], + DMA_FROM_DEVICE); + } + + if (iser_pdu->dir[ISER_DIR_OUT]) { + iser_unreg_rdma_mem(iser_pdu, ISER_DIR_OUT); + iser_dma_unmap_task_data(iser_pdu, + &iser_pdu->data[ISER_DIR_OUT], + DMA_TO_DEVICE); + } + + if (likely(tx_desc->mapped)) { + ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->mapped = false; + } + + iser_pdu_free(ic, ip); +} + +static u_int32_t +iser_hba_misc() +{ + return (PIM_UNMAPPED); +} + +static int +iser_limits(size_t *limitp) +{ + *limitp = 128 * 1024; + + return (0); +} + +static int +icl_iser_load(void) +{ + int error; + + ISER_DBG("Starting iSER datamover..."); + + icl_pdu_zone = uma_zcreate("icl_iser_pdu", sizeof(struct icl_iser_pdu), + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + /* FIXME: Check rc */ + + refcount_init(&icl_iser_ncons, 0); + + error = icl_register("iser", 0, iser_limits, iser_new_conn, iser_hba_misc); + KASSERT(error == 0, ("failed to register iser")); + + memset(&ig, 0, sizeof(struct iser_global)); + + /* device init is called only after the first addr resolution */ + sx_init(&ig.device_list_mutex, "global_device_lock"); + INIT_LIST_HEAD(&ig.device_list); + mtx_init(&ig.connlist_mutex, "global_conn_lock", NULL, MTX_DEF); + INIT_LIST_HEAD(&ig.connlist); + sx_init(&ig.close_conns_mutex, "global_close_conns_lock"); + + return (error); +} + +static int +icl_iser_unload(void) +{ + ISER_DBG("Removing iSER datamover..."); + + if (icl_iser_ncons != 0) + return (EBUSY); + + sx_destroy(&ig.close_conns_mutex); + mtx_destroy(&ig.connlist_mutex); + sx_destroy(&ig.device_list_mutex); + + icl_unregister("iser"); + + uma_zdestroy(icl_pdu_zone); + + return (0); +} + +static int +icl_iser_modevent(module_t mod, int what, void *arg) +{ + switch (what) { + case MOD_LOAD: + return (icl_iser_load()); + case MOD_UNLOAD: + return (icl_iser_unload()); + default: + return (EINVAL); + } +} + +moduledata_t icl_iser_data = { + .name = "icl_iser", + .evhand = icl_iser_modevent, + .priv = 0 +}; + +DECLARE_MODULE(icl_iser, icl_iser_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); +MODULE_DEPEND(icl_iser, icl, 1, 1, 1); +MODULE_DEPEND(icl_iser, iscsi, 1, 1, 1); +MODULE_DEPEND(icl_iser, ibcore, 1, 1, 1); +MODULE_DEPEND(icl_iser, linuxkpi, 1, 1, 1); +MODULE_VERSION(icl_iser, 1); + diff --git a/sys/dev/iser/icl_iser.h b/sys/dev/iser/icl_iser.h new file mode 100644 index 000000000000..995a0fa33667 --- /dev/null +++ b/sys/dev/iser/icl_iser.h @@ -0,0 +1,547 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef ICL_ISER_H +#define ICL_ISER_H + +/* + * iSCSI Common Layer for RDMA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ISER_DBG(X, ...) \ + do { \ + if (unlikely(iser_debug > 2)) \ + printf("DEBUG: %s: " X "\n", \ + __func__, ## __VA_ARGS__); \ + } while (0) + +#define ISER_INFO(X, ...) \ + do { \ + if (unlikely(iser_debug > 1)) \ + printf("INFO: %s: " X "\n", \ + __func__, ## __VA_ARGS__); \ + } while (0) + +#define ISER_WARN(X, ...) \ + do { \ + if (unlikely(iser_debug > 0)) { \ + printf("WARNING: %s: " X "\n", \ + __func__, ## __VA_ARGS__); \ + } \ + } while (0) + +#define ISER_ERR(X, ...) \ + printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__) + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL +#define ISER_BEACON_WRID 0xfffffffffffffffeULL + +#define SHIFT_4K 12 +#define SIZE_4K (1ULL << SHIFT_4K) +#define MASK_4K (~(SIZE_4K-1)) + +/* support up to 512KB in one RDMA */ +#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) +#define ISER_DEF_XMIT_CMDS_MAX 256 + +/* the max RX (recv) WR supported by the iSER QP is defined by * + * max_recv_wr = commands_max + recv_beacon */ +#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX + 1) +#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ +#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */ + +/* the max TX (send) WR supported by the iSER QP is defined by * + * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * + * to have at max for SCSI command. The tx posting & completion handling code * + * supports -EAGAIN scheme where tx is suspended till the QP has room for more * + * send WR. 
D=8 comes from 64K/8K */ + +#define ISER_INFLIGHT_DATAOUTS 8 + +/* the send_beacon increase the max_send_wr by 1 */ +#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_INFLIGHT_DATAOUTS) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS + 1) + +#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ + - ISER_MAX_TX_MISC_PDUS \ + - ISER_MAX_RX_MISC_PDUS - 1) / \ + (1 + ISER_INFLIGHT_DATAOUTS)) + +#define ISER_WC_BATCH_COUNT 16 +#define ISER_SIGNAL_CMD_COUNT 32 + +/* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might * + * encounter a CQ overrun state. */ +#define ISCSI_ISER_MAX_CONN 8 +#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ + ISCSI_ISER_MAX_CONN) + +#define ISER_ZBVA_NOT_SUPPORTED 0x80 +#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 + +#define icl_to_iser_conn(ic) \ + container_of(ic, struct iser_conn, icl_conn) +#define icl_to_iser_pdu(ip) \ + container_of(ip, struct icl_iser_pdu, icl_pdu) + +/** + * struct iser_hdr - iSER header + * + * @flags: flags support (zbva, remote_inv) + * @rsvd: reserved + * @write_stag: write rkey + * @write_va: write virtual address + * @reaf_stag: read rkey + * @read_va: read virtual address + */ +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; + __be64 write_va; + __be32 read_stag; + __be64 read_va; +} __attribute__((packed)); + +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE) + +#define ISER_RECV_DATA_SEG_LEN 128 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) + +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +enum iser_conn_state { + ISER_CONN_INIT, /* descriptor allocd, no conn */ + ISER_CONN_PENDING, /* in the process of being established */ + ISER_CONN_UP, /* up and running */ + ISER_CONN_TERMINATING, /* in the process of being terminated */ + ISER_CONN_DOWN, /* shut down */ + ISER_CONN_STATES_NUM +}; + +enum iser_task_status { + ISER_TASK_STATUS_INIT = 0, + ISER_TASK_STATUS_STARTED, + ISER_TASK_STATUS_COMPLETED +}; + +enum iser_data_dir { + ISER_DIR_IN = 0, /* to initiator */ + ISER_DIR_OUT, /* from initiator */ + ISER_DIRS_NUM +}; + +/** + * struct iser_mem_reg - iSER memory registration info + * + * @sge: memory region sg element + * @rkey: memory region remote key + * @mem_h: pointer to registration context (FMR/Fastreg) + */ +struct iser_mem_reg { + struct ib_sge sge; + u32 rkey; + void *mem_h; +}; + +enum iser_desc_type { + ISCSI_TX_CONTROL , + ISCSI_TX_SCSI_COMMAND, + ISCSI_TX_DATAOUT +}; + +/** + * struct iser_data_buf - iSER data buffer + * + * @sg: pointer to the sg list + * @size: num entries of this sg + * @data_len: total beffer byte len + * @dma_nents: returned by dma_map_sg + * @copy_buf: allocated copy buf for SGs unaligned + * for rdma which are copied + * @orig_sg: pointer to the original sg list (in case + * we used a copy) + * @sg_single: SG-ified clone of a non SG SC or + * unaligned SG + */ +struct iser_data_buf { + struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE]; + void *sg; + unsigned int size; + unsigned long data_len; + unsigned int dma_nents; + char *copy_buf; + struct scatterlist *orig_sg; + struct scatterlist sg_single; + }; + +/* fwd declarations */ +struct iser_conn; +struct ib_conn; +struct iser_device; + +/** + * struct 
iser_tx_desc - iSER TX descriptor (for send wr_id) + * + * @iser_header: iser header + * @iscsi_header: iscsi header (bhs) + * @type: command/control/dataout + * @dma_addr: header buffer dma_address + * @tx_sg: sg[0] points to iser/iscsi headers + * sg[1] optionally points to either of immediate data + * unsolicited data-out or control + * @num_sge: number sges used on this TX task + * @mapped: indicates if the descriptor is dma mapped + */ +struct iser_tx_desc { + struct iser_hdr iser_header; + struct iscsi_bhs iscsi_header __attribute__((packed)); + enum iser_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; + bool mapped; +}; + +#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ + sizeof(u64) + sizeof(struct ib_sge))) +/** + * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @data: received data segment + * @dma_addr: receive buffer dma address + * @rx_sg: ib_sge of receive buffer + * @pad: for sense data TODO: Modify to maximum sense length supported + */ +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_bhs iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __attribute__((packed)); + +struct icl_iser_pdu { + struct icl_pdu icl_pdu; + struct iser_tx_desc desc; + struct iser_conn *iser_conn; + enum iser_task_status status; + struct ccb_scsiio *csio; + int command_sent; + int dir[ISER_DIRS_NUM]; + struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; + struct iser_data_buf data[ISER_DIRS_NUM]; +}; + +/** + * struct iser_comp - iSER completion context + * + * @device: pointer to device handle + * @cq: completion queue + * @wcs: work completion array + * @tq: taskqueue handle + * @task: task to run task_fn + * @active_qps: Number of active QPs attached + * to completion context + */ +struct iser_comp { + struct iser_device *device; + struct ib_cq *cq; + struct ib_wc wcs[ISER_WC_BATCH_COUNT]; + struct taskqueue *tq; + struct task task; + int active_qps; +}; + +/** + * struct iser_device - iSER device handle + * + * @ib_device: RDMA device + * @pd: Protection Domain for this device + * @dev_attr: Device attributes container + * @mr: Global DMA memory region + * @event_handler: IB events handle routine + * @ig_list: entry in devices list + * @refcount: Reference counter, dominated by open iser connections + * @comps_used: Number of completion contexts used, Min between online + * cpus and device max completion vectors + * @comps: Dinamically allocated array of completion handlers + */ +struct iser_device { + struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_device_attr dev_attr; + struct ib_mr *mr; + struct ib_event_handler event_handler; + struct list_head ig_list; + int refcount; + int comps_used; + struct iser_comp *comps; +}; + +/** + * struct iser_reg_resources - Fast registration recources + * + * @mr: memory region + * @frpl: fast reg page list + * @mr_valid: is mr valid indicator + */ +struct iser_reg_resources { + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + u8 mr_valid:1; +}; + +/** + * struct fast_reg_descriptor - Fast registration descriptor + * + * @list: entry in connection fastreg pool + * @rsc: data buffer registration resources + */ +struct fast_reg_descriptor { + struct list_head list; + struct iser_reg_resources rsc; +}; + + +/** + * struct iser_beacon - beacon to signal all flush errors were drained + * + * @send: send wr + * @recv: recv wr + * @flush_lock: 
protects flush_cv + * @flush_cv: condition variable for beacon flush + */ +struct iser_beacon { + union { + struct ib_send_wr send; + struct ib_recv_wr recv; + }; + struct mtx flush_lock; + struct cv flush_cv; +}; + +/** + * struct ib_conn - Infiniband related objects + * + * @cma_id: rdma_cm connection maneger handle + * @qp: Connection Queue-pair + * @device: reference to iser device + * @comp: iser completion context + */ +struct ib_conn { + struct rdma_cm_id *cma_id; + struct ib_qp *qp; + int post_recv_buf_count; + u8 sig_count; + struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; + struct iser_device *device; + struct iser_comp *comp; + struct iser_beacon beacon; + struct mtx lock; + union { + struct { + struct ib_fmr_pool *pool; + struct iser_page_vec *page_vec; + } fmr; + struct { + struct list_head pool; + int pool_size; + } fastreg; + }; +}; + +struct iser_conn { + struct icl_conn icl_conn; + struct ib_conn ib_conn; + struct cv up_cv; + struct list_head conn_list; + struct sx state_mutex; + enum iser_conn_state state; + int qp_max_recv_dtos; + int min_posted_rx; + u16 max_cmds; + char *login_buf; + char *login_req_buf, *login_resp_buf; + u64 login_req_dma, login_resp_dma; + unsigned int rx_desc_head; + struct iser_rx_desc *rx_descs; + u32 num_rx_descs; +}; + +/** + * struct iser_global: iSER global context + * + * @device_list_mutex: protects device_list + * @device_list: iser devices global list + * @connlist_mutex: protects connlist + * @connlist: iser connections global list + * @desc_cache: kmem cache for tx dataout + * @close_conns_mutex: serializes conns closure + */ +struct iser_global { + struct sx device_list_mutex; + struct list_head device_list; + struct mtx connlist_mutex; + struct list_head connlist; + struct sx close_conns_mutex; +}; + +extern struct iser_global ig; +extern int iser_debug; + +void +iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *); + +int +iser_post_recvl(struct iser_conn *); + +int +iser_post_recvm(struct iser_conn *, int); + +int +iser_alloc_login_buf(struct iser_conn *iser_conn); + +void +iser_free_login_buf(struct iser_conn *iser_conn); + +int +iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool); + +void +iser_snd_completion(struct iser_tx_desc *, struct ib_conn *); + +void +iser_rcv_completion(struct iser_rx_desc *, unsigned long, + struct ib_conn *); + +void +iser_pdu_free(struct icl_conn *, struct icl_pdu *); + +struct icl_pdu * +iser_new_pdu(struct icl_conn *ic, int flags); + +int +iser_alloc_rx_descriptors(struct iser_conn *, int); + +void +iser_free_rx_descriptors(struct iser_conn *); + +int +iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *); + +int +iser_send_control(struct iser_conn *, struct icl_iser_pdu *); + +int +iser_send_command(struct iser_conn *, struct icl_iser_pdu *); + +int +iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir); + +void +iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir); + +int +iser_create_fastreg_pool(struct ib_conn *, unsigned); + +void +iser_free_fastreg_pool(struct ib_conn *); + +int +iser_dma_map_task_data(struct icl_iser_pdu *, + struct iser_data_buf *, enum iser_data_dir, + enum dma_data_direction); + +int +iser_conn_terminate(struct iser_conn *); + +void +iser_free_ib_conn_res(struct iser_conn *, bool); + +void +iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *, + enum dma_data_direction); + +int +iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); + +#endif /* !ICL_ISER_H */ diff --git 
a/sys/dev/iser/iser_initiator.c b/sys/dev/iser/iser_initiator.c new file mode 100644 index 000000000000..7a10c9c31d4f --- /dev/null +++ b/sys/dev/iser/iser_initiator.c @@ -0,0 +1,539 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "icl_iser.h" + +static MALLOC_DEFINE(M_ISER_INITIATOR, "iser_initiator", "iser initiator backend"); + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len + */ +static int +iser_prepare_read_cmd(struct icl_iser_pdu *iser_pdu) +{ + struct iser_hdr *hdr = &iser_pdu->desc.iser_header; + struct iser_data_buf *buf_in = &iser_pdu->data[ISER_DIR_IN]; + struct iser_mem_reg *mem_reg; + int err; + + err = iser_dma_map_task_data(iser_pdu, + buf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return (err); + + err = iser_reg_rdma_mem(iser_pdu, ISER_DIR_IN); + if (err) { + ISER_ERR("Failed to set up Data-IN RDMA"); + return (err); + } + + mem_reg = &iser_pdu->rdma_reg[ISER_DIR_IN]; + + hdr->flags |= ISER_RSV; + hdr->read_stag = cpu_to_be32(mem_reg->rkey); + hdr->read_va = cpu_to_be64(mem_reg->sge.addr); + + return (0); +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. 
Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len + */ +static int +iser_prepare_write_cmd(struct icl_iser_pdu *iser_pdu) +{ + struct iser_hdr *hdr = &iser_pdu->desc.iser_header; + struct iser_data_buf *buf_out = &iser_pdu->data[ISER_DIR_OUT]; + struct iser_mem_reg *mem_reg; + int err; + + err = iser_dma_map_task_data(iser_pdu, + buf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return (err); + + err = iser_reg_rdma_mem(iser_pdu, ISER_DIR_OUT); + if (err) { + ISER_ERR("Failed to set up Data-out RDMA"); + return (err); + } + + mem_reg = &iser_pdu->rdma_reg[ISER_DIR_OUT]; + + hdr->flags |= ISER_WSV; + hdr->write_stag = cpu_to_be32(mem_reg->rkey); + hdr->write_va = cpu_to_be64(mem_reg->sge.addr); + + return (0); +} + +/* creates a new tx descriptor and adds header regd buffer */ +void +iser_create_send_desc(struct iser_conn *iser_conn, + struct iser_tx_desc *tx_desc) +{ + struct iser_device *device = iser_conn->ib_conn.device; + + ib_dma_sync_single_for_cpu(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + tx_desc->num_sge = 1; + + if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { + tx_desc->tx_sg[0].lkey = device->mr->lkey; + ISER_DBG("sdesc %p lkey mismatch, fixing", tx_desc); + } +} + +void +iser_free_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + + if (!iser_conn->login_buf) + return; + + if (iser_conn->login_req_dma) + ib_dma_unmap_single(device->ib_device, + iser_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + if (iser_conn->login_resp_dma) + ib_dma_unmap_single(device->ib_device, + iser_conn->login_resp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + free(iser_conn->login_buf, M_ISER_INITIATOR); + + /* make sure we never redo any unmapping */ + iser_conn->login_req_dma = 0; + iser_conn->login_resp_dma = 0; + iser_conn->login_buf = NULL; +} + +int +iser_alloc_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + int req_err, resp_err; + + BUG_ON(device == NULL); + + iser_conn->login_buf = malloc(ISCSI_DEF_MAX_RECV_SEG_LEN + ISER_RX_LOGIN_SIZE, + M_ISER_INITIATOR, M_WAITOK | M_ZERO); + + if (!iser_conn->login_buf) + goto out_err; + + iser_conn->login_req_buf = iser_conn->login_buf; + iser_conn->login_resp_buf = iser_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + + iser_conn->login_req_dma = ib_dma_map_single(device->ib_device, + iser_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); + + iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device, + iser_conn->login_resp_buf, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + req_err = ib_dma_mapping_error(device->ib_device, + iser_conn->login_req_dma); + resp_err = ib_dma_mapping_error(device->ib_device, + iser_conn->login_resp_dma); + + if (req_err || resp_err) { + if (req_err) + iser_conn->login_req_dma = 0; + if (resp_err) + iser_conn->login_resp_dma = 0; + goto free_login_buf; + } + + return (0); + +free_login_buf: + iser_free_login_buf(iser_conn); + +out_err: + ISER_DBG("unable to alloc or map login buf"); + return (ENOMEM); +} + +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, int cmds_max) +{ + int i, j; + u64 dma_addr; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = 
ib_conn->device; + + iser_conn->qp_max_recv_dtos = cmds_max; + iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; + + if (iser_create_fastreg_pool(ib_conn, cmds_max)) + goto create_rdma_reg_res_failed; + + + iser_conn->num_rx_descs = cmds_max; + iser_conn->rx_descs = malloc(iser_conn->num_rx_descs * + sizeof(struct iser_rx_desc), M_ISER_INITIATOR, + M_WAITOK | M_ZERO); + if (!iser_conn->rx_descs) + goto rx_desc_alloc_fail; + + rx_desc = iser_conn->rx_descs; + + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) { + dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + goto rx_desc_dma_map_failed; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->mr->lkey; + } + + iser_conn->rx_desc_head = 0; + + return (0); + +rx_desc_dma_map_failed: + rx_desc = iser_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + free(iser_conn->rx_descs, M_ISER_INITIATOR); + iser_conn->rx_descs = NULL; +rx_desc_alloc_fail: + iser_free_fastreg_pool(ib_conn); +create_rdma_reg_res_failed: + ISER_ERR("failed allocating rx descriptors / data buffers"); + + return (ENOMEM); +} + +void +iser_free_rx_descriptors(struct iser_conn *iser_conn) +{ + int i; + struct iser_rx_desc *rx_desc; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_free_fastreg_pool(ib_conn); + + rx_desc = iser_conn->rx_descs; + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + + free(iser_conn->rx_descs, M_ISER_INITIATOR); + + /* make sure we never redo any unmapping */ + iser_conn->rx_descs = NULL; +} + +static void +iser_buf_to_sg(void *buf, struct iser_data_buf *data_buf) +{ + struct scatterlist *sg; + int i; + size_t len, tlen; + int offset; + + tlen = data_buf->data_len; + + for (i = 0; 0 < tlen; i++, tlen -= len) { + sg = &data_buf->sgl[i]; + offset = ((uintptr_t)buf) & ~PAGE_MASK; + len = min(PAGE_SIZE - offset, tlen); + sg_set_buf(sg, buf, len); + buf = (void *)(((u64)buf) + (u64)len); + } + + data_buf->size = i; + sg_mark_end(sg); +} + + +static void +iser_bio_to_sg(struct bio *bp, struct iser_data_buf *data_buf) +{ + struct scatterlist *sg; + int i; + size_t len, tlen; + int offset; + + tlen = bp->bio_bcount; + offset = bp->bio_ma_offset; + + for (i = 0; 0 < tlen; i++, tlen -= len) { + sg = &data_buf->sgl[i]; + len = min(PAGE_SIZE - offset, tlen); + sg_set_page(sg, bp->bio_ma[i], len, offset); + offset = 0; + } + + data_buf->size = i; + sg_mark_end(sg); +} + +static int +iser_csio_to_sg(struct ccb_scsiio *csio, struct iser_data_buf *data_buf) +{ + struct ccb_hdr *ccbh; + int err = 0; + + ccbh = &csio->ccb_h; + switch ((ccbh->flags & CAM_DATA_MASK)) { + case CAM_DATA_BIO: + iser_bio_to_sg((struct bio *) csio->data_ptr, data_buf); + break; + case CAM_DATA_VADDR: + /* + * Support KVA buffers for various scsi commands such as: + * - REPORT_LUNS + * - MODE_SENSE_6 + * - INQUIRY + * - SERVICE_ACTION_IN. + * The data of these commands always mapped into KVA. 
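+ * Any other CAM data transfer type is rejected with EINVAL.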
+ */ + iser_buf_to_sg(csio->data_ptr, data_buf); + break; + default: + ISER_ERR("flags 0x%X unimplemented", ccbh->flags); + err = EINVAL; + } + return (err); +} + +static inline bool +iser_signal_comp(u8 sig_count) +{ + return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); +} + +int +iser_send_command(struct iser_conn *iser_conn, + struct icl_iser_pdu *iser_pdu) +{ + struct iser_data_buf *data_buf; + struct iser_tx_desc *tx_desc = &iser_pdu->desc; + struct iscsi_bhs_scsi_command *hdr = (struct iscsi_bhs_scsi_command *) &(iser_pdu->desc.iscsi_header); + struct ccb_scsiio *csio = iser_pdu->csio; + int err = 0; + u8 sig_count = ++iser_conn->ib_conn.sig_count; + + /* build the tx desc regd header and add it to the tx desc dto */ + tx_desc->type = ISCSI_TX_SCSI_COMMAND; + iser_create_send_desc(iser_conn, tx_desc); + + if (hdr->bhssc_flags & BHSSC_FLAGS_R) { + data_buf = &iser_pdu->data[ISER_DIR_IN]; + } else { + data_buf = &iser_pdu->data[ISER_DIR_OUT]; + } + + data_buf->sg = csio->data_ptr; + data_buf->data_len = csio->dxfer_len; + + if (likely(csio->dxfer_len)) { + err = iser_csio_to_sg(csio, data_buf); + if (unlikely(err)) + goto send_command_error; + } + + if (hdr->bhssc_flags & BHSSC_FLAGS_R) { + err = iser_prepare_read_cmd(iser_pdu); + if (err) + goto send_command_error; + } else if (hdr->bhssc_flags & BHSSC_FLAGS_W) { + err = iser_prepare_write_cmd(iser_pdu); + if (err) + goto send_command_error; + } + + err = iser_post_send(&iser_conn->ib_conn, tx_desc, + iser_signal_comp(sig_count)); + if (!err) + return (0); + +send_command_error: + ISER_ERR("iser_conn %p itt %u len %u err %d", iser_conn, + hdr->bhssc_initiator_task_tag, + hdr->bhssc_expected_data_transfer_length, + err); + return (err); +} + +int +iser_send_control(struct iser_conn *iser_conn, + struct icl_iser_pdu *iser_pdu) +{ + struct iser_tx_desc *mdesc; + struct iser_device *device; + size_t datalen = iser_pdu->icl_pdu.ip_data_len; + struct icl_conn *ic = &iser_conn->icl_conn; + int err; + + mdesc = &iser_pdu->desc; + + /* build the tx desc regd header and add it to the tx desc dto */ + mdesc->type = ISCSI_TX_CONTROL; + iser_create_send_desc(iser_conn, mdesc); + + device = iser_conn->ib_conn.device; + + if (datalen > 0) { + struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + ib_dma_sync_single_for_cpu(device->ib_device, + iser_conn->login_req_dma, datalen, + DMA_TO_DEVICE); + + ib_dma_sync_single_for_device(device->ib_device, + iser_conn->login_req_dma, datalen, + DMA_TO_DEVICE); + + tx_dsg->addr = iser_conn->login_req_dma; + tx_dsg->length = datalen; + tx_dsg->lkey = device->mr->lkey; + mdesc->num_sge = 2; + } + + /* For discovery session we re-use the login buffer */ + if (ic->ic_session_login_phase(ic) || ic->ic_session_type_discovery(ic)) { + err = iser_post_recvl(iser_conn); + if (err) + goto send_control_error; + } + + err = iser_post_send(&iser_conn->ib_conn, mdesc, true); + if (!err) + return (0); + +send_control_error: + ISER_ERR("conn %p failed err %d", iser_conn, err); + + return (err); + +} + +/** + * iser_rcv_dto_completion - recv DTO completion + */ +void +iser_rcv_completion(struct iser_rx_desc *rx_desc, + unsigned long rx_xfer_len, + struct ib_conn *ib_conn) +{ + struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, + ib_conn); + struct icl_conn *ic = &iser_conn->icl_conn; + struct icl_pdu *response; + struct iscsi_bhs *hdr; + u64 rx_dma; + int rx_buflen; + int outstanding, count, err; + + /* differentiate between login to all other PDUs */ + if ((char *)rx_desc == iser_conn->login_resp_buf) { + rx_dma = 
iser_conn->login_resp_dma; + rx_buflen = ISER_RX_LOGIN_SIZE; + } else { + rx_dma = rx_desc->dma_addr; + rx_buflen = ISER_RX_PAYLOAD_SIZE; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, + rx_buflen, DMA_FROM_DEVICE); + + hdr = &rx_desc->iscsi_header; + + response = iser_new_pdu(ic, M_NOWAIT); + response->ip_bhs = hdr; + response->ip_data_len = rx_xfer_len - ISER_HEADERS_LEN; + + /* + * In case we got data in the receive buffer, assign the ip_data_mbuf + * to the rx_buffer - later we'll copy it to upper layer buffers + */ + if (response->ip_data_len) + response->ip_data_mbuf = (struct mbuf *)(rx_desc->data); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, + rx_buflen, DMA_FROM_DEVICE); + + /* decrementing conn->post_recv_buf_count only --after-- freeing the * + * task eliminates the need to worry on tasks which are completed in * + * parallel to the execution of iser_conn_term. So the code that waits * + * for the posted rx bufs refcount to become zero handles everything */ + ib_conn->post_recv_buf_count--; + + if (rx_dma == iser_conn->login_resp_dma) + goto receive; + + outstanding = ib_conn->post_recv_buf_count; + if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { + count = min(iser_conn->qp_max_recv_dtos - outstanding, + iser_conn->min_posted_rx); + err = iser_post_recvm(iser_conn, count); + if (err) + ISER_ERR("posting %d rx bufs err %d", count, err); + } + +receive: + (ic->ic_receive)(response); +} + +void +iser_snd_completion(struct iser_tx_desc *tx_desc, + struct ib_conn *ib_conn) +{ + struct icl_iser_pdu *iser_pdu = container_of(tx_desc, struct icl_iser_pdu, desc); + struct iser_conn *iser_conn = iser_pdu->iser_conn; + + if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) + iser_pdu_free(&iser_conn->icl_conn, &iser_pdu->icl_pdu); +} diff --git a/sys/dev/iser/iser_memory.c b/sys/dev/iser/iser_memory.c new file mode 100644 index 000000000000..1acc1ba88af2 --- /dev/null +++ b/sys/dev/iser/iser_memory.c @@ -0,0 +1,348 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "icl_iser.h" + +static struct fast_reg_descriptor * +iser_reg_desc_get(struct ib_conn *ib_conn) +{ + struct fast_reg_descriptor *desc; + + mtx_lock(&ib_conn->lock); + desc = list_first_entry(&ib_conn->fastreg.pool, + struct fast_reg_descriptor, list); + list_del(&desc->list); + mtx_unlock(&ib_conn->lock); + + return (desc); +} + +static void +iser_reg_desc_put(struct ib_conn *ib_conn, + struct fast_reg_descriptor *desc) +{ + mtx_lock(&ib_conn->lock); + list_add(&desc->list, &ib_conn->fastreg.pool); + mtx_unlock(&ib_conn->lock); +} + +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) + +/** + * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses + * and returns the length of resulting physical address array (may be less than + * the original due to possible compaction). + * + * we build a "page vec" under the assumption that the SG meets the RDMA + * alignment requirements. Other then the first and last SG elements, all + * the "internal" elements can be compacted into a list whose elements are + * dma addresses of physical pages. The code supports also the weird case + * where --few fragments of the same page-- are present in the SG as + * consecutive elements. Also, it handles one entry SG. + */ +static int +iser_sg_to_page_vec(struct iser_data_buf *data, + struct ib_device *ibdev, u64 *pages, + int *offset, int *data_size) +{ + struct scatterlist *sg, *sgl = data->sgl; + u64 start_addr, end_addr, page, chunk_start = 0; + unsigned long total_sz = 0; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; + + /* compute the offset of first element */ + *offset = (u64) sgl[0].offset & ~MASK_4K; + + new_chunk = 1; + cur_page = 0; + for_each_sg(sgl, sg, data->dma_nents, i) { + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; + total_sz += dma_len; + + /* collect page fragments until aligned or end of SG list */ + if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & MASK_4K; + do { + pages[cur_page++] = page; + page += SIZE_4K; + } while (page < end_addr); + } + + *data_size = total_sz; + + return (cur_page); +} + +/** + * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned + * for RDMA sub-list of a scatter-gather list of memory buffers, and returns + * the number of entries which are aligned correctly. Supports the case where + * consecutive SG elements are actually fragments of the same physcial page. + */ +static int +iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev) +{ + struct scatterlist *sg, *sgl, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (data->dma_nents == 1) + return (1); + + sgl = data->sgl; + start_addr = ib_sg_dma_address(ibdev, sgl); + + for_each_sg(sgl, sg, data->dma_nents, i) { + if (start_check && !IS_4K_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else + start_check = 1; + + if (!IS_4K_ALIGNED(end_addr)) + break; + } + ret_len = (next_sg) ? 
i : i+1; + + return (ret_len); +} + +void +iser_dma_unmap_task_data(struct icl_iser_pdu *iser_pdu, + struct iser_data_buf *data, + enum dma_data_direction dir) +{ + struct ib_device *dev; + + dev = iser_pdu->iser_conn->ib_conn.device->ib_device; + ib_dma_unmap_sg(dev, data->sgl, data->size, dir); +} + +static int +iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, + struct iser_mem_reg *reg) +{ + struct scatterlist *sg = mem->sgl; + + reg->sge.lkey = device->mr->lkey; + reg->rkey = device->mr->rkey; + reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); + reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); + + return (0); +} + +/** + * TODO: This should be a verb + * iser_ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 +iser_ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + + return (((rkey + 1) & mask) | (rkey & ~mask)); +} + +static void +iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) +{ + u32 rkey; + + memset(inv_wr, 0, sizeof(*inv_wr)); + inv_wr->opcode = IB_WR_LOCAL_INV; + inv_wr->wr_id = ISER_FASTREG_LI_WRID; + inv_wr->ex.invalidate_rkey = mr->rkey; + + rkey = iser_ib_inc_rkey(mr->rkey); + ib_update_fast_reg_key(mr, rkey); +} + +static int +iser_fast_reg_mr(struct icl_iser_pdu *iser_pdu, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg) +{ + struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_send_wr fastreg_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + int ret, offset, size, plen; + + /* if there a single dma entry, dma mr suffices */ + if (mem->dma_nents == 1) + return iser_reg_dma(device, mem, reg); + + /* rsc is not null */ + plen = iser_sg_to_page_vec(mem, device->ib_device, + rsc->frpl->page_list, + &offset, &size); + if (plen * SIZE_4K < size) { + ISER_ERR("fast reg page_list too short to hold this SG"); + return (EINVAL); + } + + if (!rsc->mr_valid) { + iser_inv_rkey(&inv_wr, rsc->mr); + wr = &inv_wr; + } + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = rsc->frpl->page_list[0] + offset; + fastreg_wr.wr.fast_reg.page_list = rsc->frpl; + fastreg_wr.wr.fast_reg.page_list_len = plen; + fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; + fastreg_wr.wr.fast_reg.length = size; + fastreg_wr.wr.fast_reg.rkey = rsc->mr->rkey; + fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + if (!wr) + wr = &fastreg_wr; + else + wr->next = &fastreg_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + ISER_ERR("fast registration failed, ret:%d", ret); + return (ret); + } + rsc->mr_valid = 0; + + reg->sge.lkey = rsc->mr->lkey; + reg->rkey = rsc->mr->rkey; + reg->sge.addr = rsc->frpl->page_list[0] + offset; + reg->sge.length = size; + + return (ret); +} + +/** + * iser_reg_rdma_mem - Registers memory intended for RDMA, + * using Fast Registration WR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int +iser_reg_rdma_mem(struct icl_iser_pdu *iser_pdu, + enum iser_data_dir cmd_dir) +{ + struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = 
device->ib_device; + struct iser_data_buf *mem = &iser_pdu->data[cmd_dir]; + struct iser_mem_reg *mem_reg = &iser_pdu->rdma_reg[cmd_dir]; + struct fast_reg_descriptor *desc = NULL; + int err, aligned_len; + + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + ISER_ERR("bounce buffer is not supported"); + return 1; + } + + if (mem->dma_nents != 1) { + desc = iser_reg_desc_get(ib_conn); + mem_reg->mem_h = desc; + } + + err = iser_fast_reg_mr(iser_pdu, mem, desc ? &desc->rsc : NULL, + mem_reg); + if (err) + goto err_reg; + + return (0); + +err_reg: + if (desc) + iser_reg_desc_put(ib_conn, desc); + + return (err); +} + +void +iser_unreg_rdma_mem(struct icl_iser_pdu *iser_pdu, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_pdu->rdma_reg[cmd_dir]; + + if (!reg->mem_h) + return; + + iser_reg_desc_put(&iser_pdu->iser_conn->ib_conn, + reg->mem_h); + reg->mem_h = NULL; +} + +int +iser_dma_map_task_data(struct icl_iser_pdu *iser_pdu, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) +{ + struct ib_device *dev; + + iser_pdu->dir[iser_dir] = 1; + dev = iser_pdu->iser_conn->ib_conn.device->ib_device; + + data->dma_nents = ib_dma_map_sg(dev, data->sgl, data->size, dma_dir); + if (data->dma_nents == 0) { + ISER_ERR("dma_map_sg failed"); + return (EINVAL); + } + + return (0); +} diff --git a/sys/dev/iser/iser_verbs.c b/sys/dev/iser/iser_verbs.c new file mode 100644 index 000000000000..2f638e45e9a0 --- /dev/null +++ b/sys/dev/iser/iser_verbs.c @@ -0,0 +1,965 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include "icl_iser.h"
+
+static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
+static int iser_cq_poll_limit = 512;
+
+static void
+iser_cq_event_callback(struct ib_event *cause, void *context)
+{
+ ISER_ERR("got cq event %d", cause->event);
+}
+
+static void
+iser_qp_event_callback(struct ib_event *cause, void *context)
+{
+ ISER_ERR("got qp event %d", cause->event);
+}
+
+static void
+iser_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ ISER_ERR("async event %d on device %s port %d",
+ event->event, event->device->name,
+ event->element.port_num);
+}
+
+/**
+ * is_iser_tx_desc - Indicate if the completion wr_id
+ * is a TX descriptor or not.
+ * @iser_conn: iser connection
+ * @wr_id: completion WR identifier
+ *
+ * Since we cannot rely on the wc opcode in FLUSH errors
+ * we must work around it by checking if the wr_id address
+ * falls in the iser connection rx_descs buffer. If so
+ * it is an RX descriptor, otherwise it is a TX descriptor.
+ */
+static inline bool
+is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
+{
+ void *start = iser_conn->rx_descs;
+ u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
+ void *end = (void *)((uintptr_t)start + (uintptr_t)len);
+
+ if (start) {
+ if (wr_id >= start && wr_id < end)
+ return false;
+ } else {
+ return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
+ }
+
+ return true;
+}
+
+/**
+ * iser_handle_comp_error() - Handle error completion
+ * @ib_conn: connection RDMA resources
+ * @wc: work completion
+ *
+ * Notes: Update post_recv_buf_count in case of recv error completion.
+ * For a non-FLUSH error completion we should also notify the iSCSI layer
+ * that the connection has failed (in case we passed the bind stage).
+ */
+static void
+iser_handle_comp_error(struct ib_conn *ib_conn,
+ struct ib_wc *wc)
+{
+ void *wr_id = (void *)(uintptr_t)wc->wr_id;
+ struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+ ib_conn);
+
+ if (is_iser_tx_desc(iser_conn, wr_id)) {
+ ISER_DBG("conn %p got send comp error", iser_conn);
+ } else {
+ ISER_DBG("conn %p got recv comp error", iser_conn);
+ ib_conn->post_recv_buf_count--;
+ }
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
+}
+
+/**
+ * iser_handle_wc - handle a single work completion
+ * @wc: work completion
+ *
+ * Runs from the completion taskqueue context; the work completion can
+ * be either SEND or RECV, and can complete successfully or
+ * with an error (or flush error).
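+ * Successful completions are dispatched on the wc opcode: IB_WC_RECV
+ * completions go to iser_rcv_completion() and IB_WC_SEND completions
+ * to iser_snd_completion(). Error completions are passed to
+ * iser_handle_comp_error(), except for the ISER_BEACON_WRID drain
+ * marker, which only wakes up the waiter in iser_conn_terminate().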
+ */ +static void iser_handle_wc(struct ib_wc *wc) +{ + struct ib_conn *ib_conn; + struct iser_tx_desc *tx_desc; + struct iser_rx_desc *rx_desc; + + ib_conn = wc->qp->qp_context; + if (likely(wc->status == IB_WC_SUCCESS)) { + if (wc->opcode == IB_WC_RECV) { + rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; + iser_rcv_completion(rx_desc, wc->byte_len, + ib_conn); + } else + if (wc->opcode == IB_WC_SEND) { + tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; + iser_snd_completion(tx_desc, ib_conn); + } else { + ISER_ERR("Unknown wc opcode %d", wc->opcode); + } + } else { + struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, + ib_conn); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + ISER_ERR("conn %p wr id %lx status %d vend_err %x", + iser_conn, wc->wr_id, wc->status, wc->vendor_err); + } else { + ISER_DBG("flush error: conn %p wr id %lx", iser_conn, wc->wr_id); + } + + if (wc->wr_id == ISER_BEACON_WRID) { + /* all flush errors were consumed */ + mtx_lock(&ib_conn->beacon.flush_lock); + ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn); + cv_signal(&ib_conn->beacon.flush_cv); + mtx_unlock(&ib_conn->beacon.flush_lock); + } else { + iser_handle_comp_error(ib_conn, wc); + } + } +} + +static void +iser_cq_tasklet_fn(void *data, int pending) +{ + struct iser_comp *comp = (struct iser_comp *)data; + struct ib_cq *cq = comp->cq; + struct ib_wc *const wcs = comp->wcs; + int completed = 0; + int i; + int n; + + while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) { + for (i = 0; i < n; i++) + iser_handle_wc(&wcs[i]); + + completed += n; + if (completed >= iser_cq_poll_limit) + break; + } + + /* + * It is assumed here that arming CQ only once its empty + * would not cause interrupts to be missed. + */ + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static void +iser_cq_callback(struct ib_cq *cq, void *cq_context) +{ + struct iser_comp *comp = cq_context; + + taskqueue_enqueue_fast(comp->tq, &comp->task); +} + +/** + * iser_create_device_ib_res - creates Protection Domain (PD), Completion + * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with + * the adapator. 
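+ * One CQ and one fast taskqueue are created per completion vector, up
+ * to min(mp_ncpus, num_comp_vectors), and a single DMA MR with local
+ * write and remote read/write access is registered for the whole device.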
+ * + * returns 0 on success, -1 on failure + */ +static int +iser_create_device_ib_res(struct iser_device *device) +{ + struct ib_device_attr *dev_attr = &device->dev_attr; + int ret, i, max_cqe; + + ret = ib_query_device(device->ib_device, dev_attr); + if (ret) { + ISER_ERR("Query device failed for %s", device->ib_device->name); + return (ret); + } + + if (!(dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { + ISER_ERR("device %s doesn't support Fastreg, " + "can't register memory", device->ib_device->name); + return (1); + } + + device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors); + + device->comps = malloc(device->comps_used * sizeof(*device->comps), + M_ISER_VERBS, M_WAITOK | M_ZERO); + if (!device->comps) + goto comps_err; + + max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); + + ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d", + device->comps_used, device->ib_device->name, + device->ib_device->num_comp_vectors, max_cqe); + + device->pd = ib_alloc_pd(device->ib_device); + if (IS_ERR(device->pd)) + goto pd_err; + + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + comp->device = device; + comp->cq = ib_create_cq(device->ib_device, + iser_cq_callback, + iser_cq_event_callback, + (void *)comp, + max_cqe, i); + if (IS_ERR(comp->cq)) { + comp->cq = NULL; + goto cq_err; + } + + if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) + goto cq_err; + + TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp); + comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &comp->tq); + if (!comp->tq) + goto tq_err; + taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq"); + } + + device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(device->mr)) + goto tq_err; + + INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, + iser_event_handler); + if (ib_register_event_handler(&device->event_handler)) + goto handler_err; + + return (0); + +handler_err: + ib_dereg_mr(device->mr); +tq_err: + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + if (comp->tq) + taskqueue_free(comp->tq); + } +cq_err: + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + if (comp->cq) + ib_destroy_cq(comp->cq); + } + ib_dealloc_pd(device->pd); +pd_err: + free(device->comps, M_ISER_VERBS); +comps_err: + ISER_ERR("failed to allocate an IB resource"); + return (1); +} + +/** + * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, + * CQ and PD created with the device associated with the adapator. 
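+ * Called from iser_device_try_release() with ig.device_list_mutex held,
+ * once the last reference to the device has been dropped.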
+ */ +static void +iser_free_device_ib_res(struct iser_device *device) +{ + int i; + + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + taskqueue_free(comp->tq); + ib_destroy_cq(comp->cq); + comp->cq = NULL; + } + + (void)ib_unregister_event_handler(&device->event_handler); + (void)ib_dereg_mr(device->mr); + (void)ib_dealloc_pd(device->pd); + + free(device->comps, M_ISER_VERBS); + device->comps = NULL; + + device->mr = NULL; + device->pd = NULL; +} + +static int +iser_alloc_reg_res(struct ib_device *ib_device, + struct ib_pd *pd, + struct iser_reg_resources *res) +{ + int ret; + + res->frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(res->frpl)) { + ret = -PTR_ERR(res->frpl); + ISER_ERR("Failed to allocate fast reg page list err=%d", ret); + return (ret); + } + + res->mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(res->mr)) { + ret = -PTR_ERR(res->mr); + ISER_ERR("Failed to allocate fast reg mr err=%d", ret); + goto fast_reg_mr_failure; + } + res->mr_valid = 1; + + return (0); + +fast_reg_mr_failure: + ib_free_fast_reg_page_list(res->frpl); + + return (ret); +} + +static void +iser_free_reg_res(struct iser_reg_resources *rsc) +{ + ib_dereg_mr(rsc->mr); + ib_free_fast_reg_page_list(rsc->frpl); +} + +static struct fast_reg_descriptor * +iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd) +{ + struct fast_reg_descriptor *desc; + int ret; + + desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO); + if (!desc) { + ISER_ERR("Failed to allocate a new fastreg descriptor"); + return (NULL); + } + + ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc); + if (ret) { + ISER_ERR("failed to allocate reg_resources"); + goto err; + } + + return (desc); +err: + free(desc, M_ISER_VERBS); + return (NULL); +} + +/** + * iser_create_fmr_pool - Creates FMR pool and page_vector + * + * returns 0 on success, or errno code on failure + */ +int +iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) +{ + struct iser_device *device = ib_conn->device; + struct fast_reg_descriptor *desc; + int i; + + INIT_LIST_HEAD(&ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size = 0; + for (i = 0; i < cmds_max; i++) { + desc = iser_create_fastreg_desc(device->ib_device, device->pd); + if (!desc) { + ISER_ERR("Failed to create fastreg descriptor"); + goto err; + } + + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size++; + } + + return (0); + +err: + iser_free_fastreg_pool(ib_conn); + return (ENOMEM); +} + +/** + * iser_free_fmr_pool - releases the FMR pool and page vec + */ +void +iser_free_fastreg_pool(struct ib_conn *ib_conn) +{ + struct fast_reg_descriptor *desc, *tmp; + int i = 0; + + if (list_empty(&ib_conn->fastreg.pool)) + return; + + ISER_DBG("freeing conn %p fr pool", ib_conn); + + list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { + list_del(&desc->list); + iser_free_reg_res(&desc->rsc); + free(desc, M_ISER_VERBS); + ++i; + } + + if (i < ib_conn->fastreg.pool_size) + ISER_WARN("pool still has %d regions registered", + ib_conn->fastreg.pool_size - i); +} + +/** + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * returns 0 on success, 1 on failure + */ +static int +iser_create_ib_conn_res(struct ib_conn *ib_conn) +{ + struct iser_conn *iser_conn; + struct iser_device *device; + struct ib_device_attr *dev_attr; + struct ib_qp_init_attr init_attr; + int index, min_index = 0; + int ret = -ENOMEM; + + iser_conn = 
container_of(ib_conn, struct iser_conn, ib_conn); + device = ib_conn->device; + dev_attr = &device->dev_attr; + + mtx_lock(&ig.connlist_mutex); + /* select the CQ with the minimal number of usages */ + for (index = 0; index < device->comps_used; index++) { + if (device->comps[index].active_qps < + device->comps[min_index].active_qps) + min_index = index; + } + ib_conn->comp = &device->comps[min_index]; + ib_conn->comp->active_qps++; + mtx_unlock(&ig.connlist_mutex); + ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn); + + memset(&init_attr, 0, sizeof init_attr); + init_attr.event_handler = iser_qp_event_callback; + init_attr.qp_context = (void *)ib_conn; + init_attr.send_cq = ib_conn->comp->cq; + init_attr.recv_cq = ib_conn->comp->cq; + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + init_attr.cap.max_send_sge = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + + if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) { + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; + iser_conn->max_cmds = + ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); + } else { + init_attr.cap.max_send_wr = dev_attr->max_qp_wr; + iser_conn->max_cmds = + ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr); + } + ISER_DBG("device %s supports max_send_wr %d", + device->ib_device->name, dev_attr->max_qp_wr); + + ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); + if (ret) + goto out_err; + + ib_conn->qp = ib_conn->cma_id->qp; + ISER_DBG("setting conn %p cma_id %p qp %p", + ib_conn, ib_conn->cma_id, + ib_conn->cma_id->qp); + + return (ret); + +out_err: + mtx_lock(&ig.connlist_mutex); + ib_conn->comp->active_qps--; + mtx_unlock(&ig.connlist_mutex); + ISER_ERR("unable to alloc mem or create resource, err %d", ret); + + return (ret); +} + +/** + * based on the resolved device node GUID see if there already allocated + * device for this device. If there's no such, create one. 
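+ * The lookup is keyed on the IB device node GUID and is done under
+ * ig.device_list_mutex; a reference is taken on the returned device
+ * and dropped later via iser_device_try_release().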
+ */ +static struct iser_device * +iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + + sx_xlock(&ig.device_list_mutex); + + list_for_each_entry(device, &ig.device_list, ig_list) + /* find if there's a match using the node GUID */ + if (device->ib_device->node_guid == cma_id->device->node_guid) + goto inc_refcnt; + + device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO); + if (device == NULL) + goto out; + + /* assign this device to the device */ + device->ib_device = cma_id->device; + /* init the device and link it into ig device list */ + if (iser_create_device_ib_res(device)) { + free(device, M_ISER_VERBS); + device = NULL; + goto out; + } + list_add(&device->ig_list, &ig.device_list); + +inc_refcnt: + device->refcount++; + ISER_INFO("device %p refcount %d", device, device->refcount); +out: + sx_xunlock(&ig.device_list_mutex); + return (device); +} + +/* if there's no demand for this device, release it */ +static void +iser_device_try_release(struct iser_device *device) +{ + sx_xlock(&ig.device_list_mutex); + device->refcount--; + ISER_INFO("device %p refcount %d", device, device->refcount); + if (!device->refcount) { + iser_free_device_ib_res(device); + list_del(&device->ig_list); + free(device, M_ISER_VERBS); + device = NULL; + } + sx_xunlock(&ig.device_list_mutex); +} + +/** + * Called with state mutex held + **/ +static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, + enum iser_conn_state comp, + enum iser_conn_state exch) +{ + int ret; + + ret = (iser_conn->state == comp); + if (ret) + iser_conn->state = exch; + + return ret; +} + +/** + * iser_free_ib_conn_res - release IB related resources + * @iser_conn: iser connection struct + * @destroy: indicator if we need to try to release the + * iser device and memory regoins pool (only iscsi + * shutdown and DEVICE_REMOVAL will use this). + * + * This routine is called with the iser state mutex held + * so the cm_id removal is out of here. It is Safe to + * be invoked multiple times. + */ +void +iser_free_ib_conn_res(struct iser_conn *iser_conn, + bool destroy) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + ISER_INFO("freeing conn %p cma_id %p qp %p", + iser_conn, ib_conn->cma_id, ib_conn->qp); + + if (ib_conn->qp != NULL) { + mtx_lock(&ig.connlist_mutex); + ib_conn->comp->active_qps--; + mtx_unlock(&ig.connlist_mutex); + rdma_destroy_qp(ib_conn->cma_id); + ib_conn->qp = NULL; + } + + if (destroy) { + if (iser_conn->login_buf) + iser_free_login_buf(iser_conn); + + if (iser_conn->rx_descs) + iser_free_rx_descriptors(iser_conn); + + if (device != NULL) { + iser_device_try_release(device); + ib_conn->device = NULL; + } + } +} + +/** + * triggers start of the disconnect procedures and wait for them to be done + * Called with state mutex held + */ +int +iser_conn_terminate(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_send_wr *bad_send_wr; + struct ib_recv_wr *bad_recv_wr; + int err = 0; + + /* terminate the iser conn only if the conn state is UP */ + if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + return (0); + + ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state); + + if (ib_conn->qp == NULL) { + /* HOW can this be??? */ + ISER_WARN("qp wasn't created"); + return (1); + } + + /* + * Todo: This is a temporary workaround. 
* We serialize the connection closure using a global lock in order to
+ * receive all posted beacon completions.
+ * Without serialization, in case we open many connections (QPs) on
+ * the same CQ, we might miss beacons because of missed interrupts.
+ */
+ sx_xlock(&ig.close_conns_mutex);
+
+ /*
+ * In case we didn't already clean up the cma_id (peer initiated
+ * a disconnection), we need to cause the CMA to change the QP
+ * state to ERROR.
+ */
+ if (ib_conn->cma_id) {
+ err = rdma_disconnect(ib_conn->cma_id);
+ if (err)
+ ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
+ iser_conn, err);
+
+ mtx_lock(&ib_conn->beacon.flush_lock);
+ memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
+ ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
+ ib_conn->beacon.send.opcode = IB_WR_SEND;
+ /* post an indication that all send flush errors were consumed */
+ err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
+ if (err) {
+ ISER_ERR("conn %p failed to post send_beacon", ib_conn);
+ mtx_unlock(&ib_conn->beacon.flush_lock);
+ goto out;
+ }
+
+ ISER_DBG("before send cv_wait: %p", iser_conn);
+ cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
+ ISER_DBG("after send cv_wait: %p", iser_conn);
+
+ memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
+ ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
+ /* post an indication that all recv flush errors were consumed */
+ err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
+ if (err) {
+ ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
+ mtx_unlock(&ib_conn->beacon.flush_lock);
+ goto out;
+ }
+
+ ISER_DBG("before recv cv_wait: %p", iser_conn);
+ cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
+ mtx_unlock(&ib_conn->beacon.flush_lock);
+ ISER_DBG("after recv cv_wait: %p", iser_conn);
+ }
+out:
+ sx_xunlock(&ig.close_conns_mutex);
+ return (1);
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void
+iser_connect_error(struct rdma_cm_id *cma_id)
+{
+ struct iser_conn *iser_conn;
+
+ iser_conn = cma_id->context;
+
+ ISER_ERR("conn %p", iser_conn);
+
+ iser_conn->state = ISER_CONN_TERMINATING;
+
+ cv_signal(&iser_conn->up_cv);
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void
+iser_addr_handler(struct rdma_cm_id *cma_id)
+{
+ struct iser_device *device;
+ struct iser_conn *iser_conn;
+ struct ib_conn *ib_conn;
+ int ret;
+
+ iser_conn = cma_id->context;
+
+ ib_conn = &iser_conn->ib_conn;
+ device = iser_device_find_by_ib_device(cma_id);
+ if (!device) {
+ ISER_ERR("conn %p device lookup/creation failed",
+ iser_conn);
+ iser_connect_error(cma_id);
+ return;
+ }
+
+ ib_conn->device = device;
+
+ ret = rdma_resolve_route(cma_id, 1000);
+ if (ret) {
+ ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
+ iser_connect_error(cma_id);
+ return;
+ }
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void
+iser_route_handler(struct rdma_cm_id *cma_id)
+{
+ struct rdma_conn_param conn_param;
+ int ret;
+ struct iser_cm_hdr req_hdr;
+ struct iser_conn *iser_conn = cma_id->context;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+
+ ret = iser_create_ib_conn_res(ib_conn);
+ if (ret)
+ goto failure;
+
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
+ conn_param.retry_count = 7;
+ conn_param.rnr_retry_count = 6;
+ /*
+ * Initiator depth should not be set, but in order to stay compatible
+ * with old targets, we keep this value set.
+ */ + conn_param.initiator_depth = 1; + + memset(&req_hdr, 0, sizeof(req_hdr)); + req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | + ISER_SEND_W_INV_NOT_SUPPORTED); + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); + + ret = rdma_connect(cma_id, &conn_param); + if (ret) { + ISER_ERR("conn %p failure connecting: %d", iser_conn, ret); + goto failure; + } + + return; +failure: + iser_connect_error(cma_id); +} + +/** + * Called with state mutex held + **/ +static void +iser_connected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + + iser_conn = cma_id->context; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + + ISER_INFO("remote qpn:%x my qpn:%x", + attr.dest_qp_num, cma_id->qp->qp_num); + + iser_conn->state = ISER_CONN_UP; + + cv_signal(&iser_conn->up_cv); +} + +/** + * Called with state mutex held + **/ +static void +iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy) +{ + struct iser_conn *iser_conn = cma_id->context; + + if (iser_conn_terminate(iser_conn)) + iser_conn->icl_conn.ic_error(&iser_conn->icl_conn); + +} + +int +iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct iser_conn *iser_conn; + int ret = 0; + + iser_conn = cma_id->context; + ISER_INFO("event %d status %d conn %p id %p", + event->event, event->status, cma_id->context, cma_id); + + sx_xlock(&iser_conn->state_mutex); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + iser_addr_handler(cma_id); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + iser_route_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + iser_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + iser_connect_error(cma_id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + iser_cleanup_handler(cma_id, false); + break; + default: + ISER_ERR("Unexpected RDMA CM event (%d)", event->event); + break; + } + sx_xunlock(&iser_conn->state_mutex); + + return (ret); +} + +int +iser_post_recvl(struct iser_conn *iser_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_failed; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_sge sge; + int ib_ret; + + sge.addr = iser_conn->login_resp_dma; + sge.length = ISER_RX_LOGIN_SIZE; + sge.lkey = ib_conn->device->mr->lkey; + + rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + rx_wr.next = NULL; + + ib_conn->post_recv_buf_count++; + ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); + if (ib_ret) { + ISER_ERR("ib_post_recv failed ret=%d", ib_ret); + ib_conn->post_recv_buf_count--; + } + + return (ib_ret); +} + +int +iser_post_recvm(struct iser_conn *iser_conn, int count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ib_ret; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + unsigned int my_rx_head = iser_conn->rx_desc_head; + struct iser_rx_desc *rx_desc; + + for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &iser_conn->rx_descs[my_rx_head]; + rx_wr->wr_id = (uintptr_t)rx_desc; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos; + } + + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + 
ib_conn->post_recv_buf_count += count;
+ ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+ if (ib_ret) {
+ ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
+ ib_conn->post_recv_buf_count -= count;
+ } else
+ iser_conn->rx_desc_head = my_rx_head;
+
+ return (ib_ret);
+}
+
+/**
+ * iser_post_send - Initiates a Send DTO operation
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
+ bool signal)
+{
+ int ib_ret;
+ struct ib_send_wr send_wr, *send_wr_failed;
+
+ ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+ tx_desc->dma_addr, ISER_HEADERS_LEN,
+ DMA_TO_DEVICE);
+
+ send_wr.next = NULL;
+ send_wr.wr_id = (uintptr_t)tx_desc;
+ send_wr.sg_list = tx_desc->tx_sg;
+ send_wr.num_sge = tx_desc->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
+
+ ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
+ if (ib_ret)
+ ISER_ERR("ib_post_send failed, ret:%d", ib_ret);
+
+ return (ib_ret);
+}
diff --git a/sys/modules/iser/Makefile b/sys/modules/iser/Makefile
new file mode 100644
index 000000000000..2f7955e56562
--- /dev/null
+++ b/sys/modules/iser/Makefile
@@ -0,0 +1,32 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/iser/
+
+.include <bsd.own.mk>
+
+KMOD= iser
+
+SRCS= icl_iser.c
+SRCS+= iser_initiator.c
+SRCS+= iser_memory.c
+SRCS+= iser_verbs.c
+SRCS+= vnode_if.h
+SRCS+= opt_inet.h
+SRCS+= opt_inet6.h
+SRCS+= opt_cam.h
+SRCS+= bus_if.h
+SRCS+= device_if.h
+SRCS+= icl_conn_if.h
+
+CFLAGS+= -I${.CURDIR}/../../
+CFLAGS+= -I${SYSDIR}/ofed/include
+CFLAGS+= -I${SYSDIR}/compat/linuxkpi/common/include
+CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM
+CFLAGS+= -DINET6 -DINET
+CFLAGS+= -fms-extensions
+
+CFLAGS+=-DICL_KERNEL_PROXY
+
+MFILES= kern/bus_if.m kern/device_if.m dev/iscsi/icl_conn_if.m
+
+.include <bsd.kmod.mk>