diff --git a/sys/conf/files b/sys/conf/files index 3b8c2b431ba7..82c727912110 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3624,19 +3624,16 @@ ofed/drivers/infiniband/core/fmr_pool.c optional ofed \ ofed/drivers/infiniband/core/iwcm.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" -ofed/drivers/infiniband/core/local_sa.c optional ofed \ - no-depend \ - compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/mad_rmpp.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/multicast.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" -ofed/drivers/infiniband/core/notice.c optional ofed \ +ofed/drivers/infiniband/core/packer.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" -ofed/drivers/infiniband/core/packer.c optional ofed \ +ofed/drivers/infiniband/core/peer_mem.c optional ofed \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/sa_query.c optional ofed \ @@ -3741,6 +3738,9 @@ ofed/drivers/infiniband/hw/mlx4/mad.c optional mlx4ib \ ofed/drivers/infiniband/hw/mlx4/main.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c optional mlx4ib \ + no-depend \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/mr.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" diff --git a/sys/contrib/rdma/krping/krping.c b/sys/contrib/rdma/krping/krping.c index c89339edf31b..94b2eb94e41a 100644 --- a/sys/contrib/rdma/krping/krping.c +++ b/sys/contrib/rdma/krping/krping.c @@ -525,7 +525,7 @@ static void krping_setup_wr(struct krping_cb *cb) case MW: cb->bind_attr.wr_id = 0xabbaabba; cb->bind_attr.send_flags = 0; /* unsignaled */ - cb->bind_attr.length = cb->size; + cb->bind_attr.bind_info.length = cb->size; break; default: break; @@ -627,7 +627,7 @@ static int krping_setup_buffers(struct krping_cb *cb) cb->page_list, cb->page_list_len); break; case MW: - cb->mw = ib_alloc_mw(cb->pd); + cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); if (IS_ERR(cb->mw)) { DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); ret = PTR_ERR(cb->mw); @@ -898,15 +898,15 @@ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) * Update the MW with new buf info. 
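/*
 * The krping hunks above show the two verbs API changes this update brings in
 * for memory windows: ib_alloc_mw() now takes an ib_mw_type, and the bind
 * parameters move under ib_mw_bind.bind_info.  A minimal sketch of the new
 * usage (assuming <rdma/ib_verbs.h>; ib_dealloc_mw() as the cleanup path,
 * error handling trimmed to the essentials; helper name is illustrative):
 */
static int bind_type1_mw(struct ib_pd *pd, struct ib_qp *qp, struct ib_mr *mr,
    u64 addr, u64 len, struct ib_mw **mwp)
{
	struct ib_mw_bind bind;
	struct ib_mw *mw;
	int ret;

	mw = ib_alloc_mw(pd, IB_MW_TYPE_1);	/* window type is now explicit */
	if (IS_ERR(mw))
		return PTR_ERR(mw);

	memset(&bind, 0, sizeof(bind));
	bind.wr_id = 1;				/* completion cookie */
	bind.send_flags = 0;			/* unsignaled bind */
	bind.bind_info.mr = mr;			/* MR backing the window */
	bind.bind_info.addr = addr;		/* window start (DMA address) */
	bind.bind_info.length = len;
	bind.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;

	ret = ib_bind_mw(qp, mw, &bind);	/* post the bind on the QP */
	if (ret) {
		ib_dealloc_mw(mw);
		return ret;
	}
	*mwp = mw;
	return 0;
}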
*/ if (buf == (u64)cb->start_dma_addr) { - cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; - cb->bind_attr.mr = cb->start_mr; + cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; + cb->bind_attr.bind_info.mr = cb->start_mr; } else { - cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; - cb->bind_attr.mr = cb->rdma_mr; + cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; + cb->bind_attr.bind_info.mr = cb->rdma_mr; } - cb->bind_attr.addr = buf; + cb->bind_attr.bind_info.addr = buf; DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n", - cb->mw->rkey, buf, cb->bind_attr.mr->rkey); + cb->mw->rkey, buf, cb->bind_attr.bind_info.mr->rkey); ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); if (ret) { PRINTF(cb, "bind mw error %d\n", ret); @@ -2304,7 +2304,7 @@ int krping_doit(char *cmd, void *cookie) goto out; } - cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP); + cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); PRINTF(cb, "rdma_create_id error %d\n", ret); diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c index 64ac36c01c10..9a3d75fab708 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c @@ -176,7 +176,7 @@ iwch_destroy_cq(struct ib_cq *ib_cq) } static struct ib_cq * -iwch_create_cq(struct ib_device *ibdev, int entries, int vector, +iwch_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_context, struct ib_udata *udata) { @@ -187,6 +187,7 @@ iwch_create_cq(struct ib_device *ibdev, int entries, int vector, struct iwch_ucontext *ucontext = NULL; static int warned; size_t resplen; + int entries = attr->cqe; CTR3(KTR_IW_CXGB, "%s ib_dev %p entries %d", __FUNCTION__, ibdev, entries); rhp = to_iwch_dev(ibdev); @@ -545,16 +546,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int mr_id) { __be64 *pages; - int shift, i, n; + int shift, n, len; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; struct iwch_reg_user_mr_resp uresp; -#ifdef notyet - int j, k, len; -#endif + struct scatterlist *sg; CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); @@ -575,9 +574,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mhp->umem->page_size) - 1; - n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - n += chunk->nents; + n = mhp->umem->nmap; err = iwch_alloc_pbl(mhp, n); if (err) @@ -591,7 +588,21 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; -#ifdef notyet + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } +#if 0 TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; @@ -612,9 +623,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (i) err = iwch_write_pbl(mhp, pages, i, n); -#ifdef notyet pbl_done: -#endif cxfree(pages); if (err) goto err_pbl; @@ -672,7 +681,7 @@ static struct ib_mr 
*iwch_get_dma_mr(struct ib_pd *pd, int acc) return ibmr; } -static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct iwch_dev *rhp; struct iwch_pd *php; diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c index 648d96b867a8..3e8e6b3c9ef7 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c @@ -551,18 +551,18 @@ int iwch_bind_mw(struct ib_qp *qp, if (mw_bind->send_flags & IB_SEND_SIGNALED) t3_wr_flags = T3_COMPLETION_FLAG; - sgl.addr = mw_bind->addr; - sgl.lkey = mw_bind->mr->lkey; - sgl.length = mw_bind->length; + sgl.addr = mw_bind->bind_info.addr; + sgl.lkey = mw_bind->bind_info.mr->lkey; + sgl.length = mw_bind->bind_info.length; wqe->bind.reserved = 0; wqe->bind.type = T3_VA_BASED_TO; /* TBD: check perms */ - wqe->bind.perms = iwch_ib_to_mwbind_access(mw_bind->mw_access_flags); - wqe->bind.mr_stag = htobe32(mw_bind->mr->lkey); + wqe->bind.perms = iwch_ib_to_mwbind_access(mw_bind->bind_info.mw_access_flags); + wqe->bind.mr_stag = htobe32(mw_bind->bind_info.mr->lkey); wqe->bind.mw_stag = htobe32(mw->rkey); - wqe->bind.mw_len = htobe32(mw_bind->length); - wqe->bind.mw_va = htobe64(mw_bind->addr); + wqe->bind.mw_len = htobe32(mw_bind->bind_info.length); + wqe->bind.mw_va = htobe64(mw_bind->bind_info.addr); err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); if (err) { mtx_unlock(&qhp->lock); diff --git a/sys/dev/cxgbe/iw_cxgbe/cq.c b/sys/dev/cxgbe/iw_cxgbe/cq.c index ec72a6c7b759..8710e03a829c 100644 --- a/sys/dev/cxgbe/iw_cxgbe/cq.c +++ b/sys/dev/cxgbe/iw_cxgbe/cq.c @@ -775,7 +775,7 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq) } struct ib_cq * -c4iw_create_cq(struct ib_device *ibdev, int entries, int vector, +c4iw_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_context, struct ib_udata *udata) { struct c4iw_dev *rhp; @@ -785,6 +785,7 @@ c4iw_create_cq(struct ib_device *ibdev, int entries, int vector, int ret; size_t memsize, hwentries; struct c4iw_mm_entry *mm, *mm2; + int entries = attr->cqe; CTR3(KTR_IW_CXGBE, "%s ib_dev %p entries %d", __func__, ibdev, entries); diff --git a/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h b/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h index 245e045240b9..5f2542c55c0c 100644 --- a/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h +++ b/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h @@ -864,7 +864,7 @@ struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl( int page_list_len); struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth); int c4iw_dealloc_mw(struct ib_mw *mw); -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata, int mr_id); struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); @@ -881,8 +881,7 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int acc, u64 *iova_start); int c4iw_dereg_mr(struct ib_mr *ib_mr); int c4iw_destroy_cq(struct ib_cq *ib_cq); -struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, - int vector, +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_context, struct ib_udata *udata); int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); diff --git a/sys/dev/cxgbe/iw_cxgbe/mem.c b/sys/dev/cxgbe/iw_cxgbe/mem.c index 50c5ed0c1f80..f7c460ac5cac 100644 --- a/sys/dev/cxgbe/iw_cxgbe/mem.c +++ 
b/sys/dev/cxgbe/iw_cxgbe/mem.c @@ -563,9 +563,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { __be64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct c4iw_dev *rhp; struct c4iw_pd *php; struct c4iw_mr *mhp; @@ -594,11 +594,8 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = ffs(mhp->umem->page_size) - 1; - - n = 0; - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - n += chunk->nents; - + + n = mhp->umem->nmap; err = alloc_pbl(mhp, n); if (err) goto err; @@ -610,25 +607,23 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } i = n = 0; - - list_for_each_entry(chunk, &mhp->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address( - &chunk->page_list[j]) + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + mhp->umem->page_size * k); - if (i == PAGE_SIZE / sizeof *pages) { - err = write_pbl(&mhp->rhp->rdev, - pages, - mhp->attr.pbl_addr + (n << 3), i); - if (err) - goto pbl_done; - n += i; - i = 0; - } + if (i == PAGE_SIZE / sizeof *pages) { + err = write_pbl(&mhp->rhp->rdev, + pages, + mhp->attr.pbl_addr + (n << 3), i); + if (err) + goto pbl_done; + n += i; + i = 0; + } } + } if (i) err = write_pbl(&mhp->rhp->rdev, pages, @@ -662,7 +657,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(err); } -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd) +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct c4iw_dev *rhp; struct c4iw_pd *php; diff --git a/sys/modules/ibcore/Makefile b/sys/modules/ibcore/Makefile index fea77b969637..6a471d254747 100644 --- a/sys/modules/ibcore/Makefile +++ b/sys/modules/ibcore/Makefile @@ -4,8 +4,8 @@ KMOD= ibcore SRCS= addr.c iwcm.c sa_query.c ucma.c uverbs_cmd.c \ - agent.c local_sa.c multicast.c smi.c ud_header.c uverbs_main.c \ - mad.c notice.c umem.c uverbs_marshall.c \ + agent.c multicast.c smi.c ud_header.c uverbs_main.c \ + mad.c peer_mem.c umem.c uverbs_marshall.c \ cache.c device.c packer.c sysfs.c user_mad.c verbs.c \ cm.c fmr_pool.c mad_rmpp.c ucm.c cma.c \ vnode_if.h device_if.h bus_if.h pci_if.h \ diff --git a/sys/modules/mlx4ib/Makefile b/sys/modules/mlx4ib/Makefile index 57592bc995cf..4ab7a4e7f338 100644 --- a/sys/modules/mlx4ib/Makefile +++ b/sys/modules/mlx4ib/Makefile @@ -6,6 +6,7 @@ KMOD= mlx4ib SRCS= device_if.h bus_if.h vnode_if.h pci_if.h \ opt_inet.h opt_inet6.h \ alias_GUID.c mcg.c sysfs.c ah.c cq.c \ + mlx4_exp.c \ doorbell.c mad.c main.c mr.c qp.c srq.c wc.c cm.c CFLAGS+= -I${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4 diff --git a/sys/ofed/drivers/infiniband/core/Makefile b/sys/ofed/drivers/infiniband/core/Makefile deleted file mode 100644 index f64604019939..000000000000 --- a/sys/ofed/drivers/infiniband/core/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o -user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o - -obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o $(infiniband-y) -obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o -obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ - $(user_access-y) - -ib_core-y := packer.o ud_header.o verbs.o 
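/*
 * Both reg_user_mr() conversions above (iw_cxgb and iw_cxgbe) follow the same
 * pattern: the old ib_umem_chunk list walk is replaced by iterating the
 * scatterlist in umem->sg_head with for_each_sg(), expanding each DMA segment
 * into page-sized PBL entries.  A generic sketch of that walk, using the field
 * names from the hunks (the caller supplies the pages[] array; the return
 * value is the number of entries filled; helper name is illustrative):
 */
static int umem_fill_pbl(struct ib_umem *umem, __be64 *pages, int npages)
{
	struct scatterlist *sg;
	int shift = ffs(umem->page_size) - 1;	/* log2 of the umem page size */
	int entry, i = 0, k, len;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
		len = sg_dma_len(sg) >> shift;	/* pages in this DMA segment */
		for (k = 0; k < len; ++k) {
			if (i == npages)
				return -ENOMEM;	/* caller's PBL is too small */
			pages[i++] = cpu_to_be64(sg_dma_address(sg) +
			    (u64)umem->page_size * k);
		}
	}
	return i;
}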
sysfs.o \ - device.o fmr_pool.o cache.o -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o - -ib_mad-y := mad.o smi.o agent.o mad_rmpp.o - -ib_sa-y := sa_query.o multicast.o notice.o local_sa.o - -ib_cm-y := cm.o - -iw_cm-y := iwcm.o - -rdma_cm-y := cma.o - -rdma_ucm-y := ucma.o - -ib_addr-y := addr.o - -ib_umad-y := user_mad.o - -ib_ucm-y := ucm.o - -ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o diff --git a/sys/ofed/drivers/infiniband/core/addr.c b/sys/ofed/drivers/infiniband/core/addr.c index c3d5b4fc0bf6..e85b5546858d 100644 --- a/sys/ofed/drivers/infiniband/core/addr.c +++ b/sys/ofed/drivers/infiniband/core/addr.c @@ -69,6 +69,7 @@ static LIST_HEAD(req_list); static struct delayed_work work; static struct workqueue_struct *addr_wq; +static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); @@ -89,19 +90,6 @@ void rdma_addr_unregister_client(struct rdma_addr_client *client) } EXPORT_SYMBOL(rdma_addr_unregister_client); -#ifdef __linux__ -int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, - const unsigned char *dst_dev_addr) -{ - dev_addr->dev_type = dev->type; - memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); - memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); - dev_addr->bound_dev_if = dev->ifindex; - return 0; -} -#else int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr) { @@ -119,10 +107,10 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, dev_addr->bound_dev_if = dev->if_index; return 0; } -#endif EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, + u16 *vlan_id) { struct net_device *dev; int ret = -EADDRNOTAVAIL; @@ -137,33 +125,21 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) } switch (addr->sa_family) { -#ifdef INET case AF_INET: - dev = ip_dev_find(NULL, + dev = ip_dev_find(&init_net, ((struct sockaddr_in *) addr)->sin_addr.s_addr); if (!dev) return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; -#endif #if defined(INET6) case AF_INET6: -#ifdef __linux__ - read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { - if (ipv6_chk_addr(&init_net, - &((struct sockaddr_in6 *) addr)->sin6_addr, - dev, 1)) { - ret = rdma_copy_addr(dev_addr, dev, NULL); - break; - } - } - read_unlock(&dev_base_lock); -#else { struct sockaddr_in6 *sin6; struct ifaddr *ifa; @@ -179,11 +155,11 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) break; } ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp); ifa_free(ifa); break; } -#endif - break; #endif } return ret; @@ -218,127 +194,6 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -#ifdef __linux__ -static int addr4_resolve(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) -{ - __be32 src_ip = src_in->sin_addr.s_addr; - __be32 dst_ip = dst_in->sin_addr.s_addr; - struct flowi fl; - struct rtable *rt; - struct neighbour *neigh; - int ret; - - memset(&fl, 0, sizeof fl); - fl.nl_u.ip4_u.daddr = dst_ip; - fl.nl_u.ip4_u.saddr = src_ip; - fl.oif = addr->bound_dev_if; - - ret 
= ip_route_output_key(&init_net, &rt, &fl); - if (ret) - goto out; - - src_in->sin_family = AF_INET; - src_in->sin_addr.s_addr = rt->rt_src; - - if (rt->idev->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } - - /* If the device does ARP internally, return 'done' */ - if (rt->idev->dev->flags & IFF_NOARP) { - rdma_copy_addr(addr, rt->idev->dev, NULL); - goto put; - } - - neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); - if (!neigh || !(neigh->nud_state & NUD_VALID)) { - neigh_event_send(rt->u.dst.neighbour, NULL); - ret = -ENODATA; - if (neigh) - goto release; - goto put; - } - - ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); -release: - neigh_release(neigh); -put: - ip_rt_put(rt); -out: - return ret; -} - -#if defined(INET6) -static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) -{ - struct flowi fl; - struct neighbour *neigh; - struct dst_entry *dst; - int ret; - - memset(&fl, 0, sizeof fl); - ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr); - ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr); - fl.oif = addr->bound_dev_if; - - dst = ip6_route_output(&init_net, NULL, &fl); - if ((ret = dst->error)) - goto put; - - if (ipv6_addr_any(&fl.fl6_src)) { - ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, - &fl.fl6_dst, 0, &fl.fl6_src); - if (ret) - goto put; - - src_in->sin6_family = AF_INET6; - ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src); - } - - if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } - - /* If the device does ARP internally, return 'done' */ - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - goto put; - } - - neigh = dst->neighbour; - if (!neigh || !(neigh->nud_state & NUD_VALID)) { - neigh_event_send(dst->neighbour, NULL); - ret = -ENODATA; - goto put; - } - - ret = rdma_copy_addr(addr, dst->dev, neigh->ha); -put: - dst_release(dst); - return ret; -} -#else -static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) -{ - return -EADDRNOTAVAIL; -} -#endif - -#else -#include - static int addr_resolve(struct sockaddr *src_in, struct sockaddr *dst_in, struct rdma_dev_addr *addr) @@ -354,7 +209,6 @@ static int addr_resolve(struct sockaddr *src_in, int bcast; int is_gw = 0; int error = 0; - /* * Determine whether the address is unicast, multicast, or broadcast * and whether the source interface is valid. @@ -382,8 +236,7 @@ static int addr_resolve(struct sockaddr *src_in, port = sin->sin_port; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - } else - src_in = NULL; + } break; #endif #ifdef INET6 @@ -406,7 +259,7 @@ static int addr_resolve(struct sockaddr *src_in, * If we have a source address to use look it up first and verify * that it is a local interface. */ - if (src_in) { + if (sin->sin_addr.s_addr != INADDR_ANY) { ifa = ifa_ifwithaddr(src_in); if (sin) sin->sin_port = port; @@ -436,15 +289,20 @@ static int addr_resolve(struct sockaddr *src_in, * correct interface pointer and unlock the route. 
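/*
 * rdma_translate_ip() now takes an optional vlan_id out-parameter (see the
 * hunk above), and the resolved source MAC is left in
 * rdma_dev_addr.src_dev_addr.  A small sketch of the new calling convention
 * for a locally bound address (assuming ETH_ALEN bytes of smac storage, as in
 * the helpers added later in this file; helper name is illustrative):
 */
static int local_l2_from_ip(struct sockaddr *addr, u8 *smac, u16 *vlan_id)
{
	struct rdma_dev_addr dev_addr;
	int ret;

	memset(&dev_addr, 0, sizeof(dev_addr));
	ret = rdma_translate_ip(addr, &dev_addr, vlan_id);	/* vlan_id may be NULL */
	if (ret)
		return ret;
	memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
	return 0;
}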
*/ if (multi || bcast) { - if (ifp == NULL) + if (ifp == NULL) { ifp = rte->rt_ifp; + /* rt_ifa holds the route answer source address */ + ifa = rte->rt_ifa; + } RTFREE_LOCKED(rte); } else if (ifp && ifp != rte->rt_ifp) { RTFREE_LOCKED(rte); return -ENETUNREACH; } else { - if (ifp == NULL) + if (ifp == NULL) { ifp = rte->rt_ifp; + ifa = rte->rt_ifa; + } RT_UNLOCK(rte); } mcast: @@ -459,6 +317,8 @@ static int addr_resolve(struct sockaddr *src_in, error = rdma_copy_addr(addr, ifp, LLADDR((struct sockaddr_dl *)llsa)); free(llsa, M_IFMADDR); + if (error == 0) + memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr)); return error; } /* @@ -472,7 +332,7 @@ static int addr_resolve(struct sockaddr *src_in, #endif #ifdef INET6 case AF_INET6: - error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst,NULL); + error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst, NULL); break; #endif default: @@ -480,15 +340,15 @@ static int addr_resolve(struct sockaddr *src_in, error = -EINVAL; } RTFREE(rte); - if (error == 0) + if (error == 0) { + memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr)); return rdma_copy_addr(addr, ifp, edst); + } if (error == EWOULDBLOCK) return -ENODATA; return -error; } -#endif - static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; @@ -602,20 +462,94 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) } EXPORT_SYMBOL(rdma_addr_cancel); +struct resolve_cb_context { + struct rdma_dev_addr *addr; + struct completion comp; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct + rdma_dev_addr)); + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, + u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + struct net_device *dev; + + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + + ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid); + if (ret) + return ret; + + ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid); + if (ret) + return ret; + + memset(&dev_addr, 0, sizeof(dev_addr)); + + ctx.addr = &dev_addr; + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); + if (!dev) + return -ENODEV; + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } gid_addr; + + ret = rdma_gid2ip(&gid_addr._sockaddr, sgid); + + if (ret) + return ret; + memset(&dev_addr, 0, sizeof(dev_addr)); + ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); + if (ret) + return ret; + + memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); + static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { if (event == NETEVENT_NEIGH_UPDATE) { -#ifdef __linux__ - 
struct neighbour *neigh = ctx; - - if (neigh->nud_state & NUD_VALID) { set_timeout(jiffies); } -#else - set_timeout(jiffies); -#endif - } return 0; } @@ -631,11 +565,13 @@ static int __init addr_init(void) return -ENOMEM; register_netevent_notifier(&nb); + rdma_addr_register_client(&self); return 0; } static void __exit addr_cleanup(void) { + rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } diff --git a/sys/ofed/drivers/infiniband/core/cache.c b/sys/ofed/drivers/infiniband/core/cache.c index 660bff50a7db..d11e7c2a88f7 100644 --- a/sys/ofed/drivers/infiniband/core/cache.c +++ b/sys/ofed/drivers/infiniband/core/cache.c @@ -76,19 +76,21 @@ int ib_get_cached_gid(struct ib_device *device, { struct ib_gid_cache *cache; unsigned long flags; - int ret = 0; + int ret = -EINVAL; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.gid_cache[port_num - start_port(device)]; + if (device->cache.gid_cache) { + cache = device->cache.gid_cache[port_num - start_port(device)]; - if (index < 0 || index >= cache->table_len) - ret = -EINVAL; - else - *gid = cache->table[index]; + if (cache && index >= 0 && index < cache->table_len) { + *gid = cache->table[index]; + ret = 0; + } + } read_unlock_irqrestore(&device->cache.lock, flags); @@ -111,22 +113,24 @@ int ib_find_cached_gid(struct ib_device *device, *index = -1; read_lock_irqsave(&device->cache.lock, flags); - + if (!device->cache.gid_cache) + goto out; for (p = 0; p <= end_port(device) - start_port(device); ++p) { cache = device->cache.gid_cache[p]; + if (!cache) + continue; for (i = 0; i < cache->table_len; ++i) { if (!memcmp(gid, &cache->table[i], sizeof *gid)) { *port_num = p + start_port(device); if (index) *index = i; ret = 0; - goto found; + goto out; } } } -found: +out: read_unlock_irqrestore(&device->cache.lock, flags); - return ret; } EXPORT_SYMBOL(ib_find_cached_gid); @@ -138,19 +142,21 @@ int ib_get_cached_pkey(struct ib_device *device, { struct ib_pkey_cache *cache; unsigned long flags; - int ret = 0; + int ret = -EINVAL; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (device->cache.pkey_cache) { + cache = device->cache.pkey_cache[port_num - start_port(device)]; - if (index < 0 || index >= cache->table_len) - ret = -EINVAL; - else - *pkey = cache->table[index]; + if (cache && index >= 0 && index < cache->table_len) { + *pkey = cache->table[index]; + ret = 0; + } + } read_unlock_irqrestore(&device->cache.lock, flags); @@ -167,41 +173,93 @@ int ib_find_cached_pkey(struct ib_device *device, unsigned long flags; int i; int ret = -ENOENT; + int partial_ix = -1; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; + *index = -1; + read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (!device->cache.pkey_cache) + goto out; - *index = -1; + cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (!cache) + goto out; for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else + partial_ix = i; + } + + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } +out: + read_unlock_irqrestore(&device->cache.lock, flags); + return 
ret; +} +EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + *index = -1; + + read_lock_irqsave(&device->cache.lock, flags); + + if (!device->cache.pkey_cache) + goto out; + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (!cache) + goto out; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { *index = i; ret = 0; break; } - +out: read_unlock_irqrestore(&device->cache.lock, flags); - return ret; } -EXPORT_SYMBOL(ib_find_cached_pkey); +EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u8 port_num, u8 *lmc) { unsigned long flags; - int ret = 0; + int ret = -EINVAL; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.lmc_cache[port_num - start_port(device)]; + if (device->cache.lmc_cache) { + *lmc = device->cache.lmc_cache[port_num - start_port(device)]; + ret = 0; + } read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -217,6 +275,10 @@ static void ib_cache_update(struct ib_device *device, int i; int ret; + if (!(device->cache.pkey_cache && device->cache.gid_cache && + device->cache.lmc_cache)) + return; + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return; @@ -309,7 +371,7 @@ static void ib_cache_event(struct ib_event_handler *handler, INIT_WORK(&work->work, ib_cache_task); work->device = event->device; work->port_num = event->element.port_num; - schedule_work(&work->work); + queue_work(ib_wq, &work->work); } } } @@ -362,14 +424,21 @@ static void ib_cache_setup_one(struct ib_device *device) kfree(device->cache.pkey_cache); kfree(device->cache.gid_cache); kfree(device->cache.lmc_cache); + device->cache.pkey_cache = NULL; + device->cache.gid_cache = NULL; + device->cache.lmc_cache = NULL; } static void ib_cache_cleanup_one(struct ib_device *device) { int p; + if (!(device->cache.pkey_cache && device->cache.gid_cache && + device->cache.lmc_cache)) + return; + ib_unregister_event_handler(&device->cache.event_handler); - flush_scheduled_work(); + flush_workqueue(ib_wq); for (p = 0; p <= end_port(device) - start_port(device); ++p) { kfree(device->cache.pkey_cache[p]); diff --git a/sys/ofed/drivers/infiniband/core/cm.c b/sys/ofed/drivers/infiniband/core/cm.c index 3d2794d439e2..07f6e08f2901 100644 --- a/sys/ofed/drivers/infiniband/core/cm.c +++ b/sys/ofed/drivers/infiniband/core/cm.c @@ -36,16 +36,19 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include +#include #include @@ -57,16 +60,10 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); -#define PFX "ib_cm: " - -/* - * Limit CM message timeouts to something reasonable: - * 8 seconds per message, with up to 15 retries - */ -static int max_timeout = 21; -module_param(max_timeout, int, 0644); -MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout " - "(default=21, or ~8 seconds)"); +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ static void cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device); @@ -189,6 +186,8 @@ struct cm_av { struct ib_ah_attr 
ah_attr; u16 pkey_index; u8 timeout; + u8 valid; + u8 smac[ETH_ALEN]; }; struct cm_work { @@ -358,6 +357,23 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, grh, &av->ah_attr); } +int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac) +{ + struct cm_id_private *cm_id_priv; + + cm_id_priv = container_of(id, struct cm_id_private, id); + + if (smac != NULL) + memcpy(cm_id_priv->av.smac, smac, sizeof(cm_id_priv->av.smac)); + + if (alt_smac != NULL) + memcpy(cm_id_priv->alt_av.smac, alt_smac, + sizeof(cm_id_priv->alt_av.smac)); + + return 0; +} +EXPORT_SYMBOL(ib_update_cm_av); + static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) { struct cm_device *cm_dev; @@ -388,6 +404,9 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); av->timeout = path->packet_life_time + 1; + memcpy(av->smac, path->smac, sizeof(av->smac)); + + av->valid = 1; return 0; } @@ -402,7 +421,7 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, next_id, &id); if (!ret) - next_id = ((unsigned) id + 1) & MAX_ID_MASK; + next_id = ((unsigned) id + 1) & MAX_IDR_MASK; spin_unlock_irqrestore(&cm.lock, flags); } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); @@ -794,11 +813,11 @@ static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) } } -static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) +static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id, gfp_t flags) { struct cm_timewait_info *timewait_info; - timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); + timewait_info = kzalloc(sizeof *timewait_info, flags); if (!timewait_info) return ERR_PTR(-ENOMEM); @@ -902,6 +921,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) break; case IB_CM_ESTABLISHED: spin_unlock_irq(&cm_id_priv->lock); + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) + break; ib_send_cm_dreq(cm_id, NULL, 0); goto retest; case IB_CM_DREQ_SENT: @@ -1021,33 +1042,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, req_msg->service_id = param->service_id; req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); - cm_req_set_resp_res(req_msg, param->responder_resources); cm_req_set_init_depth(req_msg, param->initiator_depth); cm_req_set_remote_resp_timeout(req_msg, param->remote_cm_response_timeout); - if (param->remote_cm_response_timeout > (u8) max_timeout) { - printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > " - "%d, decreasing\n", param->remote_cm_response_timeout, - max_timeout); - cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout); - } cm_req_set_qp_type(req_msg, param->qp_type); cm_req_set_flow_ctrl(req_msg, param->flow_control); cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); cm_req_set_local_resp_timeout(req_msg, param->local_cm_response_timeout); - if (param->local_cm_response_timeout > (u8) max_timeout) { - printk(KERN_WARNING PFX "req local_cm_response_timeout %d > " - "%d, decreasing\n", param->local_cm_response_timeout, - max_timeout); - cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout); - } - cm_req_set_retry_count(req_msg, param->retry_count); req_msg->pkey = param->primary_path->pkey; cm_req_set_path_mtu(req_msg, param->primary_path->mtu); - cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); 
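/*
 * The new ib_update_cm_av() exported above lets a caller overwrite the source
 * MACs cached in a connection's primary and alternate AVs; either pointer may
 * be NULL to leave that AV untouched.  Illustrative use, assuming the caller
 * has already resolved an ETH_ALEN source MAC (helper name is illustrative):
 */
static int cm_set_src_mac(struct ib_cm_id *cm_id, const u8 *smac)
{
	/* Only the primary AV is updated here; pass alt_smac as well when an
	 * alternate path is in use. */
	return ib_update_cm_av(cm_id, smac, NULL);
}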
cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); + + if (param->qp_type != IB_QPT_XRC_INI) { + cm_req_set_resp_res(req_msg, param->responder_resources); + cm_req_set_retry_count(req_msg, param->retry_count); + cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); cm_req_set_srq(req_msg, param->srq); + } if (pri_path->hop_limit <= 1) { req_msg->primary_local_lid = pri_path->slid; @@ -1105,7 +1117,8 @@ static int cm_validate_req_param(struct ib_cm_req_param *param) if (!param->primary_path) return -EINVAL; - if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC) + if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && + param->qp_type != IB_QPT_XRC_INI) return -EINVAL; if (param->private_data && @@ -1137,38 +1150,34 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = -EINVAL; - goto out; + return -EINVAL; } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id); + id.local_id, + GFP_ATOMIC); if (IS_ERR(cm_id_priv->timewait_info)) { - ret = PTR_ERR(cm_id_priv->timewait_info); - goto out; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return (PTR_ERR(cm_id_priv->timewait_info)); } ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); - if (ret) - goto error1; - if (param->alternate_path) { + if (!ret && param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av); - if (ret) + } + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); goto error1; } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( param->remote_cm_response_timeout); - if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) { - printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n", - cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout)); - cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout); - } cm_id_priv->max_cm_retries = param->max_cm_retries; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; @@ -1201,9 +1210,11 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; -error2: cm_free_msg(cm_id_priv->msg); -error1: kfree(cm_id_priv->timewait_info); -out: return ret; +error2: + cm_free_msg(cm_id_priv->msg); +error1: + kfree(cm_id_priv->timewait_info); + return ret; } EXPORT_SYMBOL(ib_send_cm_req); @@ -1556,7 +1567,8 @@ static int cm_req_handler(struct cm_work *work) work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id); + id.local_id, + GFP_KERNEL); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); goto destroy; @@ -1579,6 +1591,10 @@ static int cm_req_handler(struct cm_work *work) cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); + + /* Workarround: path in req_msg doesn't contain MAC, take it from wc */ + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, 6); + work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); if (ret) { 
ib_get_cached_gid(work->port->cm_dev->ib_device, @@ -1600,13 +1616,6 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->timeout_ms = cm_convert_to_ms( cm_req_get_local_resp_timeout(req_msg)); - if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) { - printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, " - "decreasing used timeout_ms\n", - cm_req_get_local_resp_timeout(req_msg), max_timeout); - cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout); - } - cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); @@ -1638,18 +1647,24 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); rep_msg->local_comm_id = cm_id_priv->id.local_id; rep_msg->remote_comm_id = cm_id_priv->id.remote_id; - cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); rep_msg->resp_resources = param->responder_resources; - rep_msg->initiator_depth = param->initiator_depth; cm_rep_set_target_ack_delay(rep_msg, cm_id_priv->av.port->cm_dev->ack_delay); cm_rep_set_failover(rep_msg, param->failover_accepted); - cm_rep_set_flow_ctrl(rep_msg, param->flow_control); cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); - cm_rep_set_srq(rep_msg, param->srq); rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { + rep_msg->initiator_depth = param->initiator_depth; + cm_rep_set_flow_ctrl(rep_msg, param->flow_control); + cm_rep_set_srq(rep_msg, param->srq); + cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); + } else { + cm_rep_set_srq(rep_msg, 1); + cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); + } + if (param->private_data && param->private_data_len) memcpy(rep_msg->private_data, param->private_data, param->private_data_len); @@ -1672,6 +1687,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { + pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } @@ -1697,7 +1713,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); - cm_id_priv->local_qpn = cm_rep_get_local_qpn(rep_msg); + cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; @@ -1738,6 +1754,7 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { + pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto error; } @@ -1768,7 +1785,7 @@ error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_rtu); -static void cm_format_rep_event(struct cm_work *work) +static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) { struct cm_rep_msg *rep_msg; struct ib_cm_rep_event_param *param; @@ -1777,7 +1794,7 @@ static void cm_format_rep_event(struct cm_work *work) param = &work->cm_event.param.rep_rcvd; param->remote_ca_guid = rep_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); - param->remote_qpn = 
be32_to_cpu(cm_rep_get_local_qpn(rep_msg)); + param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); param->responder_resources = rep_msg->initiator_depth; param->initiator_depth = rep_msg->resp_resources; @@ -1842,10 +1859,11 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); + pr_debug("no cm_id_priv\n"); return -EINVAL; } - cm_format_rep_event(work); + cm_format_rep_event(work, cm_id_priv->qp_type); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { @@ -1855,12 +1873,13 @@ static int cm_rep_handler(struct cm_work *work) default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto error; } cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; - cm_id_priv->timewait_info->remote_qpn = cm_rep_get_local_qpn(rep_msg); + cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); spin_lock(&cm.lock); /* Check for duplicate REP. */ @@ -1868,6 +1887,7 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("Failed to insert remote id\n"); goto error; } /* Check for a stale connection. */ @@ -1881,13 +1901,14 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; + pr_debug("Stale connection.\n"); goto error; } spin_unlock(&cm.lock); cm_id_priv->id.state = IB_CM_REP_RCVD; cm_id_priv->id.remote_id = rep_msg->local_comm_id; - cm_id_priv->remote_qpn = cm_rep_get_local_qpn(rep_msg); + cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); cm_id_priv->initiator_depth = rep_msg->resp_resources; cm_id_priv->responder_resources = rep_msg->initiator_depth; cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); @@ -2021,10 +2042,15 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id, cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { + pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } + if (cm_id->lap_state == IB_CM_LAP_SENT || + cm_id->lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) { cm_enter_timewait(cm_id_priv); @@ -2086,6 +2112,7 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id, if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); + pr_debug("cm_id->state(%d) != IB_CM_DREQ_RCVD\n", cm_id->state); return -EINVAL; } @@ -2151,6 +2178,7 @@ static int cm_dreq_handler(struct cm_work *work) atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); + pr_debug("no cm_id_priv\n"); return -EINVAL; } @@ -2166,6 +2194,10 @@ static int cm_dreq_handler(struct cm_work *work) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); break; case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + break; case IB_CM_MRA_REP_RCVD: break; case IB_CM_TIMEWAIT: @@ -2187,6 +2219,7 @@ static int cm_dreq_handler(struct cm_work *work) counter[CM_DREQ_COUNTER]); goto unlock; default: + pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2290,6 +2323,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, cm_enter_timewait(cm_id_priv); break; default: + pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; goto out; } @@ -2386,11 +2420,21 @@ static int cm_rej_handler(struct cm_work *work) /* fall through */ case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: - case IB_CM_ESTABLISHED: cm_enter_timewait(cm_id_priv); break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || + cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + break; + } + /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); + pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto out; } @@ -2453,6 +2497,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, break; } default: + pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto error1; } @@ -2518,12 +2563,6 @@ static int cm_mra_handler(struct cm_work *work) cm_mra_get_service_timeout(mra_msg); timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + cm_convert_to_ms(cm_id_priv->av.timeout); - if (timeout > cm_convert_to_ms(max_timeout)) { - printk(KERN_WARNING PFX "calculated mra timeout %d > %d, " - "decreasing used timeout_ms\n", timeout, - cm_convert_to_ms(max_timeout)); - timeout = cm_convert_to_ms(max_timeout); - } spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { @@ -2560,6 +2599,7 @@ static int cm_mra_handler(struct cm_work *work) counter[CM_MRA_COUNTER]); /* fall through */ default: + pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); goto out; } @@ -2746,7 +2786,8 @@ static int cm_lap_handler(struct cm_work *work) cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); - cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av); + if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av)) + goto unlock; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); @@ -2938,6 +2979,9 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av); if (ret) goto out; @@ -2945,12 +2989,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = param->timeout_ms; - if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) { - printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, " - "decreasing used 
timeout_ms\n", param->timeout_ms, - cm_convert_to_ms(max_timeout)); - cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout); - } cm_id_priv->max_cm_retries = param->max_cm_retries; ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) @@ -2961,21 +2999,19 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; - spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_IDLE) ret = ib_post_send_mad(msg, NULL); else ret = -EINVAL; if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); goto out; } cm_id->state = IB_CM_SIDR_REQ_SENT; cm_id_priv->msg = msg; - spin_unlock_irqrestore(&cm_id_priv->lock, flags); out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); @@ -3038,6 +3074,7 @@ static int cm_sidr_req_handler(struct cm_work *work) goto out; /* No match. */ } atomic_inc(&cur_cm_id_priv->refcount); + atomic_inc(&cm_id_priv->refcount); spin_unlock_irq(&cm.lock); cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; @@ -3302,6 +3339,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: + pr_debug("work->cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } @@ -3332,6 +3370,7 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: + pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; break; } @@ -3494,6 +3533,7 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3520,10 +3560,36 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if (!cm_id_priv->av.valid) + return -EINVAL; + if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { + qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_VID; + } + if (!is_zero_ether_addr(cm_id_priv->av.smac)) { + memcpy(qp_attr->smac, cm_id_priv->av.smac, + sizeof(qp_attr->smac)); + *qp_attr_mask |= IB_QP_SMAC; + } + if (cm_id_priv->alt_av.valid) { + if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { + qp_attr->alt_vlan_id = + cm_id_priv->alt_av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_ALT_VID; + } + if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { + memcpy(qp_attr->alt_smac, + cm_id_priv->alt_av.smac, + sizeof(qp_attr->alt_smac)); + *qp_attr_mask |= IB_QP_ALT_SMAC; + } + } + qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); - if (cm_id_priv->qp_type == IB_QPT_RC) { + if (cm_id_priv->qp_type == IB_QPT_RC || + cm_id_priv->qp_type == IB_QPT_XRC_TGT) { *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; qp_attr->max_dest_rd_atomic = @@ -3540,6 +3606,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3568,15 +3635,21 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); - if (cm_id_priv->qp_type == IB_QPT_RC) { - *qp_attr_mask |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | 
- IB_QP_RNR_RETRY | + switch (cm_id_priv->qp_type) { + case IB_QPT_RC: + case IB_QPT_XRC_INI: + *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; - qp_attr->timeout = cm_id_priv->av.timeout; qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; - qp_attr->max_rd_atomic = - cm_id_priv->initiator_depth; + qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; + /* fall through */ + case IB_QPT_XRC_TGT: + *qp_attr_mask |= IB_QP_TIMEOUT; + qp_attr->timeout = cm_id_priv->av.timeout; + break; + default: + break; } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_PATH_MIG_STATE; @@ -3593,6 +3666,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3619,6 +3693,7 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: + pr_debug("qp_attr->qp_state: 0x%x\n", qp_attr->qp_state); ret = -EINVAL; break; } @@ -3649,7 +3724,7 @@ static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, atomic_long_read(&group->counter[cm_attr->index])); } -static struct sysfs_ops cm_counter_ops = { +static const struct sysfs_ops cm_counter_ops = { .show = cm_show_counter }; @@ -3670,8 +3745,17 @@ static struct kobj_type cm_port_obj_type = { .release = cm_release_port_obj }; +static char *cm_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + struct class cm_class = { + .owner = THIS_MODULE, .name = "infiniband_cm", + .devnode = cm_devnode, }; EXPORT_SYMBOL(cm_class); @@ -3745,7 +3829,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, "%s", ib_device->name); - if (!cm_dev->device) { + if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; } @@ -3846,28 +3930,33 @@ static int __init ib_cm_init(void) cm.remote_sidr_table = RB_ROOT; idr_init(&cm.local_id_table); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); - idr_pre_get(&cm.local_id_table, GFP_KERNEL); + if (!idr_pre_get(&cm.local_id_table, GFP_KERNEL)) + return -ENOMEM; INIT_LIST_HEAD(&cm.timewait_list); ret = class_register(&cm_class); - if (ret) - return -ENOMEM; - - cm.wq = create_workqueue("ib_cm"); - if (!cm.wq) { + if (ret) { ret = -ENOMEM; goto error1; } + cm.wq = create_workqueue("ib_cm"); + if (!cm.wq) { + ret = -ENOMEM; + goto error2; + } + ret = ib_register_client(&cm_client); if (ret) - goto error2; + goto error3; return 0; -error2: +error3: destroy_workqueue(cm.wq); -error1: +error2: class_unregister(&cm_class); +error1: + idr_destroy(&cm.local_id_table); return ret; } diff --git a/sys/ofed/drivers/infiniband/core/cm_msgs.h b/sys/ofed/drivers/infiniband/core/cm_msgs.h index 7e63c08f697c..be068f47e47e 100644 --- a/sys/ofed/drivers/infiniband/core/cm_msgs.h +++ b/sys/ofed/drivers/infiniband/core/cm_msgs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. 
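/*
 * With the SMAC/VLAN additions to cm_init_qp_rtr_attr() in the cm.c hunks
 * above, the consumer flow is unchanged: a ULP still asks the CM for the
 * attributes of the next QP state and hands them to ib_modify_qp(); the extra
 * IB_QP_SMAC/IB_QP_VID bits simply ride along in the returned mask.  Minimal
 * sketch (helper name is illustrative):
 */
static int cm_move_qp_to_rtr(struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ib_qp_attr attr;
	int mask = 0;
	int ret;

	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &attr, &mask);
	if (ret)
		return ret;		/* e.g. -EINVAL when the AV is not yet valid */
	return ib_modify_qp(qp, &attr, mask);
}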
* @@ -44,18 +44,6 @@ #define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ -#define CM_REQ_ATTR_ID cpu_to_be16(0x0010) -#define CM_MRA_ATTR_ID cpu_to_be16(0x0011) -#define CM_REJ_ATTR_ID cpu_to_be16(0x0012) -#define CM_REP_ATTR_ID cpu_to_be16(0x0013) -#define CM_RTU_ATTR_ID cpu_to_be16(0x0014) -#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) -#define CM_DREP_ATTR_ID cpu_to_be16(0x0016) -#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) -#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) -#define CM_LAP_ATTR_ID cpu_to_be16(0x0019) -#define CM_APR_ATTR_ID cpu_to_be16(0x001A) - enum cm_msg_sequence { CM_MSG_SEQUENCE_REQ, CM_MSG_SEQUENCE_LAP, @@ -86,7 +74,7 @@ struct cm_req_msg { __be16 pkey; /* path MTU:4, RDC exists:1, RNR retry count:3. */ u8 offset50; - /* max CM Retries:4, SRQ:1, rsvd:3 */ + /* max CM Retries:4, SRQ:1, extended transport type:3 */ u8 offset51; __be16 primary_local_lid; @@ -175,6 +163,11 @@ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) switch(transport_type) { case 0: return IB_QPT_RC; case 1: return IB_QPT_UC; + case 3: + switch (req_msg->offset51 & 0x7) { + case 1: return IB_QPT_XRC_TGT; + default: return 0; + } default: return 0; } } @@ -188,6 +181,12 @@ static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, req_msg->offset40) & 0xFFFFFFF9) | 0x2); break; + case IB_QPT_XRC_INI: + req_msg->offset40 = cpu_to_be32((be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9) | 0x6); + req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1; + break; default: req_msg->offset40 = cpu_to_be32(be32_to_cpu( req_msg->offset40) & @@ -527,6 +526,23 @@ static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn) (be32_to_cpu(rep_msg->offset12) & 0x000000FF)); } +static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8); +} + +static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn) +{ + rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | + (be32_to_cpu(rep_msg->offset16) & 0x000000FF)); +} + +static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) +{ + return (qp_type == IB_QPT_XRC_INI) ? 
+ cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg); +} + static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg) { return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8); @@ -771,6 +787,7 @@ struct cm_apr_msg { u8 info_length; u8 ap_status; + __be16 rsvd; u8 info[IB_CM_APR_INFO_LENGTH]; u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c index 318beb15a3d8..d2064b69a6d8 100644 --- a/sys/ofed/drivers/infiniband/core/cma.c +++ b/sys/ofed/drivers/infiniband/core/cma.c @@ -40,6 +40,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -55,28 +59,47 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); -static int tavor_quirk = 0; -module_param_named(tavor_quirk, tavor_quirk, int, 0644); -MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0"); - -int unify_tcp_port_space = 1; -module_param(unify_tcp_port_space, int, 0644); -MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " - "space allocation (default=1)"); - #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define IBOE_PACKET_LIFETIME 18 +#define CMA_IBOE_PACKET_LIFETIME 18 static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); -MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20"); +MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)"); static int def_prec2sl = 3; module_param_named(def_prec2sl, def_prec2sl, int, 0644); MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7"); +static int debug_level = 0; +#define cma_pr(level, priv, format, arg...) \ + printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) + +#define cma_dbg(priv, format, arg...) \ + do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0) + +#define cma_warn(priv, format, arg...) 
\ + cma_pr(KERN_WARNING, priv, format, ## arg) + +#define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" +#define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] + +#define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw) +#define cma_debug_path(priv, pfx, p) \ + cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \ + CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \ + CMA_GID_ARG(p.dgid)) + +#define cma_debug_gid(priv, g) \ + cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g) + +module_param_named(debug_level, debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "debug level default=0"); + static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -92,13 +115,12 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; +static struct workqueue_struct *cma_free_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); -#if defined(INET) -static int next_port; -#endif +static DEFINE_IDR(ib_ps); struct cma_device { struct list_head list; @@ -108,26 +130,16 @@ struct cma_device { struct list_head id_list; }; -enum cma_state { - CMA_IDLE, - CMA_ADDR_QUERY, - CMA_ADDR_RESOLVED, - CMA_ROUTE_QUERY, - CMA_ROUTE_RESOLVED, - CMA_CONNECT, - CMA_DISCONNECT, - CMA_ADDR_BOUND, - CMA_LISTEN, - CMA_DEVICE_REMOVAL, - CMA_DESTROYING -}; - struct rdma_bind_list { struct idr *ps; struct hlist_head owners; unsigned short port; }; +enum { + CMA_OPTION_AFONLY, +}; + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -138,7 +150,7 @@ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; - struct socket *sock; + struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ @@ -146,13 +158,15 @@ struct rdma_id_private { struct list_head mc_list; int internal_id; - enum cma_state state; + enum rdma_cm_state state; spinlock_t lock; + spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; + struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; @@ -166,8 +180,16 @@ struct rdma_id_private { u32 seq_num; u32 qkey; u32 qp_num; + pid_t owner; + u32 options; u8 srq; u8 tos; + u8 reuseaddr; + u8 afonly; + int qp_timeout; + /* cache for mc record params */ + struct ib_sa_mcmember_rec rec; + int is_valid_rec; }; struct cma_multicast { @@ -184,8 +206,8 @@ struct cma_multicast { struct cma_work { struct work_struct work; struct rdma_id_private *id; - enum cma_state old_state; - enum cma_state new_state; + enum rdma_cm_state old_state; + enum rdma_cm_state new_state; struct rdma_cm_event event; }; @@ -236,7 +258,7 @@ struct sdp_hah { #define CMA_VERSION 0x00 #define SDP_MAJ_VERSION 0x2 -static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp) +static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; int ret; @@ -248,7 +270,7 @@ static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp) } static int cma_comp_exch(struct rdma_id_private *id_priv, - enum cma_state comp, enum cma_state exch) + enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; @@ -260,11 +282,11 @@ static int cma_comp_exch(struct rdma_id_private *id_priv, 
return ret; } -static enum cma_state cma_exch(struct rdma_id_private *id_priv, - enum cma_state exch) +static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, + enum rdma_cm_state exch) { unsigned long flags; - enum cma_state old; + enum rdma_cm_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; @@ -298,11 +320,6 @@ static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); } -static inline int cma_is_ud_ps(enum rdma_port_space ps) -{ - return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB); -} - static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -328,11 +345,13 @@ static inline void release_mc(struct kref *kref) kfree(mc); } -static void cma_detach_from_dev(struct rdma_id_private *id_priv) +static void cma_release_dev(struct rdma_id_private *id_priv) { + mutex_lock(&lock); list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; + mutex_unlock(&lock); } static int cma_set_qkey(struct rdma_id_private *id_priv) @@ -361,36 +380,71 @@ static int cma_set_qkey(struct rdma_id_private *id_priv) return ret; } +static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) +{ + int i; + int err; + struct ib_port_attr props; + union ib_gid tmp; + + err = ib_query_port(device, port_num, &props); + if (err) + return 1; + + for (i = 0; i < props.gid_tbl_len; ++i) { + err = ib_query_gid(device, port_num, i, &tmp); + if (err) + return 1; + if (!memcmp(&tmp, gid, sizeof tmp)) + return 0; + } + + return -EAGAIN; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, iboe_gid; int ret = -ENODEV; + u8 port; + enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? 
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; - if (dev_addr->dev_type != ARPHRD_INFINIBAND) { - iboe_addr_get_sgid(dev_addr, &gid); - list_for_each_entry(cma_dev, &dev_list, list) { - ret = ib_find_cached_gid(cma_dev->device, &gid, - &id_priv->id.port_num, NULL); - if (!ret) - goto out; - } - } + if (dev_ll != IB_LINK_LAYER_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + mutex_lock(&lock); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { - ret = ib_find_cached_gid(cma_dev->device, &gid, - &id_priv->id.port_num, NULL); - if (!ret) + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = find_gid_port(cma_dev->device, &iboe_gid, port); + else + ret = find_gid_port(cma_dev->device, &gid, port); + + if (!ret) { + id_priv->id.port_num = port; + goto out; + } else if (ret == 1) break; } + } + } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); + mutex_unlock(&lock); return ret; } @@ -401,7 +455,7 @@ static void cma_deref_id(struct rdma_id_private *id_priv) } static int cma_disable_callback(struct rdma_id_private *id_priv, - enum cma_state state) + enum rdma_cm_state state) { mutex_lock(&id_priv->handler_mutex); if (id_priv->state != state) { @@ -411,13 +465,9 @@ static int cma_disable_callback(struct rdma_id_private *id_priv, return 0; } -static int cma_has_cm_dev(struct rdma_id_private *id_priv) -{ - return (id_priv->id.device && id_priv->cm_id.ib); -} - struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps) + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type) { struct rdma_id_private *id_priv; @@ -425,11 +475,14 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, if (!id_priv) return ERR_PTR(-ENOMEM); - id_priv->state = CMA_IDLE; + id_priv->owner = curthread->td_proc->p_pid; + id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); + spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); @@ -496,7 +549,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, if (IS_ERR(qp)) return PTR_ERR(qp); - if (cma_is_ud_ps(id_priv->id.ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); @@ -530,6 +583,7 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; + union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { @@ -551,6 +605,20 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, + qp_attr.ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) + == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) + == IB_LINK_LAYER_ETHERNET) { + ret = 
rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL); + + if (ret) + goto out; + } if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; @@ -579,6 +647,12 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv, if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; + + if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) { + qp_attr.timeout = id_priv->qp_timeout; + qp_attr_mask |= IB_QP_TIMEOUT; + } + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); @@ -624,7 +698,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; - if (cma_is_ud_ps(id_priv->id.ps)) { + if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_qkey(id_priv); if (ret) return ret; @@ -647,7 +721,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: - if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps)) + if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, @@ -656,6 +730,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, qp_attr->rq_psn = id_priv->seq_num; break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; @@ -701,6 +776,21 @@ static inline int cma_any_addr(struct sockaddr *addr) return cma_zero_addr(addr) || cma_loopback_addr(addr); } +static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +{ + if (src->sa_family != dst->sa_family) + return -1; + + switch (src->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) src)->sin_addr.s_addr != + ((struct sockaddr_in *) dst)->sin_addr.s_addr; + default: + return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, + &((struct sockaddr_in6 *) dst)->sin6_addr); + } +} + static inline __be16 cma_port(struct sockaddr *addr) { if (addr->sa_family == AF_INET) @@ -831,16 +921,16 @@ static void cma_cancel_listens(struct rdma_id_private *id_priv) } static void cma_cancel_operation(struct rdma_id_private *id_priv, - enum cma_state state) + enum rdma_cm_state state) { switch (state) { - case CMA_ADDR_QUERY: + case RDMA_CM_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; - case CMA_ROUTE_QUERY: + case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; - case CMA_LISTEN: + case RDMA_CM_LISTEN: if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) && !id_priv->cma_dev) cma_cancel_listens(id_priv); @@ -852,20 +942,21 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, static void cma_release_port(struct rdma_id_private *id_priv) { - struct rdma_bind_list *bind_list = id_priv->bind_list; - - if (!bind_list) - return; + struct rdma_bind_list *bind_list; mutex_lock(&lock); + bind_list = id_priv->bind_list; + if (!bind_list) { + mutex_unlock(&lock); + return; + } hlist_del(&id_priv->node); + id_priv->bind_list = NULL; if (hlist_empty(&bind_list->owners)) { idr_remove(bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); - if (id_priv->sock) - sock_release(id_priv->sock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) 
@@ -889,39 +980,11 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) } } } - -void rdma_destroy_id(struct rdma_cm_id *id) +static void __rdma_free(struct work_struct *work) { struct rdma_id_private *id_priv; - enum cma_state state; + id_priv = container_of(work, struct rdma_id_private, work); - id_priv = container_of(id, struct rdma_id_private, id); - state = cma_exch(id_priv, CMA_DESTROYING); - cma_cancel_operation(id_priv, state); - - mutex_lock(&lock); - if (id_priv->cma_dev) { - mutex_unlock(&lock); - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: - if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) - ib_destroy_cm_id(id_priv->cm_id.ib); - break; - case RDMA_TRANSPORT_IWARP: - if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) - iw_destroy_cm_id(id_priv->cm_id.iw); - break; - default: - break; - } - cma_leave_mc_groups(id_priv); - mutex_lock(&lock); - cma_detach_from_dev(id_priv); - } - mutex_unlock(&lock); - - cma_release_port(id_priv); - cma_deref_id(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) @@ -930,6 +993,54 @@ void rdma_destroy_id(struct rdma_cm_id *id) kfree(id_priv->id.route.path_rec); kfree(id_priv); } + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + enum rdma_cm_state state; + unsigned long flags; + struct ib_cm_id *ib; + + id_priv = container_of(id, struct rdma_id_private, id); + state = cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_cancel_operation(id_priv, state); + + /* + * Wait for any active callback to finish. New callbacks will find + * the id_priv state set to destroying and abort. + */ + mutex_lock(&id_priv->handler_mutex); + mutex_unlock(&id_priv->handler_mutex); + + if (id_priv->cma_dev) { + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { + case RDMA_TRANSPORT_IB: + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) { + ib = id_priv->cm_id.ib; + id_priv->cm_id.ib = NULL; + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + ib_destroy_cm_id(ib); + } else + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + break; + case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: + if (id_priv->cm_id.iw) + iw_destroy_cm_id(id_priv->cm_id.iw); + break; + default: + break; + } + cma_leave_mc_groups(id_priv); + cma_release_dev(id_priv); + } + + cma_release_port(id_priv); + cma_deref_id(id_priv); + INIT_WORK(&id_priv->work, __rdma_free); + queue_work(cma_free_wq, &id_priv->work); +} EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) @@ -944,6 +1055,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) if (ret) goto reject; + cma_dbg(id_priv, "sending RTU\n"); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; @@ -951,6 +1063,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) return 0; reject: cma_modify_qp_err(id_priv); + cma_dbg(id_priv, "sending REJ\n"); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; @@ -987,11 +1100,10 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int ret = 0; if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, CMA_CONNECT)) || + cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, CMA_DISCONNECT))) + cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) return 0; - memset(&event, 0, sizeof event); switch 
(ib_event->event) { case IB_CM_REQ_ERROR: @@ -1020,7 +1132,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: - if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT)) + if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; @@ -1047,7 +1160,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; @@ -1070,12 +1183,12 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) - goto err; + return NULL; id = rdma_create_id(listen_id->event_handler, listen_id->context, - listen_id->ps); + listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) - goto err; + return NULL; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); @@ -1085,7 +1198,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); if (!rt->path_rec) - goto destroy_id; + goto err; rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; if (rt->num_paths == 2) @@ -1094,22 +1207,21 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); - ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else { ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, - &rt->addr.dev_addr); + &rt->addr.dev_addr, NULL); if (ret) - goto destroy_id; + goto err; } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); - id_priv->state = CMA_CONNECT; + id_priv->state = RDMA_CM_CONNECT; return id_priv; -destroy_id: - rdma_destroy_id(id); err: + rdma_destroy_id(id); return NULL; } @@ -1124,7 +1236,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, - listen_id->ps); + listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; @@ -1138,13 +1250,13 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr); + &id->route.addr.dev_addr, NULL); if (ret) goto err; } id_priv = container_of(id, struct rdma_id_private, id); - id_priv->state = CMA_CONNECT; + id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); @@ -1166,20 +1278,43 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.qp_num = req_data->remote_qpn; } +static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) +{ + return (((ib_event->event == IB_CM_REQ_RECEIVED) && + (ib_event->param.req_rcvd.qp_type == id->qp_type)) || + ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && 
+ (id->qp_type == IB_QPT_UD)) || + (!id->qp_type)); +} + static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int offset, ret; + u8 smac[ETH_ALEN]; + u8 alt_smac[ETH_ALEN]; + u8 *psmac = smac; + u8 *palt_smac = alt_smac; + int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) == + RDMA_TRANSPORT_IB) && + (rdma_port_get_link_layer(cm_id->device, + ib_event->param.req_rcvd.port) == + IB_LINK_LAYER_ETHERNET)); + int is_sidr = 0; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, CMA_LISTEN)) + if (!cma_check_req_qp_type(&listen_id->id, ib_event)) + return -EINVAL; + + if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; - if (cma_is_ud_ps(listen_id->id.ps)) { + if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { + is_sidr = 1; conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = @@ -1191,45 +1326,69 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) } if (!conn_id) { ret = -ENOMEM; - goto out; + goto err1; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - mutex_lock(&lock); ret = cma_acquire_dev(conn_id); - mutex_unlock(&lock); if (ret) - goto release_conn_id; + goto err2; conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; + /* + * Protect against the user destroying conn_id from another thread + * until we're done accessing it. + */ + atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); - if (!ret) { + if (ret) + goto err3; + + if (is_iboe && !is_sidr) { + if (ib_event->param.req_rcvd.primary_path != NULL) + rdma_addr_find_smac_by_sgid( + &ib_event->param.req_rcvd.primary_path->sgid, + psmac, NULL); + else + psmac = NULL; + if (ib_event->param.req_rcvd.alternate_path != NULL) + rdma_addr_find_smac_by_sgid( + &ib_event->param.req_rcvd.alternate_path->sgid, + palt_smac, NULL); + else + palt_smac = NULL; + } /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); - if (cma_comp(conn_id, CMA_CONNECT) && - !cma_is_ud_ps(conn_id->id.ps)) + if (is_iboe && !is_sidr) + ib_update_cm_av(cm_id, psmac, palt_smac); + if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) { + cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + } mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); - goto out; - } + mutex_unlock(&listen_id->handler_mutex); + cma_deref_id(conn_id); + return 0; +err3: + cma_deref_id(conn_id); /* Destroy the CM ID by returning a non-zero value. 
*/ conn_id->cm_id.ib = NULL; - -release_conn_id: - cma_exch(conn_id, CMA_DESTROYING); +err2: + cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(&conn_id->id); - -out: +err1: mutex_unlock(&listen_id->handler_mutex); + if (conn_id) + rdma_destroy_id(&conn_id->id); return ret; } @@ -1244,9 +1403,7 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, struct cma_hdr *cma_data, *cma_mask; struct sdp_hh *sdp_data, *sdp_mask; __be32 ip4_addr; -#ifdef INET6 struct in6_addr ip6_addr; -#endif memset(compare, 0, sizeof *compare); cma_data = (void *) compare->data; @@ -1260,33 +1417,39 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 4); sdp_set_ip_ver(sdp_mask, 0xF); - sdp_data->dst_addr.ip4.addr = ip4_addr; - sdp_mask->dst_addr.ip4.addr = htonl(~0); + if (!cma_any_addr(addr)) { + sdp_data->dst_addr.ip4.addr = ip4_addr; + sdp_mask->dst_addr.ip4.addr = htonl(~0); + } } else { cma_set_ip_ver(cma_data, 4); cma_set_ip_ver(cma_mask, 0xF); - cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = htonl(~0); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip4.addr = ip4_addr; + cma_mask->dst_addr.ip4.addr = htonl(~0); + } } break; -#ifdef INET6 case AF_INET6: ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 6); sdp_set_ip_ver(sdp_mask, 0xF); - sdp_data->dst_addr.ip6 = ip6_addr; - memset(&sdp_mask->dst_addr.ip6, 0xFF, - sizeof sdp_mask->dst_addr.ip6); + if (!cma_any_addr(addr)) { + sdp_data->dst_addr.ip6 = ip6_addr; + memset(&sdp_mask->dst_addr.ip6, 0xFF, + sizeof(sdp_mask->dst_addr.ip6)); + } } else { cma_set_ip_ver(cma_data, 6); cma_set_ip_ver(cma_mask, 0xF); - cma_data->dst_addr.ip6 = ip6_addr; - memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof cma_mask->dst_addr.ip6); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip6 = ip6_addr; + memset(&cma_mask->dst_addr.ip6, 0xFF, + sizeof(cma_mask->dst_addr.ip6)); + } } break; -#endif default: break; } @@ -1299,7 +1462,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr_in *sin; int ret = 0; - if (cma_disable_callback(id_priv, CMA_CONNECT)) + if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; memset(&event, 0, sizeof event); @@ -1315,6 +1478,8 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) switch ((int)iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: @@ -1330,6 +1495,8 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; break; default: BUG_ON(1); @@ -1342,7 +1509,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.iw = NULL; - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; @@ -1364,22 +1531,22 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct ib_device_attr attr; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, CMA_LISTEN)) + if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - conn_id->state = CMA_CONNECT; + conn_id->state = RDMA_CM_CONNECT; - dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr); + dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; mutex_unlock(&conn_id->handler_mutex); @@ -1393,9 +1560,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } - mutex_lock(&lock); ret = cma_acquire_dev(conn_id); - mutex_unlock(&lock); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1422,19 +1587,27 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; - event.param.conn.initiator_depth = attr.max_qp_init_rd_atom; - event.param.conn.responder_resources = attr.max_qp_rd_atom; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + + /* + * Protect against the user destroying conn_id from another thread + * until we're done accessing it. 
+ */ + atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; - cma_exch(conn_id, CMA_DESTROYING); + cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); + cma_deref_id(conn_id); rdma_destroy_id(&conn_id->id); goto out; } mutex_unlock(&conn_id->handler_mutex); + cma_deref_id(conn_id); out: if (dev) @@ -1447,17 +1620,19 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) { struct ib_cm_compare_data compare_data; struct sockaddr *addr; + struct ib_cm_id *id; __be64 svc_id; int ret; - id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler, - id_priv); - if (IS_ERR(id_priv->cm_id.ib)) - return PTR_ERR(id_priv->cm_id.ib); + id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); + + id_priv->cm_id.ib = id; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); - if (cma_any_addr(addr)) + if (cma_any_addr(addr) && !id_priv->afonly) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); else { cma_set_compare_data(id_priv->id.ps, addr, &compare_data); @@ -1476,13 +1651,16 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct sockaddr_in *sin; + struct iw_cm_id *id; - id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, + id = iw_create_cm_id(id_priv->id.device, id_priv->sock, iw_conn_req_handler, id_priv); - if (IS_ERR(id_priv->cm_id.iw)) - return PTR_ERR(id_priv->cm_id.iw); + if (IS_ERR(id)) + return PTR_ERR(id); + + id_priv->cm_id.iw = id; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; id_priv->cm_id.iw->local_addr = *sin; @@ -1514,13 +1692,14 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct rdma_cm_id *id; int ret; - id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps); + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, + id_priv->id.qp_type); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); - dev_id_priv->state = CMA_ADDR_BOUND; + dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); @@ -1528,11 +1707,11 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; + dev_id_priv->afonly = id_priv->afonly; ret = rdma_listen(id, id_priv->backlog); if (ret) - printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " - "listening on device %s\n", ret, cma_dev->device->name); + cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -1546,50 +1725,6 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -int rdma_listen(struct rdma_cm_id *id, int backlog) -{ - struct rdma_id_private *id_priv; - int ret; - - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == CMA_IDLE) { - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); - if (ret) - return ret; - } - - if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) - return -EINVAL; - - id_priv->backlog = backlog; - if 
(id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - ret = cma_ib_listen(id_priv); - if (ret) - goto err; - break; - case RDMA_TRANSPORT_IWARP: - ret = cma_iw_listen(id_priv, backlog); - if (ret) - goto err; - break; - default: - ret = -ENOSYS; - goto err; - } - } else - cma_listen_on_all(id_priv); - - return 0; -err: - id_priv->backlog = 0; - cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); - return ret; -} -EXPORT_SYMBOL(rdma_listen); - void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; @@ -1599,6 +1734,15 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +void rdma_set_timeout(struct rdma_cm_id *id, int timeout) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->qp_timeout = (u8) timeout; +} +EXPORT_SYMBOL(rdma_set_timeout); + static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { @@ -1611,8 +1755,8 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, route->num_paths = 1; *route->path_rec = *path_rec; } else { - work->old_state = CMA_ROUTE_QUERY; - work->new_state = CMA_ADDR_RESOLVED; + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; } @@ -1650,11 +1794,6 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; } - if (tavor_quirk) { - path_rec.mtu_selector = IB_SA_LT; - path_rec.mtu = IB_MTU_2048; - } - id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, @@ -1675,7 +1814,7 @@ static void cma_work_handler(struct work_struct *_work) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: @@ -1693,12 +1832,12 @@ static void cma_ndev_work_handler(struct work_struct *_work) int destroy = 0; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == CMA_DESTROYING || - id_priv->state == CMA_DEVICE_REMOVAL) + if (id_priv->state == RDMA_CM_DESTROYING || + id_priv->state == RDMA_CM_DEVICE_REMOVAL) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } @@ -1722,8 +1861,8 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); - work->old_state = CMA_ROUTE_QUERY; - work->new_state = CMA_ROUTE_RESOLVED; + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); @@ -1752,19 +1891,21 @@ int rdma_set_ib_paths(struct rdma_cm_id *id, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED)) + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; - id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL); + id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, + GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } - memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths); + 
id->route.num_paths = num_paths; return 0; err: - cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); @@ -1779,8 +1920,8 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); - work->old_state = CMA_ROUTE_QUERY; - work->new_state = CMA_ROUTE_RESOLVED; + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; @@ -1800,7 +1941,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; - u16 vid; + if (src_addr->sin_family != dst_addr->sin_family) return -EINVAL; @@ -1827,10 +1968,15 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - vid = rdma_vlan_dev_vlan_id(ndev); + route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); + memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); + memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen); - iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid); - iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid); + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &route->path_rec->sgid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, + &route->path_rec->dgid); route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; @@ -1838,23 +1984,19 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = tos_to_sl(id_priv->tos); -#ifdef __linux__ - route->path_rec->mtu = iboe_get_mtu(ndev->mtu); -#else route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); -#endif route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; - route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME; + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } - work->old_state = CMA_ROUTE_QUERY; - work->new_state = CMA_ROUTE_RESOLVED; + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; @@ -1876,7 +2018,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY)) + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); @@ -1894,6 +2036,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) } break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: ret = cma_resolve_iw_route(id_priv, timeout_ms); break; default: @@ -1905,12 +2048,19 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) return 0; err: - cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); +int rdma_enable_apm(struct rdma_cm_id *id, 
enum alt_path_type alt_type) +{ + /* APM is not supported yet */ + return -EINVAL; +} +EXPORT_SYMBOL(rdma_enable_apm); + static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; @@ -1964,34 +2114,26 @@ static void addr_handler(int status, struct sockaddr *src_addr, memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); - - /* - * Grab mutex to block rdma_destroy_id() from removing the device while - * we're trying to acquire it. - */ - mutex_lock(&lock); - if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) { - mutex_unlock(&lock); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED)) goto out; - } + memcpy(&id_priv->id.route.addr.src_addr, src_addr, + ip_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv); - mutex_unlock(&lock); if (status) { - if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND)) + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; - } else { - memcpy(&id_priv->id.route.addr.src_addr, src_addr, - ip_addr_size(src_addr)); + } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - } if (id_priv->id.event_handler(&id_priv->id, &event)) { - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); @@ -2026,18 +2168,18 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) if (cma_zero_addr(src)) { dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; if ((src->sa_family = dst->sa_family) == AF_INET) { - ((struct sockaddr_in *) src)->sin_addr.s_addr = - ((struct sockaddr_in *) dst)->sin_addr.s_addr; + ((struct sockaddr_in *)src)->sin_addr = + ((struct sockaddr_in *)dst)->sin_addr; } else { - ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr, - &((struct sockaddr_in6 *) dst)->sin6_addr); + ((struct sockaddr_in6 *)src)->sin6_addr = + ((struct sockaddr_in6 *)dst)->sin6_addr; } } work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); - work->old_state = CMA_ADDR_QUERY; - work->new_state = CMA_ADDR_RESOLVED; + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; @@ -2046,6 +2188,25 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) return ret; } +static int cma_resolve_scif(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + /* we probably can leave it empty here */ + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + queue_work(cma_wq, &work->work); + return 0; +} + static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { @@ -2061,11 +2222,12 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, else { struct sockaddr_in addr_in; - memset(&addr_in, 0, sizeof addr_in); - addr_in.sin_family = dst_addr->sa_family; - addr_in.sin_len = sizeof addr_in; - return rdma_bind_addr(id, (struct sockaddr *) &addr_in); + memset(&addr_in, 0, sizeof addr_in); + addr_in.sin_family = dst_addr->sa_family; + addr_in.sin_len = sizeof addr_in; + return rdma_bind_addr(id, (struct sockaddr *) &addr_in); 
} + } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2075,19 +2237,22 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == CMA_IDLE) { + if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } - if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY)) + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) ret = cma_resolve_loopback(id_priv); + else if (id_priv->id.device && + rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF) + ret = cma_resolve_scif(id_priv); else ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, @@ -2097,12 +2262,51 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return 0; err: - cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND); + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == RDMA_CM_IDLE) { + id_priv->reuseaddr = reuse; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_reuseaddr); + +int rdma_set_afonly(struct rdma_cm_id *id, int afonly) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { + id_priv->options |= (1 << CMA_OPTION_AFONLY); + id_priv->afonly = afonly; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_afonly); + static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { @@ -2149,126 +2353,100 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) { -#if defined(INET) - struct rdma_bind_list *bind_list; - int port, ret, low, high; - - bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); - if (!bind_list) - return -ENOMEM; - -retry: - /* FIXME: add proper port randomization per like inet_csk_get_port */ - do { - ret = idr_get_new_above(ps, bind_list, next_port, &port); - } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); - - if (ret) - goto err1; + static unsigned int last_used_port; + int low, high, remaining; + unsigned int rover; inet_get_local_port_range(&low, &high); - if (port > high) { - if (next_port != low) { - idr_remove(ps, port); - next_port = low; - goto retry; + remaining = (high - low) + 1; + rover = random() % remaining + low; +retry: + if (last_used_port != rover && + !idr_find(ps, (unsigned short) rover)) { + int ret = cma_alloc_port(ps, id_priv, rover); + /* + * Remember previously used port number in order to avoid + * re-using same port immediately after it is closed. 
+ */ + if (!ret) + last_used_port = rover; + if (ret != -EADDRNOTAVAIL) + return ret; } - ret = -EADDRNOTAVAIL; - goto err2; + if (--remaining) { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + goto retry; } + return -EADDRNOTAVAIL; +} - if (port == high) - next_port = low; - else - next_port = port + 1; +/* + * Check that the requested port is available. This is called when trying to + * bind to a specific port, or when trying to listen on a bound port. In + * the latter case, the provided id_priv may already be on the bind_list, but + * we still need to check that it's okay to start listening. + */ +static int cma_check_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv, uint8_t reuseaddr) +{ + struct rdma_id_private *cur_id; + struct sockaddr *addr, *cur_addr; + struct hlist_node *node; - bind_list->ps = ps; - bind_list->port = (unsigned short) port; - cma_bind_port(bind_list, id_priv); + addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { + if (id_priv == cur_id) + continue; + + if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && + cur_id->reuseaddr) + continue; + + cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + if (id_priv->afonly && cur_id->afonly && + (addr->sa_family != cur_addr->sa_family)) + continue; + + if (cma_any_addr(addr) || cma_any_addr(cur_addr)) + return -EADDRNOTAVAIL; + + if (!cma_addr_cmp(addr, cur_addr)) + return -EADDRINUSE; + } return 0; -err2: - idr_remove(ps, port); -err1: - kfree(bind_list); - return ret; -#else - return -ENOSPC; -#endif } static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) { - struct rdma_id_private *cur_id; - struct sockaddr_in *sin, *cur_sin; struct rdma_bind_list *bind_list; - struct hlist_node *node; unsigned short snum; + int ret; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - snum = ntohs(sin->sin_port); -#ifdef __linux__ - if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) - return -EACCES; -#endif + snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); bind_list = idr_find(ps, snum); - if (!bind_list) - return cma_alloc_port(ps, id_priv, snum); - - /* - * We don't support binding to any address if anyone is bound to - * a specific address on the same port. 
- */ - if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) - return -EADDRNOTAVAIL; - - hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { - if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr)) - return -EADDRNOTAVAIL; - - cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr; - if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr) - return -EADDRINUSE; + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, snum); + } else { + ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); + if (!ret) + cma_bind_port(bind_list, id_priv); } - - cma_bind_port(bind_list, id_priv); - return 0; + return ret; } -static int cma_get_tcp_port(struct rdma_id_private *id_priv) +static int cma_bind_listen(struct rdma_id_private *id_priv) { - int ret; - int size; - struct socket *sock; + struct rdma_bind_list *bind_list = id_priv->bind_list; + int ret = 0; - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret) - return ret; -#ifdef __linux__ - ret = sock->ops->bind(sock, - (struct sockaddr *) &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); -#else - ret = -sobind(sock, - (struct sockaddr *)&id_priv->id.route.addr.src_addr, - curthread); -#endif - if (ret) { - sock_release(sock); - return ret; - } - - size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); - ret = sock_getname(sock, - (struct sockaddr *) &id_priv->id.route.addr.src_addr, - &size, 0); - if (ret) { - sock_release(sock); - return ret; - } - - id_priv->sock = sock; - return 0; + mutex_lock(&lock); + if (bind_list->owners.first->next) + ret = cma_check_port(bind_list, id_priv, 0); + mutex_unlock(&lock); + return ret; } static int cma_get_port(struct rdma_id_private *id_priv) @@ -2282,11 +2460,6 @@ static int cma_get_port(struct rdma_id_private *id_priv) break; case RDMA_PS_TCP: ps = &tcp_ps; - if (unify_tcp_port_space) { - ret = cma_get_tcp_port(id_priv); - if (ret) - goto out; - } break; case RDMA_PS_UDP: ps = &udp_ps; @@ -2294,6 +2467,9 @@ static int cma_get_port(struct rdma_id_private *id_priv) case RDMA_PS_IPOIB: ps = &ipoib_ps; break; + case RDMA_PS_IB: + ps = &ib_ps; + break; default: return -EPROTONOSUPPORT; } @@ -2304,7 +2480,7 @@ static int cma_get_port(struct rdma_id_private *id_priv) else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); -out: + return ret; } @@ -2318,11 +2494,7 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, return 0; sin6 = (struct sockaddr_in6 *) addr; -#ifdef __linux__ - if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && -#else if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && -#endif !sin6->sin6_scope_id) return -EINVAL; @@ -2331,48 +2503,105 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, return 0; } +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == RDMA_CM_IDLE) { + ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; + ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) + return -EINVAL; + + if (id_priv->reuseaddr) { + ret = cma_bind_listen(id_priv); + if (ret) + goto err; + } + + id_priv->backlog = backlog; + if (id->device) { + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = 
cma_ib_listen(id_priv); + if (ret) + goto err; + break; + case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; + break; + default: + ret = -ENOSYS; + goto err; + } + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; + int ipv6only; + size_t var_size = sizeof(int); if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) + if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); if (ret) goto err1; + memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); + ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL); if (ret) goto err1; - mutex_lock(&lock); ret = cma_acquire_dev(id_priv); - mutex_unlock(&lock); if (ret) goto err1; } - memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { + if (addr->sa_family == AF_INET) + id_priv->afonly = 1; +#if defined(INET6) + else if (addr->sa_family == AF_INET6) + id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only", + &ipv6only, &var_size, NULL, 0, NULL, 0); +#endif + } ret = cma_get_port(id_priv); if (ret) goto err2; return 0; err2: - if (id_priv->cma_dev) { - mutex_lock(&lock); - cma_detach_from_dev(id_priv); - mutex_unlock(&lock); - } + if (id_priv->cma_dev) + cma_release_dev(id_priv); err1: - cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE); + cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); @@ -2445,7 +2674,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - if (cma_disable_callback(id_priv, CMA_CONNECT)) + if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; memset(&event, 0, sizeof event); @@ -2491,7 +2720,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; @@ -2506,10 +2735,14 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, { struct ib_cm_sidr_req_param req; struct rdma_route *route; + struct ib_cm_id *id; int ret; req.private_data_len = sizeof(struct cma_hdr) + conn_param->private_data_len; + if (req.private_data_len < conn_param->private_data_len) + return -EINVAL; + req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!req.private_data) return -ENOMEM; @@ -2523,12 +2756,13 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, if (ret) goto out; - id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, - cma_sidr_rep_handler, id_priv); - if (IS_ERR(id_priv->cm_id.ib)) { - ret = PTR_ERR(id_priv->cm_id.ib); + id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, + id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); goto out; } + id_priv->cm_id.ib = id; req.path = route->path_rec; req.service_id = cma_get_service_id(id_priv->id.ps, @@ -2536,6 +2770,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; + cma_dbg(id_priv, "sending SIDR\n"); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); @@ -2552,11 +2787,15 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, struct ib_cm_req_param req; struct rdma_route *route; void *private_data; + struct ib_cm_id *id; int offset, ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv->id.ps); req.private_data_len = offset + conn_param->private_data_len; + if (req.private_data_len < conn_param->private_data_len) + return -EINVAL; + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; @@ -2565,12 +2804,12 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); - id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler, - id_priv); - if (IS_ERR(id_priv->cm_id.ib)) { - ret = PTR_ERR(id_priv->cm_id.ib); + id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); goto out; } + id_priv->cm_id.ib = id; route = &id_priv->id.route; ret = cma_format_hdr(private_data, id_priv->id.ps, route); @@ -2585,22 +2824,23 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.qp_num = id_priv->qp_num; - req.qp_type = IB_QPT_RC; + req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; - req.retry_count = conn_param->retry_count; - req.rnr_retry_count = conn_param->rnr_retry_count; + req.retry_count = min_t(u8, 7, conn_param->retry_count); + req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = cma_response_timeout; req.local_cm_response_timeout = cma_response_timeout; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 
1 : 0; + cma_dbg(id_priv, "sending REQ\n"); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: - if (ret && !IS_ERR(id_priv->cm_id.ib)) { - ib_destroy_cm_id(id_priv->cm_id.ib); + if (ret && !IS_ERR(id)) { + ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } @@ -2617,11 +2857,9 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, - cma_iw_handler, id_priv); - if (IS_ERR(cm_id)) { - ret = PTR_ERR(cm_id); - goto out; - } + cma_iw_handler, id_priv); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); id_priv->cm_id.iw = cm_id; @@ -2635,17 +2873,19 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, if (ret) goto out; + if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; - if (id_priv->id.qp) + iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; + } else { + memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; - else - iw_param.qpn = conn_param->qp_num; + } ret = iw_cm_connect(cm_id, &iw_param); out: - if (ret && !IS_ERR(cm_id)) { + if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } @@ -2658,7 +2898,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT)) + if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { @@ -2668,12 +2908,13 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (cma_is_ud_ps(id->ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: ret = cma_connect_iw(id_priv, conn_param); break; default: @@ -2685,7 +2926,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) return 0; err: - cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED); + cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); @@ -2713,9 +2954,9 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; - rep.rnr_retry_count = conn_param->rnr_retry_count; + rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; - + cma_dbg(id_priv, "sending REP\n"); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; @@ -2727,6 +2968,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, struct iw_cm_conn_param iw_param; int ret; + if (!conn_param) + return -EINVAL; + ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; @@ -2762,6 +3006,7 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, rep.private_data = private_data; rep.private_data_len = private_data_len; + cma_dbg(id_priv, "sending SIDR\n"); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } @@ -2771,7 +3016,9 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_CONNECT)) + + id_priv->owner = curthread->td_proc->p_pid; + if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { @@ -2781,16 +3028,23 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (cma_is_ud_ps(id->ps)) + if (id->qp_type == IB_QPT_UD) { + if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->private_data, conn_param->private_data_len); - else if (conn_param) + else + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + NULL, 0); + } else { + if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); + } break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: ret = cma_accept_iw(id_priv, conn_param); break; default: @@ -2815,7 +3069,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_has_cm_dev(id_priv)) + if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { @@ -2837,20 +3091,23 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_has_cm_dev(id_priv)) + if (!id_priv->cm_id.ib) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (cma_is_ud_ps(id->ps)) + if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, private_data, private_data_len); - else + else { + cma_dbg(id_priv, "sending REJ\n"); ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); + } break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); break; @@ -2868,7 +3125,7 @@ int rdma_disconnect(struct rdma_cm_id *id) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_has_cm_dev(id_priv)) + if (!id_priv->cm_id.ib) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { @@ -2877,10 +3134,14 @@ int rdma_disconnect(struct rdma_cm_id *id) if (ret) goto out; /* Initiate or respond to a disconnect. 
*/ - if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) + cma_dbg(id_priv, "sending DREQ\n"); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { + cma_dbg(id_priv, "sending DREP\n"); ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + } break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); break; default: @@ -2897,35 +3158,55 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; + struct rdma_dev_addr *dev_addr; int ret; + struct net_device *ndev = NULL; + u16 vlan; id_priv = mc->id_priv; - if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) && - cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) + dev_addr = &id_priv->id.route.addr.dev_addr; + if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && + cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) return 0; mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, - multicast->rec.mlid); + be16_to_cpu(multicast->rec.mlid)); mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; + ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!ndev) { + status = -ENODEV; + } else { + vlan = rdma_vlan_dev_vlan_id(ndev); + dev_put(ndev); + } if (!status) { event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, &event.param.ud.ah_attr); + event.param.ud.ah_attr.vlan_id = vlan; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - } else + } else { event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + /* mark that the cached record is no longer valid */ + if (status != -ENETRESET && status != -EAGAIN) { + spin_lock(&id_priv->lock); + id_priv->is_valid_rec = 0; + spin_unlock(&id_priv->lock); + } + } + ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { - cma_exch(id_priv, CMA_DESTROYING); + cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; @@ -2938,20 +3219,13 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { -#if defined(INET) || defined(INET6) unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; -#endif -#ifdef INET struct sockaddr_in *sin = (struct sockaddr_in *) addr; -#endif -#ifdef INET6 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; -#endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); -#ifdef INET6 } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { @@ -2962,14 +3236,11 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); -#endif -#ifdef INET } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); -#endif } } @@ -2979,13 +3250,26 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; 
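/*
 * Why this aside: the multicast handler changes above cache the SA mcmember
 * record in id_priv->rec and flag it stale through id_priv->is_valid_rec
 * whenever a join fails for any reason other than -ENETRESET/-EAGAIN, so the
 * next join re-queries the SA instead of reusing a dead record.  Below is a
 * minimal, self-contained sketch of that "record plus validity flag under a
 * spinlock" pattern; the mcast_rec_cache/mcast_cache_* names are illustrative
 * only and assume the usual spinlock and ib_sa headers.
 */
struct mcast_rec_cache {
	spinlock_t			lock;
	int				is_valid;
	struct ib_sa_mcmember_rec	rec;
};

static void mcast_cache_invalidate(struct mcast_rec_cache *c)
{
	spin_lock(&c->lock);
	c->is_valid = 0;		/* next lookup must re-query the SA */
	spin_unlock(&c->lock);
}

static int mcast_cache_get(struct mcast_rec_cache *c,
			   struct ib_sa_mcmember_rec *out)
{
	int hit;

	spin_lock(&c->lock);
	hit = c->is_valid;
	if (hit)
		*out = c->rec;		/* struct copy while holding the lock */
	spin_unlock(&c->lock);
	return hit;			/* 1 = cached record still valid */
}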
ib_sa_comp_mask comp_mask; - int ret; + int ret = 0; - ib_addr_get_mgid(dev_addr, &rec.mgid); - ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, - &rec.mgid, &rec); - if (ret) + ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid); + + /* cache ipoib bc record */ + spin_lock(&id_priv->lock); + if (!id_priv->is_valid_rec) + ret = ib_sa_get_mcmember_rec(id_priv->id.device, + id_priv->id.port_num, + &id_priv->rec.mgid, + &id_priv->rec); + if (ret) { + id_priv->is_valid_rec = 0; + spin_unlock(&id_priv->lock); return ret; + } else { + rec = id_priv->rec; + id_priv->is_valid_rec = 1; + } + spin_unlock(&id_priv->lock); cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) @@ -3002,19 +3286,18 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | - IB_SA_MCMEMBER_REC_RATE_SELECTOR; + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); - if (IS_ERR(mc->multicast.ib)) - return PTR_ERR(mc->multicast.ib); - - return 0; + return PTR_RET(mc->multicast.ib); } - static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); @@ -3034,9 +3317,9 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); - } else if (addr->sa_family == AF_INET6) + } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); - else { + } else { mgid->raw[0] = 0xff; mgid->raw[1] = 0x0e; mgid->raw[2] = 0; @@ -3087,20 +3370,16 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -ENODEV; goto out2; } - mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; -#ifdef __linux__ - mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); -#else mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); -#endif dev_put(ndev); if (!mc->multicast.ib->rec.mtu) { err = -EINVAL; goto out2; } - iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); @@ -3124,8 +3403,8 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, CMA_ADDR_BOUND) && - !cma_comp(id_priv, CMA_ADDR_RESOLVED)) + if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && + !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); @@ -3165,7 +3444,6 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, spin_unlock_irq(&id_priv->lock); kfree(mc); } - return ret; } EXPORT_SYMBOL(rdma_join_multicast); @@ -3185,7 +3463,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) if (id->qp) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, - mc->multicast.ib->rec.mlid); + be16_to_cpu(mc->multicast.ib->rec.mlid)); if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: @@ -3213,17 +3491,10 @@ static 
int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id dev_addr = &id_priv->id.route.addr.dev_addr; -#ifdef __linux__ - if ((dev_addr->bound_dev_if == ndev->ifindex) && - memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { - printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", - ndev->name, &id_priv->id); -#else if ((dev_addr->bound_dev_if == ndev->if_index) && memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->if_xname, &id_priv->id); -#endif work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; @@ -3246,7 +3517,8 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; -#ifdef __linux__ +/* BONDING related, commented out until the bonding is resolved */ +#if 0 if (dev_net(ndev) != &init_net) return NOTIFY_DONE; @@ -3255,10 +3527,9 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; -#else +#endif if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) return NOTIFY_DONE; -#endif mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) @@ -3303,19 +3574,19 @@ static void cma_add_one(struct ib_device *device) static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; - enum cma_state state; + enum rdma_cm_state state; int ret = 0; /* Record that we want to remove the device */ - state = cma_exch(id_priv, CMA_DEVICE_REMOVAL); - if (state == CMA_DESTROYING) + state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); + if (state == RDMA_CM_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. 
*/ - if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL)) + if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); @@ -3370,22 +3641,18 @@ static void cma_remove_one(struct ib_device *device) kfree(cma_dev); } -static int cma_init(void) +static int __init cma_init(void) { - int ret; -#if defined(INET) - int low, high, remaining; - - get_random_bytes(&next_port, sizeof next_port); - inet_get_local_port_range(&low, &high); - remaining = (high - low) + 1; - next_port = ((unsigned int) next_port % remaining) + low; -#endif + int ret = -ENOMEM; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; + cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); + if (!cma_free_wq) + goto err1; + ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); @@ -3393,27 +3660,34 @@ static int cma_init(void) ret = ib_register_client(&cma_client); if (ret) goto err; + return 0; err: unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); + + destroy_workqueue(cma_free_wq); +err1: destroy_workqueue(cma_wq); return ret; } -static void cma_cleanup(void) +static void __exit cma_cleanup(void) { ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); + flush_workqueue(cma_free_wq); + destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); idr_destroy(&udp_ps); idr_destroy(&ipoib_ps); + idr_destroy(&ib_ps); } module_init(cma_init); diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h index 08c4bbbad168..001bbbe357a7 100644 --- a/sys/ofed/drivers/infiniband/core/core_priv.h +++ b/sys/ofed/drivers/infiniband/core/core_priv.h @@ -38,7 +38,8 @@ #include -int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); diff --git a/sys/ofed/drivers/infiniband/core/device.c b/sys/ofed/drivers/infiniband/core/device.c index 98adf489b81a..a7a06d78164c 100644 --- a/sys/ofed/drivers/infiniband/core/device.c +++ b/sys/ofed/drivers/infiniband/core/device.c @@ -37,7 +37,6 @@ #include #include #include -#include #include "core_priv.h" @@ -45,18 +44,15 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); -#ifdef __ia64__ -/* workaround for a bug in hp chipset that would cause kernel - panic when dma resources are exhaused */ -int dma_map_sg_hp_wa = 0; -#endif - struct ib_client_data { struct list_head list; struct ib_client *client; void * data; }; +struct workqueue_struct *ib_wq; +EXPORT_SYMBOL_GPL(ib_wq); + static LIST_HEAD(device_list); static LIST_HEAD(client_list); @@ -99,7 +95,7 @@ static int ib_device_check_mandatory(struct ib_device *device) int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { - if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) { + if (!*(void **) ((void *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; @@ -177,9 +173,14 @@ static int end_port(struct ib_device *device) */ struct ib_device *ib_alloc_device(size_t size) { + struct 
ib_device *dev; + BUG_ON(size < sizeof (struct ib_device)); - return kzalloc(size, GFP_KERNEL); + dev = kzalloc(size, GFP_KERNEL); + spin_lock_init(&dev->cmd_perf_lock); + + return dev; } EXPORT_SYMBOL(ib_alloc_device); @@ -295,8 +296,6 @@ int ib_register_device(struct ib_device *device, INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); - device->ib_uverbs_xrcd_table = RB_ROOT; - mutex_init(&device->xrcd_table_mutex); ret = read_port_table_lengths(device); if (ret) { @@ -631,6 +630,9 @@ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { + if (!device->modify_device) + return -ENOSYS; + return device->modify_device(device, device_modify_mask, device_modify); } @@ -651,6 +653,9 @@ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { + if (!device->modify_port) + return -ENOSYS; + if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; @@ -705,18 +710,28 @@ int ib_find_pkey(struct ib_device *device, { int ret, i; u16 tmp_pkey; + int partial_ix = -1; for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; - if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { - *index = i; - return 0; + /* if there is full-member pkey take it.*/ + if (tmp_pkey & 0x8000) { + *index = i; + return 0; + } + if (partial_ix < 0) + partial_ix = i; } } + /*no full-member, if exists take the limited*/ + if (partial_ix >= 0) { + *index = partial_ix; + return 0; + } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); @@ -725,21 +740,29 @@ static int __init ib_core_init(void) { int ret; -#ifdef __ia64__ - if (ia64_platform_is("hpzx1")) - dma_map_sg_hp_wa = 1; -#endif + ib_wq = create_workqueue("infiniband"); + if (!ib_wq) + return -ENOMEM; ret = ib_sysfs_setup(); - if (ret) + if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); + goto err; + } ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); - ib_sysfs_cleanup(); + goto err_sysfs; } + return 0; + +err_sysfs: + ib_sysfs_cleanup(); + +err: + destroy_workqueue(ib_wq); return ret; } @@ -748,7 +771,7 @@ static void __exit ib_core_cleanup(void) ib_cache_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. 
*/ - flush_scheduled_work(); + destroy_workqueue(ib_wq); } module_init(ib_core_init); diff --git a/sys/ofed/drivers/infiniband/core/fmr_pool.c b/sys/ofed/drivers/infiniband/core/fmr_pool.c index c22583344534..bda7abc84b9b 100644 --- a/sys/ofed/drivers/infiniband/core/fmr_pool.c +++ b/sys/ofed/drivers/infiniband/core/fmr_pool.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -150,7 +151,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n", + printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n", fmr, fmr->ref_count); } #endif diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c index 27878a890185..14d23cccb496 100644 --- a/sys/ofed/drivers/infiniband/core/iwcm.c +++ b/sys/ofed/drivers/infiniband/core/iwcm.c @@ -40,9 +40,12 @@ #include #include #include +#include #include #include #include +#include +#include #include #include @@ -507,6 +510,8 @@ int iw_cm_accept(struct iw_cm_id *cm_id, qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); @@ -566,6 +571,8 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); @@ -620,17 +627,6 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, */ BUG_ON(iw_event->status); - /* - * We could be destroying the listening id. If so, ignore this - * upcall. - */ - spin_lock_irqsave(&listen_id_priv->lock, flags); - if (listen_id_priv->state != IW_CM_STATE_LISTEN) { - spin_unlock_irqrestore(&listen_id_priv->lock, flags); - goto out; - } - spin_unlock_irqrestore(&listen_id_priv->lock, flags); - cm_id = iw_create_cm_id(listen_id_priv->id.device, iw_event->so, listen_id_priv->id.cm_handler, @@ -646,6 +642,19 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; + /* + * We could be destroying the listening id. If so, ignore this + * upcall. 
+ */ + spin_lock_irqsave(&listen_id_priv->lock, flags); + if (listen_id_priv->state != IW_CM_STATE_LISTEN) { + spin_unlock_irqrestore(&listen_id_priv->lock, flags); + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + goto out; + } + spin_unlock_irqrestore(&listen_id_priv->lock, flags); + ret = alloc_work_entries(cm_id_priv, 3); if (ret) { iw_cm_reject(cm_id, NULL, 0); @@ -723,7 +732,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); - if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) { + if (iw_event->status == 0) { cm_id_priv->id.local_addr = iw_event->local_addr; cm_id_priv->id.remote_addr = iw_event->remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; diff --git a/sys/ofed/drivers/infiniband/core/local_sa.c b/sys/ofed/drivers/infiniband/core/local_sa.c deleted file mode 100644 index 9b9c60a6872e..000000000000 --- a/sys/ofed/drivers/infiniband/core/local_sa.c +++ /dev/null @@ -1,1273 +0,0 @@ -/* - * Copyright (c) 2006 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "sa.h" - -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("InfiniBand subnet administration caching"); -MODULE_LICENSE("Dual BSD/GPL"); - -enum { - SA_DB_MAX_PATHS_PER_DEST = 0x7F, - SA_DB_MIN_RETRY_TIMER = 4000, /* 4 sec */ - SA_DB_MAX_RETRY_TIMER = 256000 /* 256 sec */ -}; - -static int set_paths_per_dest(const char *val, struct kernel_param *kp); -static unsigned long paths_per_dest = 0; -module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong, - &paths_per_dest, 0644); -MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve " - "to each destination (DGID). 
Set to 0 " - "to disable cache."); - -static int set_subscribe_inform_info(const char *val, struct kernel_param *kp); -static char subscribe_inform_info = 1; -module_param_call(subscribe_inform_info, set_subscribe_inform_info, - param_get_bool, &subscribe_inform_info, 0644); -MODULE_PARM_DESC(subscribe_inform_info, - "Subscribe for SA InformInfo/Notice events."); - -static int do_refresh(const char *val, struct kernel_param *kp); -module_param_call(refresh, do_refresh, NULL, NULL, 0200); - -static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER; - -enum sa_db_lookup_method { - SA_DB_LOOKUP_LEAST_USED, - SA_DB_LOOKUP_RANDOM -}; - -static int set_lookup_method(const char *val, struct kernel_param *kp); -static int get_lookup_method(char *buf, struct kernel_param *kp); -static unsigned long lookup_method; -module_param_call(lookup_method, set_lookup_method, get_lookup_method, - &lookup_method, 0644); -MODULE_PARM_DESC(lookup_method, "Method used to return path records when " - "multiple paths exist to a given destination."); - -static void sa_db_add_dev(struct ib_device *device); -static void sa_db_remove_dev(struct ib_device *device); - -static struct ib_client sa_db_client = { - .name = "local_sa", - .add = sa_db_add_dev, - .remove = sa_db_remove_dev -}; - -static LIST_HEAD(dev_list); -static DEFINE_MUTEX(lock); -static rwlock_t rwlock; -static struct workqueue_struct *sa_wq; -static struct ib_sa_client sa_client; - -enum sa_db_state { - SA_DB_IDLE, - SA_DB_REFRESH, - SA_DB_DESTROY -}; - -struct sa_db_port { - struct sa_db_device *dev; - struct ib_mad_agent *agent; - /* Limit number of outstanding MADs to SA to reduce SA flooding */ - struct ib_mad_send_buf *msg; - u16 sm_lid; - u8 sm_sl; - struct ib_inform_info *in_info; - struct ib_inform_info *out_info; - struct rb_root paths; - struct list_head update_list; - unsigned long update_id; - enum sa_db_state state; - struct work_struct work; - union ib_gid gid; - int port_num; -}; - -struct sa_db_device { - struct list_head list; - struct ib_device *device; - struct ib_event_handler event_handler; - int start_port; - int port_count; - struct sa_db_port port[0]; -}; - -struct ib_sa_iterator { - struct ib_sa_iterator *next; -}; - -struct ib_sa_attr_iter { - struct ib_sa_iterator *iter; - unsigned long flags; -}; - -struct ib_sa_attr_list { - struct ib_sa_iterator iter; - struct ib_sa_iterator *tail; - int update_id; - union ib_gid gid; - struct rb_node node; -}; - -struct ib_path_rec_info { - struct ib_sa_iterator iter; /* keep first */ - struct ib_sa_path_rec rec; - unsigned long lookups; -}; - -struct ib_sa_mad_iter { - struct ib_mad_recv_wc *recv_wc; - struct ib_mad_recv_buf *recv_buf; - int attr_size; - int attr_offset; - int data_offset; - int data_left; - void *attr; - u8 attr_data[0]; -}; - -enum sa_update_type { - SA_UPDATE_FULL, - SA_UPDATE_ADD, - SA_UPDATE_REMOVE -}; - -struct update_info { - struct list_head list; - union ib_gid gid; - enum sa_update_type type; -}; - -struct sa_path_request { - struct work_struct work; - struct ib_sa_client *client; - void (*callback)(int, struct ib_sa_path_rec *, void *); - void *context; - struct ib_sa_path_rec path_rec; -}; - -static void process_updates(struct sa_db_port *port); - -static void free_attr_list(struct ib_sa_attr_list *attr_list) -{ - struct ib_sa_iterator *cur; - - for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) { - attr_list->iter.next = cur->next; - kfree(cur); - } - attr_list->tail = &attr_list->iter; -} - -static void remove_attr(struct rb_root *root, struct 
ib_sa_attr_list *attr_list) -{ - rb_erase(&attr_list->node, root); - free_attr_list(attr_list); - kfree(attr_list); -} - -static void remove_all_attrs(struct rb_root *root) -{ - struct rb_node *node, *next_node; - struct ib_sa_attr_list *attr_list; - - write_lock_irq(&rwlock); - for (node = rb_first(root); node; node = next_node) { - next_node = rb_next(node); - attr_list = rb_entry(node, struct ib_sa_attr_list, node); - remove_attr(root, attr_list); - } - write_unlock_irq(&rwlock); -} - -static void remove_old_attrs(struct rb_root *root, unsigned long update_id) -{ - struct rb_node *node, *next_node; - struct ib_sa_attr_list *attr_list; - - write_lock_irq(&rwlock); - for (node = rb_first(root); node; node = next_node) { - next_node = rb_next(node); - attr_list = rb_entry(node, struct ib_sa_attr_list, node); - if (attr_list->update_id != update_id) - remove_attr(root, attr_list); - } - write_unlock_irq(&rwlock); -} - -static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root, - struct ib_sa_attr_list *attr_list) -{ - struct rb_node **link = &root->rb_node; - struct rb_node *parent = NULL; - struct ib_sa_attr_list *cur_attr_list; - int cmp; - - while (*link) { - parent = *link; - cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node); - cmp = memcmp(&cur_attr_list->gid, &attr_list->gid, - sizeof attr_list->gid); - if (cmp < 0) - link = &(*link)->rb_left; - else if (cmp > 0) - link = &(*link)->rb_right; - else - return cur_attr_list; - } - rb_link_node(&attr_list->node, parent, link); - rb_insert_color(&attr_list->node, root); - return NULL; -} - -static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid) -{ - struct rb_node *node = root->rb_node; - struct ib_sa_attr_list *attr_list; - int cmp; - - while (node) { - attr_list = rb_entry(node, struct ib_sa_attr_list, node); - cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid); - if (cmp < 0) - node = node->rb_left; - else if (cmp > 0) - node = node->rb_right; - else - return attr_list; - } - return NULL; -} - -static int insert_attr(struct rb_root *root, unsigned long update_id, void *key, - struct ib_sa_iterator *iter) -{ - struct ib_sa_attr_list *attr_list; - void *err; - - write_lock_irq(&rwlock); - attr_list = find_attr_list(root, key); - if (!attr_list) { - write_unlock_irq(&rwlock); - attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL); - if (!attr_list) - return -ENOMEM; - - attr_list->iter.next = NULL; - attr_list->tail = &attr_list->iter; - attr_list->update_id = update_id; - memcpy(attr_list->gid.raw, key, sizeof attr_list->gid); - - write_lock_irq(&rwlock); - err = insert_attr_list(root, attr_list); - if (err) { - write_unlock_irq(&rwlock); - kfree(attr_list); - return PTR_ERR(err); - } - } else if (attr_list->update_id != update_id) { - free_attr_list(attr_list); - attr_list->update_id = update_id; - } - - attr_list->tail->next = iter; - iter->next = NULL; - attr_list->tail = iter; - write_unlock_irq(&rwlock); - return 0; -} - -static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc) -{ - struct ib_sa_mad_iter *iter; - struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad; - int attr_size, attr_offset; - - attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8; - attr_size = 64; /* path record length */ - if (attr_offset < attr_size) - return ERR_PTR(-EINVAL); - - iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL); - if (!iter) - return ERR_PTR(-ENOMEM); - - iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR; - iter->recv_wc = mad_recv_wc; 
- iter->recv_buf = &mad_recv_wc->recv_buf; - iter->attr_offset = attr_offset; - iter->attr_size = attr_size; - return iter; -} - -static void ib_sa_iter_free(struct ib_sa_mad_iter *iter) -{ - kfree(iter); -} - -static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter) -{ - struct ib_sa_mad *mad; - int left, offset = 0; - - while (iter->data_left >= iter->attr_offset) { - while (iter->data_offset < IB_MGMT_SA_DATA) { - mad = (struct ib_sa_mad *) iter->recv_buf->mad; - - left = IB_MGMT_SA_DATA - iter->data_offset; - if (left < iter->attr_size) { - /* copy first piece of the attribute */ - iter->attr = &iter->attr_data; - memcpy(iter->attr, - &mad->data[iter->data_offset], left); - offset = left; - break; - } else if (offset) { - /* copy the second piece of the attribute */ - memcpy(iter->attr + offset, &mad->data[0], - iter->attr_size - offset); - iter->data_offset = iter->attr_size - offset; - offset = 0; - } else { - iter->attr = &mad->data[iter->data_offset]; - iter->data_offset += iter->attr_size; - } - - iter->data_left -= iter->attr_offset; - goto out; - } - iter->data_offset = 0; - iter->recv_buf = list_entry(iter->recv_buf->list.next, - struct ib_mad_recv_buf, list); - } - iter->attr = NULL; -out: - return iter->attr; -} - -/* - * Copy path records from a received response and insert them into our cache. - * A path record in the MADs are in network order, packed, and may - * span multiple MAD buffers, just to make our life hard. - */ -static void update_path_db(struct sa_db_port *port, - struct ib_mad_recv_wc *mad_recv_wc, - enum sa_update_type type) -{ - struct ib_sa_mad_iter *iter; - struct ib_path_rec_info *path_info; - void *attr; - int ret; - - iter = ib_sa_iter_create(mad_recv_wc); - if (IS_ERR(iter)) - return; - - port->update_id += (type == SA_UPDATE_FULL); - - while ((attr = ib_sa_iter_next(iter)) && - (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) { - - ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC); - - ret = insert_attr(&port->paths, port->update_id, - path_info->rec.dgid.raw, &path_info->iter); - if (ret) { - kfree(path_info); - break; - } - } - ib_sa_iter_free(iter); - - if (type == SA_UPDATE_FULL) - remove_old_attrs(&port->paths, port->update_id); -} - -static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port, - struct update_info *update) -{ - struct ib_ah_attr ah_attr; - struct ib_mad_send_buf *msg; - - msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR, - IB_MGMT_SA_DATA, GFP_KERNEL); - if (IS_ERR(msg)) - return NULL; - - memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.dlid = port->sm_lid; - ah_attr.sl = port->sm_sl; - ah_attr.port_num = port->port_num; - - msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); - if (IS_ERR(msg->ah)) { - ib_free_send_mad(msg); - return NULL; - } - - msg->timeout_ms = retry_timer; - msg->retries = 0; - msg->context[0] = port; - msg->context[1] = update; - return msg; -} - -static __be64 form_tid(u32 hi_tid) -{ - static atomic_t tid; - return cpu_to_be64((((u64) hi_tid) << 32) | - ((u32) atomic_inc_return(&tid))); -} - -static void format_path_req(struct sa_db_port *port, - struct update_info *update, - struct ib_mad_send_buf *msg) -{ - struct ib_sa_mad *mad = msg->mad; - struct ib_sa_path_rec path_rec; - - mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; - mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; - mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; - mad->mad_hdr.method = IB_SA_METHOD_GET_TABLE; - mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); - mad->mad_hdr.tid = 
form_tid(msg->mad_agent->hi_tid); - - mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH; - - path_rec.sgid = port->gid; - path_rec.numb_path = (u8) paths_per_dest; - - if (update->type == SA_UPDATE_ADD) { - mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID; - memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid); - } - - ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC); -} - -static int send_query(struct sa_db_port *port, - struct update_info *update) -{ - int ret; - - port->msg = get_sa_msg(port, update); - if (!port->msg) - return -ENOMEM; - - format_path_req(port, update, port->msg); - - ret = ib_post_send_mad(port->msg, NULL); - if (ret) - goto err; - - return 0; - -err: - ib_destroy_ah(port->msg->ah); - ib_free_send_mad(port->msg); - return ret; -} - -static void add_update(struct sa_db_port *port, u8 *gid, - enum sa_update_type type) -{ - struct update_info *update; - - update = kmalloc(sizeof *update, GFP_KERNEL); - if (update) { - if (gid) - memcpy(&update->gid, gid, sizeof update->gid); - update->type = type; - list_add(&update->list, &port->update_list); - } - - if (port->state == SA_DB_IDLE) { - port->state = SA_DB_REFRESH; - process_updates(port); - } -} - -static void clean_update_list(struct sa_db_port *port) -{ - struct update_info *update; - - while (!list_empty(&port->update_list)) { - update = list_entry(port->update_list.next, - struct update_info, list); - list_del(&update->list); - kfree(update); - } -} - -static int notice_handler(int status, struct ib_inform_info *info, - struct ib_sa_notice *notice) -{ - struct sa_db_port *port = info->context; - struct ib_sa_notice_data_gid *gid_data; - struct ib_inform_info **pinfo; - enum sa_update_type type; - - if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) { - pinfo = &port->in_info; - type = SA_UPDATE_ADD; - } else { - pinfo = &port->out_info; - type = SA_UPDATE_REMOVE; - } - - mutex_lock(&lock); - if (port->state == SA_DB_DESTROY || !*pinfo) { - mutex_unlock(&lock); - return 0; - } - - if (notice) { - gid_data = (struct ib_sa_notice_data_gid *) - ¬ice->data_details; - add_update(port, gid_data->gid, type); - mutex_unlock(&lock); - } else if (status == -ENETRESET) { - *pinfo = NULL; - mutex_unlock(&lock); - } else { - if (status) - *pinfo = ERR_PTR(-EINVAL); - port->state = SA_DB_IDLE; - clean_update_list(port); - mutex_unlock(&lock); - queue_work(sa_wq, &port->work); - } - - return status; -} - -static int reg_in_info(struct sa_db_port *port) -{ - int ret = 0; - - port->in_info = ib_sa_register_inform_info(&sa_client, - port->dev->device, - port->port_num, - IB_SA_SM_TRAP_GID_IN_SERVICE, - GFP_KERNEL, notice_handler, - port); - if (IS_ERR(port->in_info)) - ret = PTR_ERR(port->in_info); - - return ret; -} - -static int reg_out_info(struct sa_db_port *port) -{ - int ret = 0; - - port->out_info = ib_sa_register_inform_info(&sa_client, - port->dev->device, - port->port_num, - IB_SA_SM_TRAP_GID_OUT_OF_SERVICE, - GFP_KERNEL, notice_handler, - port); - if (IS_ERR(port->out_info)) - ret = PTR_ERR(port->out_info); - - return ret; -} - -static void unsubscribe_port(struct sa_db_port *port) -{ - if (port->in_info && !IS_ERR(port->in_info)) - ib_sa_unregister_inform_info(port->in_info); - - if (port->out_info && !IS_ERR(port->out_info)) - ib_sa_unregister_inform_info(port->out_info); - - port->out_info = NULL; - port->in_info = NULL; - -} - -static void cleanup_port(struct sa_db_port *port) -{ - unsubscribe_port(port); - - clean_update_list(port); - remove_all_attrs(&port->paths); -} - -static int 
update_port_info(struct sa_db_port *port) -{ - struct ib_port_attr port_attr; - int ret; - - ret = ib_query_port(port->dev->device, port->port_num, &port_attr); - if (ret) - return ret; - - if (port_attr.state != IB_PORT_ACTIVE) - return -ENODATA; - - port->sm_lid = port_attr.sm_lid; - port->sm_sl = port_attr.sm_sl; - return 0; -} - -static void process_updates(struct sa_db_port *port) -{ - struct update_info *update; - struct ib_sa_attr_list *attr_list; - int ret; - - if (!paths_per_dest || update_port_info(port)) { - cleanup_port(port); - goto out; - } - - /* Event registration is an optimization, so ignore failures. */ - if (subscribe_inform_info) { - if (!port->out_info) { - ret = reg_out_info(port); - if (!ret) - return; - } - - if (!port->in_info) { - ret = reg_in_info(port); - if (!ret) - return; - } - } else - unsubscribe_port(port); - - while (!list_empty(&port->update_list)) { - update = list_entry(port->update_list.next, - struct update_info, list); - - if (update->type == SA_UPDATE_REMOVE) { - write_lock_irq(&rwlock); - attr_list = find_attr_list(&port->paths, - update->gid.raw); - if (attr_list) - remove_attr(&port->paths, attr_list); - write_unlock_irq(&rwlock); - } else { - ret = send_query(port, update); - if (!ret) - return; - - } - list_del(&update->list); - kfree(update); - } -out: - port->state = SA_DB_IDLE; -} - -static void refresh_port_db(struct sa_db_port *port) -{ - if (port->state == SA_DB_DESTROY) - return; - - if (port->state == SA_DB_REFRESH) { - clean_update_list(port); - ib_cancel_mad(port->agent, port->msg); - } - - add_update(port, NULL, SA_UPDATE_FULL); -} - -static void refresh_dev_db(struct sa_db_device *dev) -{ - int i; - - for (i = 0; i < dev->port_count; i++) - refresh_port_db(&dev->port[i]); -} - -static void refresh_db(void) -{ - struct sa_db_device *dev; - - list_for_each_entry(dev, &dev_list, list) - refresh_dev_db(dev); -} - -static int do_refresh(const char *val, struct kernel_param *kp) -{ - mutex_lock(&lock); - refresh_db(); - mutex_unlock(&lock); - return 0; -} - -static int get_lookup_method(char *buf, struct kernel_param *kp) -{ - return sprintf(buf, - "%c %d round robin\n" - "%c %d random", - (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ', - SA_DB_LOOKUP_LEAST_USED, - (lookup_method == SA_DB_LOOKUP_RANDOM) ? 
'*' : ' ', - SA_DB_LOOKUP_RANDOM); -} - -static int set_lookup_method(const char *val, struct kernel_param *kp) -{ - unsigned long method; - int ret = 0; - - method = simple_strtoul(val, NULL, 0); - - switch (method) { - case SA_DB_LOOKUP_LEAST_USED: - case SA_DB_LOOKUP_RANDOM: - lookup_method = method; - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -static int set_paths_per_dest(const char *val, struct kernel_param *kp) -{ - int ret; - - mutex_lock(&lock); - ret = param_set_ulong(val, kp); - if (ret) - goto out; - - if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST) - paths_per_dest = SA_DB_MAX_PATHS_PER_DEST; - refresh_db(); -out: - mutex_unlock(&lock); - return ret; -} - -static int set_subscribe_inform_info(const char *val, struct kernel_param *kp) -{ - int ret; - - ret = param_set_bool(val, kp); - if (ret) - return ret; - - return do_refresh(val, kp); -} - -static void port_work_handler(struct work_struct *work) -{ - struct sa_db_port *port; - - port = container_of(work, typeof(*port), work); - mutex_lock(&lock); - refresh_port_db(port); - mutex_unlock(&lock); -} - -static void handle_event(struct ib_event_handler *event_handler, - struct ib_event *event) -{ - struct sa_db_device *dev; - struct sa_db_port *port; - - dev = container_of(event_handler, typeof(*dev), event_handler); - port = &dev->port[event->element.port_num - dev->start_port]; - - switch (event->event) { - case IB_EVENT_PORT_ERR: - case IB_EVENT_LID_CHANGE: - case IB_EVENT_SM_CHANGE: - case IB_EVENT_CLIENT_REREGISTER: - case IB_EVENT_PKEY_CHANGE: - case IB_EVENT_PORT_ACTIVE: - queue_work(sa_wq, &port->work); - break; - default: - break; - } -} - -static void ib_free_path_iter(struct ib_sa_attr_iter *iter) -{ - read_unlock_irqrestore(&rwlock, iter->flags); -} - -static int ib_create_path_iter(struct ib_device *device, u8 port_num, - union ib_gid *dgid, struct ib_sa_attr_iter *iter) -{ - struct sa_db_device *dev; - struct sa_db_port *port; - struct ib_sa_attr_list *list; - - dev = ib_get_client_data(device, &sa_db_client); - if (!dev) - return -ENODEV; - - port = &dev->port[port_num - dev->start_port]; - - read_lock_irqsave(&rwlock, iter->flags); - list = find_attr_list(&port->paths, dgid->raw); - if (!list) { - ib_free_path_iter(iter); - return -ENODATA; - } - - iter->iter = &list->iter; - return 0; -} - -static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter) -{ - struct ib_path_rec_info *next_path; - - iter->iter = iter->iter->next; - if (iter->iter) { - next_path = container_of(iter->iter, struct ib_path_rec_info, iter); - return &next_path->rec; - } else - return NULL; -} - -static int cmp_rec(struct ib_sa_path_rec *src, - struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask) -{ - /* DGID check already done */ - if (comp_mask & IB_SA_PATH_REC_SGID && - memcmp(&src->sgid, &dst->sgid, sizeof src->sgid)) - return -EINVAL; - if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid) - return -EINVAL; - if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid) - return -EINVAL; - if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC && - src->raw_traffic != dst->raw_traffic) - return -EINVAL; - - if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL && - src->flow_label != dst->flow_label) - return -EINVAL; - if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT && - src->hop_limit != dst->hop_limit) - return -EINVAL; - if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS && - src->traffic_class != dst->traffic_class) - return -EINVAL; - if (comp_mask & IB_SA_PATH_REC_REVERSIBLE && - dst->reversible && 
!src->reversible) - return -EINVAL; - /* Numb path check already done */ - if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey) - return -EINVAL; - - if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl) - return -EINVAL; - - if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR, - IB_SA_PATH_REC_MTU, dst->mtu_selector, - src->mtu, dst->mtu)) - return -EINVAL; - if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR, - IB_SA_PATH_REC_RATE, dst->rate_selector, - src->rate, dst->rate)) - return -EINVAL; - if (ib_sa_check_selector(comp_mask, - IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR, - IB_SA_PATH_REC_PACKET_LIFE_TIME, - dst->packet_life_time_selector, - src->packet_life_time, dst->packet_life_time)) - return -EINVAL; - - return 0; -} - -static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter, - struct ib_sa_path_rec *req_path, - ib_sa_comp_mask comp_mask) -{ - struct ib_sa_path_rec *path, *rand_path = NULL; - int num, count = 0; - - for (path = ib_get_next_path(iter); path; - path = ib_get_next_path(iter)) { - if (!cmp_rec(path, req_path, comp_mask)) { - get_random_bytes(&num, sizeof num); - if ((num % ++count) == 0) - rand_path = path; - } - } - - return rand_path; -} - -static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter, - struct ib_sa_path_rec *req_path, - ib_sa_comp_mask comp_mask) -{ - struct ib_path_rec_info *cur_path, *next_path = NULL; - struct ib_sa_path_rec *path; - unsigned long lookups = ~0; - - for (path = ib_get_next_path(iter); path; - path = ib_get_next_path(iter)) { - if (!cmp_rec(path, req_path, comp_mask)) { - - cur_path = container_of(iter->iter, struct ib_path_rec_info, - iter); - if (cur_path->lookups < lookups) { - lookups = cur_path->lookups; - next_path = cur_path; - } - } - } - - if (next_path) { - next_path->lookups++; - return &next_path->rec; - } else - return NULL; -} - -static void report_path(struct work_struct *work) -{ - struct sa_path_request *req; - - req = container_of(work, struct sa_path_request, work); - req->callback(0, &req->path_rec, req->context); - ib_sa_client_put(req->client); - kfree(req); -} - -/** - * ib_sa_path_rec_get - Start a Path get query - * @client:SA client - * @device:device to send query on - * @port_num: port number to send query on - * @rec:Path Record to send in query - * @comp_mask:component mask to send in query - * @timeout_ms:time to wait for response - * @gfp_mask:GFP mask to use for internal allocations - * @callback:function called when query completes, times out or is - * canceled - * @context:opaque user context passed to callback - * @sa_query:query context, used to cancel query - * - * Send a Path Record Get query to the SA to look up a path. The - * callback function will be called when the query completes (or - * fails); status is 0 for a successful response, -EINTR if the query - * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error - * occurred sending the query. The resp parameter of the callback is - * only valid if status is 0. - * - * If the return value of ib_sa_path_rec_get() is negative, it is an - * error code. Otherwise it is a query ID that can be used to cancel - * the query. 
- */ -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_path_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) -{ - struct sa_path_request *req; - struct ib_sa_attr_iter iter; - struct ib_sa_path_rec *path_rec; - int ret; - - if (!paths_per_dest) - goto query_sa; - - if (!(comp_mask & IB_SA_PATH_REC_DGID) || - !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1) - goto query_sa; - - req = kmalloc(sizeof *req, gfp_mask); - if (!req) - goto query_sa; - - ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter); - if (ret) - goto free_req; - - if (lookup_method == SA_DB_LOOKUP_RANDOM) - path_rec = get_random_path(&iter, rec, comp_mask); - else - path_rec = get_next_path(&iter, rec, comp_mask); - - if (!path_rec) - goto free_iter; - - memcpy(&req->path_rec, path_rec, sizeof *path_rec); - ib_free_path_iter(&iter); - - INIT_WORK(&req->work, report_path); - req->client = client; - req->callback = callback; - req->context = context; - - ib_sa_client_get(client); - queue_work(sa_wq, &req->work); - *sa_query = ERR_PTR(-EEXIST); - return 0; - -free_iter: - ib_free_path_iter(&iter); -free_req: - kfree(req); -query_sa: - return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask, - timeout_ms, gfp_mask, callback, context, - sa_query); -} -EXPORT_SYMBOL(ib_sa_path_rec_get); - -static void recv_handler(struct ib_mad_agent *mad_agent, - struct ib_mad_recv_wc *mad_recv_wc) -{ - struct sa_db_port *port; - struct update_info *update; - struct ib_mad_send_buf *msg; - enum sa_update_type type; - - msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id; - port = msg->context[0]; - update = msg->context[1]; - - mutex_lock(&lock); - if (port->state == SA_DB_DESTROY || - update != list_entry(port->update_list.next, - struct update_info, list)) { - mutex_unlock(&lock); - } else { - type = update->type; - mutex_unlock(&lock); - update_path_db(mad_agent->context, mad_recv_wc, type); - } - - ib_free_recv_mad(mad_recv_wc); -} - -static void send_handler(struct ib_mad_agent *agent, - struct ib_mad_send_wc *mad_send_wc) -{ - struct ib_mad_send_buf *msg; - struct sa_db_port *port; - struct update_info *update; - int ret; - - msg = mad_send_wc->send_buf; - port = msg->context[0]; - update = msg->context[1]; - - mutex_lock(&lock); - if (port->state == SA_DB_DESTROY) - goto unlock; - - if (update == list_entry(port->update_list.next, - struct update_info, list)) { - - if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR && - msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) { - - msg->timeout_ms <<= 1; - ret = ib_post_send_mad(msg, NULL); - if (!ret) { - mutex_unlock(&lock); - return; - } - } - list_del(&update->list); - kfree(update); - } - process_updates(port); -unlock: - mutex_unlock(&lock); - - ib_destroy_ah(msg->ah); - ib_free_send_mad(msg); -} - -static int init_port(struct sa_db_device *dev, int port_num) -{ - struct sa_db_port *port; - int ret; - - port = &dev->port[port_num - dev->start_port]; - port->dev = dev; - port->port_num = port_num; - INIT_WORK(&port->work, port_work_handler); - port->paths = RB_ROOT; - INIT_LIST_HEAD(&port->update_list); - - ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid); - if (ret) - return ret; - - port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI, - NULL, IB_MGMT_RMPP_VERSION, - send_handler, recv_handler, port); 
- if (IS_ERR(port->agent)) - ret = PTR_ERR(port->agent); - - return ret; -} - -static void destroy_port(struct sa_db_port *port) -{ - mutex_lock(&lock); - port->state = SA_DB_DESTROY; - mutex_unlock(&lock); - - ib_unregister_mad_agent(port->agent); - cleanup_port(port); - flush_workqueue(sa_wq); -} - -static void sa_db_add_dev(struct ib_device *device) -{ - struct sa_db_device *dev; - struct sa_db_port *port; - int s, e, i, ret; - - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - - if (device->node_type == RDMA_NODE_IB_SWITCH) { - s = e = 0; - } else { - s = 1; - e = device->phys_port_cnt; - } - - dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL); - if (!dev) - return; - - dev->start_port = s; - dev->port_count = e - s + 1; - dev->device = device; - for (i = 0; i < dev->port_count; i++) { - ret = init_port(dev, s + i); - if (ret) - goto err; - } - - ib_set_client_data(device, &sa_db_client, dev); - - INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event); - - mutex_lock(&lock); - list_add_tail(&dev->list, &dev_list); - refresh_dev_db(dev); - mutex_unlock(&lock); - - ib_register_event_handler(&dev->event_handler); - return; -err: - while (i--) - destroy_port(&dev->port[i]); - kfree(dev); -} - -static void sa_db_remove_dev(struct ib_device *device) -{ - struct sa_db_device *dev; - int i; - - dev = ib_get_client_data(device, &sa_db_client); - if (!dev) - return; - - ib_unregister_event_handler(&dev->event_handler); - flush_workqueue(sa_wq); - - for (i = 0; i < dev->port_count; i++) - destroy_port(&dev->port[i]); - - mutex_lock(&lock); - list_del(&dev->list); - mutex_unlock(&lock); - - kfree(dev); -} - -int sa_db_init(void) -{ - int ret; - - rwlock_init(&rwlock); - sa_wq = create_singlethread_workqueue("local_sa"); - if (!sa_wq) - return -ENOMEM; - - ib_sa_register_client(&sa_client); - ret = ib_register_client(&sa_db_client); - if (ret) - goto err; - - return 0; - -err: - ib_sa_unregister_client(&sa_client); - destroy_workqueue(sa_wq); - return ret; -} - -void sa_db_cleanup(void) -{ - ib_unregister_client(&sa_db_client); - ib_sa_unregister_client(&sa_client); - destroy_workqueue(sa_wq); -} diff --git a/sys/ofed/drivers/infiniband/core/mad.c b/sys/ofed/drivers/infiniband/core/mad.c index 64e660c38e4f..11b3ba372186 100644 --- a/sys/ofed/drivers/infiniband/core/mad.c +++ b/sys/ofed/drivers/infiniband/core/mad.c @@ -34,6 +34,9 @@ * */ #include +#include +#include +#include #include #include "mad_priv.h" @@ -46,8 +49,8 @@ MODULE_DESCRIPTION("kernel IB MAD API"); MODULE_AUTHOR("Hal Rosenstock"); MODULE_AUTHOR("Sean Hefty"); -int mad_sendq_size = IB_MAD_QP_SEND_SIZE; -int mad_recvq_size = IB_MAD_QP_RECV_SIZE; +static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; +static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; module_param_named(send_queue_size, mad_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); @@ -59,9 +62,26 @@ static struct kmem_cache *ib_mad_cache; static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; -/* Port list lock */ -static spinlock_t ib_mad_port_list_lock; +/* + * Timeout FIFO (tf) param + */ +enum { + /* min time between 2 consecutive activations of tf workqueue */ + MIN_BETWEEN_ACTIVATIONS_MS = 5 +}; + +/* + * SA congestion control params + */ +enum { + MAX_OUTSTANDING_SA_MADS = 10, + MIN_TIME_FOR_SA_MAD_SEND_MS = 20, + MAX_SA_MADS = 10000 +}; + +/* Port list lock */ +static DEFINE_SPINLOCK(ib_mad_port_list_lock); /* Forward declarations */ static 
int method_in_use(struct ib_mad_mgmt_method_table **method, @@ -80,6 +100,509 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); +static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, + u32 timeout_ms, u32 retries_left); + + +/* + * Timeout FIFO functions - implements FIFO with timeout mechanism + */ + +static void activate_timeout_handler_task(unsigned long data) +{ + struct to_fifo *tf; + + tf = (struct to_fifo *)data; + del_timer(&tf->timer); + queue_work(tf->workq, &tf->work); +} + +static unsigned long adjusted_time(unsigned long last, unsigned long next) +{ + unsigned long min_next; + + min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS); + if (time_after(min_next, next)) + return min_next; + + return next; +} + +static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_wc_status status) +{ + struct ib_mad_send_wc mad_send_wc; + struct ib_mad_agent_private *mad_agent_priv; + + mad_send_wc.status = status; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv = mad_send_wr->mad_agent_priv; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); +} + +static inline struct sa_cc_data * +get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr) +{ + return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc; +} + +static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe) +{ + return container_of(tfe, struct ib_mad_send_wr_private, tf_list); +} + +static void timeout_handler_task(struct work_struct *work) +{ + struct tf_entry *tmp1, *tmp2; + struct list_head *list_item, exp_lst; + unsigned long flags, curr_time; + int lst_empty; + struct to_fifo *tf; + + tf = container_of(work, struct to_fifo, work); + do { + INIT_LIST_HEAD(&exp_lst); + + spin_lock_irqsave(&tf->lists_lock, flags); + curr_time = jiffies; + list_for_each(list_item, &tf->to_head) { + tmp1 = list_entry(list_item, struct tf_entry, to_list); + if (time_before(curr_time, tmp1->exp_time)) + break; + list_del(&tmp1->fifo_list); + tf->num_items--; + } + + /* cut list up to and including list_item->prev */ + list_cut_position(&exp_lst, &tf->to_head, list_item->prev); + spin_unlock_irqrestore(&tf->lists_lock, flags); + + lst_empty = list_empty(&exp_lst); + list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) { + list_del(&tmp1->to_list); + if (tmp1->canceled) { + tmp1->canceled = 0; + notify_failure(tfe_to_mad(tmp1), IB_WC_WR_FLUSH_ERR); + } else { + notify_failure(tfe_to_mad(tmp1), IB_WC_RESP_TIMEOUT_ERR); + } + } + } while (!lst_empty); + + spin_lock_irqsave(&tf->lists_lock, flags); + if (!list_empty(&tf->to_head)) { + tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list); + mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time)); + } + spin_unlock_irqrestore(&tf->lists_lock, flags); +} + +/** + * tf_create - creates new timeout-fifo object + * @fifo_size: Maximum fifo size + * + * Allocate and initialize new timeout-fifo object + */ +static struct to_fifo *tf_create(u32 fifo_size) +{ + struct to_fifo *tf; + + tf = kzalloc(sizeof(*tf), GFP_KERNEL); + if (tf) { + tf->workq = create_singlethread_workqueue("to_fifo"); + if (!tf->workq) { + kfree(tf); + return NULL; + } + spin_lock_init(&tf->lists_lock); + INIT_LIST_HEAD(&tf->to_head); + INIT_LIST_HEAD(&tf->fifo_head); + init_timer(&tf->timer); + INIT_WORK(&tf->work, timeout_handler_task); + 
tf->timer.data = (unsigned long) tf; + tf->timer.function = activate_timeout_handler_task; + tf->timer.expires = jiffies; + tf->fifo_size = fifo_size; + tf->stop_enqueue = 0; + tf->num_items = 0; + } + + return tf; +} + +/** + * tf_enqueue - enqueue item to timeout-fifo object + * @tf:timeout-fifo object + * @item: item to enqueue. + * @timeout_ms: item expiration time in ms. + * + * Enqueue item to fifo and modify expiration timer when required. + * + * Returns 0 on success and negative on failure. + */ +static int tf_enqueue(struct to_fifo *tf, struct tf_entry *item, u32 timeout_ms) +{ + struct tf_entry *tmp; + struct list_head *list_item; + unsigned long flags; + + item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); + + spin_lock_irqsave(&tf->lists_lock, flags); + if (tf->num_items >= tf->fifo_size || tf->stop_enqueue) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return -EBUSY; + } + + /* Insert item to timeout list */ + list_for_each_prev(list_item, &tf->to_head) { + tmp = list_entry(list_item, struct tf_entry, to_list); + if (time_after(item->exp_time, tmp->exp_time)) + break; + } + + list_add(&item->to_list, list_item); + + /* Insert item to fifo list */ + list_add_tail(&item->fifo_list, &tf->fifo_head); + + tf->num_items++; + + /* modify expiration timer if required */ + if (list_item == &tf->to_head) + mod_timer(&tf->timer, item->exp_time); + + spin_unlock_irqrestore(&tf->lists_lock, flags); + + return 0; +} + +/** + * tf_dequeue - dequeue item from timeout-fifo object + * @tf:timeout-fifo object + * @time_left_ms: returns the time left for expiration in ms. + * + * Dequeue item from fifo and modify expiration timer when required. + * + * Returns pointer to tf_entry on success and NULL on failure. + */ +static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms) +{ + unsigned long flags; + unsigned long time_left; + struct tf_entry *tmp, *tmp1; + + spin_lock_irqsave(&tf->lists_lock, flags); + if (list_empty(&tf->fifo_head)) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return NULL; + } + + list_for_each_entry(tmp, &tf->fifo_head, fifo_list) { + if (!tmp->canceled) + break; + } + + if (tmp->canceled) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return NULL; + } + + /* modify timer in case enqueued item is the next to expire */ + if (tf->to_head.next == &tmp->to_list) { + if (list_is_last(&tmp->to_list, &tf->to_head)) { + del_timer(&tf->timer); + } else { + tmp1 = list_entry(tmp->to_list.next, struct tf_entry, to_list); + mod_timer(&tf->timer, tmp1->exp_time); + } + } + list_del(&tmp->fifo_list); + list_del(&tmp->to_list); + tf->num_items--; + spin_unlock_irqrestore(&tf->lists_lock, flags); + + time_left = tmp->exp_time - jiffies; + if ((long) time_left <= 0) + time_left = 0; + *time_left_ms = jiffies_to_msecs(time_left); + + return tmp; +} + +static void tf_stop_enqueue(struct to_fifo *tf) +{ + unsigned long flags; + + spin_lock_irqsave(&tf->lists_lock, flags); + tf->stop_enqueue = 1; + spin_unlock_irqrestore(&tf->lists_lock, flags); +} + +/** + * tf_free - free empty timeout-fifo object + * @tf:timeout-fifo object + * + */ +static void tf_free(struct to_fifo *tf) +{ + del_timer_sync(&tf->timer); + flush_workqueue(tf->workq); + destroy_workqueue(tf->workq); + + kfree(tf); +} + +/** + * tf_free_agent - free MADs related to specific MAD agent from timeout-fifo + * @tf:timeout-fifo object + * @mad_agent_priv: MAD agent. 
+ * + */ +static void tf_free_agent(struct to_fifo *tf, struct ib_mad_agent_private *mad_agent_priv) +{ + unsigned long flags; + struct tf_entry *tmp, *tmp1; + struct list_head tmp_head; + + INIT_LIST_HEAD(&tmp_head); + spin_lock_irqsave(&tf->lists_lock, flags); + list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) { + if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) { + list_del(&tmp->to_list); + list_move(&tmp->fifo_list, &tmp_head); + tf->num_items--; + } + } + spin_unlock_irqrestore(&tf->lists_lock, flags); + + list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) { + list_del(&tmp->fifo_list); + notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR); + } +} + +/** + * tf_modify_item - to modify expiration time for specific item + * @tf:timeout-fifo object + * @mad_agent_priv: MAD agent. + * @send_buf: the MAD to modify in queue + * @timeout_ms: new timeout to set. + * + * Returns 0 if item found on list and -ENXIO if not. + * + * Note: The send_buf may point on MAD that is already released. + * Therefore we can't use this struct before finding it in the list + */ +static int tf_modify_item(struct to_fifo *tf, + struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + struct tf_entry *tmp, *item; + struct list_head *list_item; + unsigned long flags; + int found = 0; + + spin_lock_irqsave(&tf->lists_lock, flags); + list_for_each_entry(item, &tf->fifo_head, fifo_list) { + if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv && + &tfe_to_mad(item)->send_buf == send_buf) { + found = 1; + break; + } + } + + if (!found) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return -ENXIO; + } + + item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); + + if (timeout_ms) { + list_del(&item->to_list); + list_for_each_prev(list_item, &tf->to_head) { + tmp = list_entry(list_item, struct tf_entry, to_list); + if (time_after(item->exp_time, tmp->exp_time)) + break; + } + list_add(&item->to_list, list_item); + + /* modify expiration timer if required */ + if (list_item == &tf->to_head) + mod_timer(&tf->timer, item->exp_time); + } else { + /* + * when item canceled (timeout_ms == 0) move item to + * head of timeout list and to the tail of fifo list + */ + item->canceled = 1; + list_move(&item->to_list, &tf->to_head); + list_move_tail(&item->fifo_list, &tf->fifo_head); + mod_timer(&tf->timer, item->exp_time); + } + spin_unlock_irqrestore(&tf->lists_lock, flags); + + return 0; +} + +/* + * SA congestion control functions + */ + +/* + * Defines which MAD is under congestion control. + */ +static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_hdr *mad; + + mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad; + + return ((mad_send_wr->send_buf.timeout_ms) && + (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) && + ((mad->method == IB_MGMT_METHOD_GET) || + (mad->method == IB_MGMT_METHOD_SET))); +} + +/* + * Notify that SA congestion controlled MAD is done. + * to allow dequeuing SA MAD from congestion control queue. 
+ */ +static void sa_cc_mad_done(struct sa_cc_data *cc_obj) +{ + unsigned long flags; + struct tf_entry *tfe; + struct ib_mad_send_wr_private *mad_send_wr; + u32 time_left_ms, timeout_ms, retries; + int ret; + + do { + spin_lock_irqsave(&cc_obj->lock, flags); + tfe = tf_dequeue(cc_obj->tf, &time_left_ms); + if (!tfe) { + if (cc_obj->outstanding > 0) + cc_obj->outstanding--; + spin_unlock_irqrestore(&cc_obj->lock, flags); + break; + } + spin_unlock_irqrestore(&cc_obj->lock, flags); + mad_send_wr = tfe_to_mad(tfe); + time_left_ms += MIN_TIME_FOR_SA_MAD_SEND_MS; + if (time_left_ms > mad_send_wr->send_buf.timeout_ms) { + retries = time_left_ms / mad_send_wr->send_buf.timeout_ms - 1; + timeout_ms = mad_send_wr->send_buf.timeout_ms; + } else { + retries = 0; + timeout_ms = time_left_ms; + } + ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries); + if (ret) { + if (ret == -ENOMEM) + notify_failure(mad_send_wr, IB_WC_GENERAL_ERR); + else + notify_failure(mad_send_wr, IB_WC_LOC_QP_OP_ERR); + } + } while (ret); +} + +/* + * Send SA MAD under congestion control. + */ +static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr) +{ + unsigned long flags; + int ret; + struct sa_cc_data *cc_obj; + + cc_obj = get_cc_obj(mad_send_wr); + spin_lock_irqsave(&cc_obj->lock, flags); + if (cc_obj->outstanding < MAX_OUTSTANDING_SA_MADS) { + cc_obj->outstanding++; + spin_unlock_irqrestore(&cc_obj->lock, flags); + ret = send_sa_cc_mad(mad_send_wr, mad_send_wr->send_buf.timeout_ms, + mad_send_wr->retries_left); + if (ret) + sa_cc_mad_done(cc_obj); + + } else { + int qtime = (mad_send_wr->send_buf.timeout_ms * + (mad_send_wr->retries_left + 1)) + - MIN_TIME_FOR_SA_MAD_SEND_MS; + + if (qtime < 0) + qtime = 0; + ret = tf_enqueue(cc_obj->tf, &mad_send_wr->tf_list, (u32)qtime); + + spin_unlock_irqrestore(&cc_obj->lock, flags); + } + + return ret; +} + +/* + * Initialize SA congestion control. + */ +static int sa_cc_init(struct sa_cc_data *cc_obj) +{ + spin_lock_init(&cc_obj->lock); + cc_obj->outstanding = 0; + cc_obj->tf = tf_create(MAX_SA_MADS); + if (!cc_obj->tf) + return -ENOMEM; + return 0; +} + +/* + * Cancel SA MADs from congestion control queue. + */ +static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf, + mad_agent_priv); +} + +/* + * Modify timeout of SA MAD on congestion control queue. 
+ */ +static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + int ret; + int qtime = 0; + + if (timeout_ms > MIN_TIME_FOR_SA_MAD_SEND_MS) + qtime = timeout_ms - MIN_TIME_FOR_SA_MAD_SEND_MS; + + ret = tf_modify_item(mad_agent_priv->qp_info->port_priv->sa_cc.tf, + mad_agent_priv, send_buf, (u32)qtime); + return ret; +} + +static void sa_cc_destroy(struct sa_cc_data *cc_obj) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct tf_entry *tfe; + struct ib_mad_send_wc mad_send_wc; + struct ib_mad_agent_private *mad_agent_priv; + u32 time_left_ms; + + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + mad_send_wc.vendor_err = 0; + + tf_stop_enqueue(cc_obj->tf); + tfe = tf_dequeue(cc_obj->tf, &time_left_ms); + while (tfe) { + mad_send_wr = tfe_to_mad(tfe); + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv = mad_send_wr->mad_agent_priv; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + tfe = tf_dequeue(cc_obj->tf, &time_left_ms); + } + tf_free(cc_obj->tf); +} /* * Returns a ib_mad_port_private structure or NULL for a device/port @@ -184,15 +707,6 @@ int ib_response_mad(struct ib_mad *mad) } EXPORT_SYMBOL(ib_response_mad); -static void timeout_callback(unsigned long data) -{ - struct ib_mad_agent_private *mad_agent_priv = - (struct ib_mad_agent_private *) data; - - queue_work(mad_agent_priv->qp_info->port_priv->wq, - &mad_agent_priv->timeout_work); -} - /* * ib_register_mad_agent - Register to send/receive MADs */ @@ -285,6 +799,13 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0 */ + if (!port_priv->qp_info[qpn].qp) { + ret = ERR_PTR(-EPROTONOSUPPORT); + goto error1; + } + /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -300,13 +821,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, } if (mad_reg_req) { - reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL); + reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); if (!reg_req) { ret = ERR_PTR(-ENOMEM); goto error3; } - /* Make a copy of the MAD registration request */ - memcpy(reg_req, mad_reg_req, sizeof *reg_req); } /* Now, fill in the various structures */ @@ -324,9 +843,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, INIT_LIST_HEAD(&mad_agent_priv->wait_list); INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); - INIT_WORK(&mad_agent_priv->timeout_work, timeout_sends); - setup_timer(&mad_agent_priv->timeout_timer, timeout_callback, - (unsigned long) mad_agent_priv); + INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); atomic_set(&mad_agent_priv->refcount, 1); @@ -533,8 +1050,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) */ cancel_mads(mad_agent_priv); port_priv = mad_agent_priv->qp_info->port_priv; - del_timer_sync(&mad_agent_priv->timeout_timer); - cancel_work_sync(&mad_agent_priv->timeout_work); + cancel_delayed_work(&mad_agent_priv->timed_work); spin_lock_irqsave(&port_priv->reg_lock, flags); remove_mad_reg_req(mad_agent_priv); @@ -577,6 +1093,7 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; 
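/*
 * Illustrative sketch, not part of this patch: a minimal user-space model of
 * the SA congestion-control admission logic introduced above.  A MAD is sent
 * immediately while fewer than MAX_OUTSTANDING sends are in flight; otherwise
 * it spills into a FIFO and is drained when a send completes, mirroring the
 * behaviour of sa_cc_mad_send() and sa_cc_mad_done().  The names below
 * (struct sa_mad, hw_send, cc_submit, cc_complete, MAX_OUTSTANDING) are
 * hypothetical, and the locking, timeouts and retry accounting of the real
 * code are intentionally omitted.
 */
#include <stdio.h>
#include <stdlib.h>

#define MAX_OUTSTANDING 10

struct sa_mad {
	int id;
	struct sa_mad *next;
};

static int outstanding;                      /* sends currently in flight */
static struct sa_mad *fifo_head, *fifo_tail; /* spill-over queue, FIFO order */

static void hw_send(struct sa_mad *mad)
{
	printf("send SA MAD %d (outstanding=%d)\n", mad->id, outstanding);
	free(mad);                           /* the model consumes the MAD at send time */
}

/* Admit a MAD: send now if a slot is free, otherwise queue it in FIFO order. */
static void cc_submit(struct sa_mad *mad)
{
	if (outstanding < MAX_OUTSTANDING) {
		outstanding++;
		hw_send(mad);
		return;
	}
	mad->next = NULL;
	if (fifo_tail)
		fifo_tail->next = mad;
	else
		fifo_head = mad;
	fifo_tail = mad;
	printf("queue SA MAD %d\n", mad->id);
}

/* A send completed: hand the slot to the next queued MAD, or release it. */
static void cc_complete(void)
{
	struct sa_mad *next = fifo_head;

	if (!next) {
		if (outstanding > 0)
			outstanding--;
		return;
	}
	fifo_head = next->next;
	if (!fifo_head)
		fifo_tail = NULL;
	hw_send(next);                       /* outstanding count is unchanged */
}

int main(void)
{
	int i;

	for (i = 0; i < 15; i++) {
		struct sa_mad *mad = calloc(1, sizeof(*mad));

		if (!mad)
			return 1;
		mad->id = i;
		cc_submit(mad);
	}
	for (i = 0; i < 15; i++)
		cc_complete();
	return 0;
}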
+ if (!IS_ERR(mad_agent)) { /* If the TID is zero, the agent can only snoop. */ if (mad_agent->hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -589,6 +1106,8 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) agent); unregister_mad_snoop(mad_snoop_priv); } + } + return 0; } EXPORT_SYMBOL(ib_unregister_mad_agent); @@ -695,7 +1214,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_wc mad_wc; struct ib_send_wr *send_wr = &mad_send_wr->send_wr; - if (device->node_type == RDMA_NODE_IB_SWITCH) + if (device->node_type == RDMA_NODE_IB_SWITCH && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) port_num = send_wr->wr.ud.port_num; else port_num = mad_agent_priv->agent.port_num; @@ -1028,12 +1548,20 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); - mad_send_wr->header_mapping = sge[0].addr; + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); + + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ret = -ENOMEM; + goto dma1_err; + } + + mad_send_wr->header_mapping = sge[0].addr; mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); @@ -1051,14 +1579,51 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); - if (ret) { + + if (!ret) + return 0; + ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, - sge[0].length, DMA_TO_DEVICE); + sge[1].length, DMA_TO_DEVICE); +dma1_err: ib_dma_unmap_single(mad_agent->device, mad_send_wr->payload_mapping, - sge[1].length, DMA_TO_DEVICE); + sge[0].length, DMA_TO_DEVICE); + return ret; +} + +/* + * Send SA MAD that passed congestion control + */ +static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, + u32 timeout_ms, u32 retries_left) +{ + int ret; + unsigned long flags; + struct ib_mad_agent_private *mad_agent_priv; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + mad_send_wr->retries_left = retries_left; + mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); + + /* Reference MAD agent until send completes */ + atomic_inc(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + ret = ib_send_mad(mad_send_wr); + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); } + return ret; } @@ -1125,6 +1690,12 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); mad_send_wr->status = IB_WC_SUCCESS; + if (is_sa_cc_mad(mad_send_wr)) { + mad_send_wr->is_sa_cc_mad = 1; + ret = sa_cc_mad_send(mad_send_wr); + if (ret < 0) + goto error; + } else { /* Reference MAD agent until send completes */ atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); @@ -1147,6 +1718,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, goto error; } } + } return 0; error: if (bad_send_buf) @@ -1206,10 +1778,7 @@ static int 
method_in_use(struct ib_mad_mgmt_method_table **method, { int i; - for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS); - i < IB_MGMT_MAX_METHODS; - i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS, - 1+i)) { + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { printk(KERN_ERR PFX "Method %d already in use\n", i); return -EINVAL; @@ -1343,13 +1912,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, goto error3; /* Finally, add in methods being registered */ - for (i = find_first_bit(mad_reg_req->method_mask, - IB_MGMT_MAX_METHODS); - i < IB_MGMT_MAX_METHODS; - i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS, - 1+i)) { + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; - } + return 0; error3: @@ -1442,13 +2007,9 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, goto error4; /* Finally, add in methods being registered */ - for (i = find_first_bit(mad_reg_req->method_mask, - IB_MGMT_MAX_METHODS); - i < IB_MGMT_MAX_METHODS; - i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS, - 1+i)) { + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) (*method)->agent[i] = agent_priv; - } + return 0; error4: @@ -1614,6 +2175,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv, mad->mad_hdr.class_version].class; if (!class) goto out; + if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= + IB_MGMT_MAX_METHODS) + goto out; method = class->method_table[convert_mgmt_class( mad->mad_hdr.mgmt_class)]; if (method) @@ -1856,6 +2420,26 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } } +static bool generate_unmatched_resp(struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || + recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, sizeof *response); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + response->mad.mad.mad_hdr.status = + cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION; + + return true; + } else { + return false; + } +} static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { @@ -1865,6 +2449,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; + int ret = IB_MAD_RESULT_SUCCESS; mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; qp_info = mad_list->mad_queue->qp_info; @@ -1948,8 +2533,6 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, local: /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { - int ret; - ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, @@ -1977,6 +2560,10 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, * or via recv_handler in ib_mad_complete_recv() */ recv = NULL; + } else if ((ret & IB_MAD_RESULT_SUCCESS) && + generate_unmatched_resp(recv, response)) { + agent_send_response(&response->mad.mad, 
&recv->grh, wc, + port_priv->device, port_num, qp_info->qp->qp_num); } out: @@ -1992,9 +2579,10 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_send_wr_private *mad_send_wr; + unsigned long delay; if (list_empty(&mad_agent_priv->wait_list)) { - del_timer(&mad_agent_priv->timeout_timer); + cancel_delayed_work(&mad_agent_priv->timed_work); } else { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, @@ -2003,8 +2591,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) { mad_agent_priv->timeout = mad_send_wr->timeout; - mod_timer(&mad_agent_priv->timeout_timer, - mad_send_wr->timeout); + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); } } } @@ -2031,14 +2622,15 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) temp_mad_send_wr->timeout)) break; } - } else + } + else list_item = &mad_agent_priv->wait_list; list_add(&mad_send_wr->agent_list, list_item); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) - mod_timer(&mad_agent_priv->timeout_timer, - mad_send_wr->timeout); + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, @@ -2090,9 +2682,12 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, mad_send_wc->status = mad_send_wr->status; if (ret == IB_RMPP_RESULT_INTERNAL) ib_rmpp_send_handler(mad_send_wc); - else + else { + if (mad_send_wr->is_sa_cc_mad) + sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); + } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); @@ -2272,6 +2867,7 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) INIT_LIST_HEAD(&cancel_list); + cancel_sa_cc_mads(mad_agent_priv); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &mad_agent_priv->send_list, agent_list) { @@ -2293,6 +2889,8 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) &cancel_list, agent_list) { mad_send_wc.send_buf = &mad_send_wr->send_buf; list_del(&mad_send_wr->agent_list); + if (mad_send_wr->is_sa_cc_mad) + sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); @@ -2332,7 +2930,13 @@ int ib_modify_mad(struct ib_mad_agent *mad_agent, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); - if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { + if (!mad_send_wr) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms)) + return -EINVAL; + return 0; + } + if (mad_send_wr->status != IB_WC_SUCCESS) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } @@ -2482,10 +3086,10 @@ static void timeout_sends(struct work_struct *work) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; - unsigned long flags; + unsigned 
long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, - timeout_work); + timed_work.work); mad_send_wc.vendor_err = 0; spin_lock_irqsave(&mad_agent_priv->lock, flags); @@ -2495,8 +3099,12 @@ static void timeout_sends(struct work_struct *work) agent_list); if (time_after(mad_send_wr->timeout, jiffies)) { - mod_timer(&mad_agent_priv->timeout_timer, - mad_send_wr->timeout); + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(mad_agent_priv->qp_info-> + port_priv->wq, + &mad_agent_priv->timed_work, delay); break; } @@ -2512,6 +3120,8 @@ static void timeout_sends(struct work_struct *work) else mad_send_wc.status = mad_send_wr->status; mad_send_wc.send_buf = &mad_send_wr->send_buf; + if (mad_send_wr->is_sa_cc_mad) + sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); @@ -2572,6 +3182,14 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, sizeof *mad_priv - sizeof mad_priv->header, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + ret = -ENOMEM; + kmem_cache_free(ib_mad_cache, mad_priv); + printk(KERN_ERR PFX "ib_dma_map_single failed\n"); + break; + } + mad_priv->header.mapping = sg_list.addr; recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; @@ -2645,6 +3263,7 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; + u16 pkey_index = 0; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { @@ -2652,6 +3271,11 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) return -ENOMEM; } + ret = ib_find_pkey(port_priv->device, port_priv->port_num, + 0xFFFF, &pkey_index); + if (ret) + pkey_index = 0; + for (i = 0; i < IB_MAD_QPS_CORE; i++) { qp = port_priv->qp_info[i].qp; if (!qp) @@ -2662,7 +3286,7 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) * one is needed for the Reset to Init transition */ attr->qp_state = IB_QPS_INIT; - attr->pkey_index = 0; + attr->pkey_index = pkey_index; attr->qkey = (qp->qp_num == 0) ? 
0 : IB_QP1_QKEY; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); @@ -2858,6 +3482,10 @@ static int ib_mad_port_open(struct ib_device *device, } INIT_WORK(&port_priv->work, ib_mad_completion_handler); + if (sa_cc_init(&port_priv->sa_cc)) + goto error9; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); @@ -2865,17 +3493,19 @@ static int ib_mad_port_open(struct ib_device *device, ret = ib_mad_port_start(port_priv); if (ret) { printk(KERN_ERR PFX "Couldn't start port\n"); - goto error9; + goto error10; } return 0; -error9: +error10: spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); +error9: + sa_cc_destroy(&port_priv->sa_cc); error8: destroy_mad_qp(&port_priv->qp_info[1]); error7: @@ -2915,6 +3545,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); + sa_cc_destroy(&port_priv->sa_cc); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_dereg_mr(port_priv->mr); @@ -2983,6 +3614,9 @@ static void ib_mad_remove_device(struct ib_device *device) { int i, num_ports, cur_port; + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + if (device->node_type == RDMA_NODE_IB_SWITCH) { num_ports = 1; cur_port = 0; @@ -3017,8 +3651,6 @@ static int __init ib_mad_init_module(void) mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); - spin_lock_init(&ib_mad_port_list_lock); - ib_mad_cache = kmem_cache_create("ib_mad", sizeof(struct ib_mad_private), 0, @@ -3054,4 +3686,3 @@ static void __exit ib_mad_cleanup_module(void) module_init(ib_mad_init_module); module_exit(ib_mad_cleanup_module); - diff --git a/sys/ofed/drivers/infiniband/core/mad_priv.h b/sys/ofed/drivers/infiniband/core/mad_priv.h index 8b4df0a33e0b..e2cd0ac0a514 100644 --- a/sys/ofed/drivers/infiniband/core/mad_priv.h +++ b/sys/ofed/drivers/infiniband/core/mad_priv.h @@ -102,8 +102,7 @@ struct ib_mad_agent_private { struct list_head send_list; struct list_head wait_list; struct list_head done_list; - struct work_struct timeout_work; - struct timer_list timeout_timer; + struct delayed_work timed_work; unsigned long timeout; struct list_head local_list; struct work_struct local_work; @@ -122,6 +121,14 @@ struct ib_mad_snoop_private { struct completion comp; }; +/* Structure for timeout-fifo entry */ +struct tf_entry { + unsigned long exp_time; /* entry expiration time */ + struct list_head fifo_list; /* to keep entries in fifo order */ + struct list_head to_list; /* to keep entries in timeout order */ + int canceled; /* indicates whether entry is canceled */ +}; + struct ib_mad_send_wr_private { struct ib_mad_list_head mad_list; struct list_head agent_list; @@ -147,6 +154,10 @@ struct ib_mad_send_wr_private { int seg_num; int newwin; int pad; + + /* SA congestion controlled MAD */ + int is_sa_cc_mad; + struct tf_entry tf_list; }; struct ib_mad_local_private { @@ -198,6 +209,25 @@ struct ib_mad_qp_info { atomic_t snoop_count; }; +struct to_fifo { + struct list_head to_head; + struct list_head fifo_head; + spinlock_t lists_lock; + struct timer_list timer; + struct work_struct work; + u32 fifo_size; + u32 num_items; + int stop_enqueue; + struct workqueue_struct 
*workq; +}; + +/* SA congestion control data */ +struct sa_cc_data { + spinlock_t lock; + unsigned long outstanding; + struct to_fifo *tf; +}; + struct ib_mad_port_private { struct list_head port_list; struct ib_device *device; @@ -212,6 +242,7 @@ struct ib_mad_port_private { struct workqueue_struct *wq; struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; + struct sa_cc_data sa_cc; }; int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.c b/sys/ofed/drivers/infiniband/core/mad_rmpp.c index 4e0f2829e0e5..f37878c9c06e 100644 --- a/sys/ofed/drivers/infiniband/core/mad_rmpp.c +++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.c @@ -31,6 +31,8 @@ * SOFTWARE. */ +#include + #include "mad_priv.h" #include "mad_rmpp.h" diff --git a/sys/ofed/drivers/infiniband/core/multicast.c b/sys/ofed/drivers/infiniband/core/multicast.c index f8d7ef81e190..ef595b247cec 100644 --- a/sys/ofed/drivers/infiniband/core/multicast.c +++ b/sys/ofed/drivers/infiniband/core/multicast.c @@ -34,12 +34,27 @@ #include #include #include +#include +#include #include #include +#include +#include #include #include "sa.h" +static int mcast_leave_retries = 3; + +/*static const struct kernel_param_ops retry_ops = { + .set = param_set_int, + .get = param_get_int, +}; + +module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644); +MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave " + "requests before giving up (default: 3)"); +*/ static void mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device); @@ -250,6 +265,34 @@ static u8 get_leave_state(struct mcast_group *group) return leave_state & group->rec.join_state; } +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 selector, u8 src_value, u8 dst_value) +{ + int err; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + static int cmp_rec(struct ib_sa_mcmember_rec *src, struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) { @@ -262,7 +305,7 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src, return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; - if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, src->mtu, dst->mtu)) return -EINVAL; @@ -271,11 +314,11 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src, return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; - if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, src->rate, dst->rate)) return -EINVAL; - if (ib_sa_check_selector(comp_mask, + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, dst->packet_life_time_selector, @@ -517,11 +560,15 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, { struct mcast_group *group = context; - if (status && (group->retries > 0) && + if (status && group->retries 
> 0 && !send_leave(group, group->leave_state)) group->retries--; - else + else { + if (status && group->retries <= 0) + printk(KERN_WARNING "reached max retry count. " + "status=%d. Giving up\n", status); mcast_work_handler(&group->work); + } } static struct mcast_group *acquire_group(struct mcast_port *port, @@ -544,7 +591,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port, if (!group) return NULL; - group->retries = 3; + group->retries = mcast_leave_retries; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; @@ -754,7 +801,6 @@ static void mcast_event_handler(struct ib_event_handler *handler, switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: - case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; diff --git a/sys/ofed/drivers/infiniband/core/notice.c b/sys/ofed/drivers/infiniband/core/notice.c deleted file mode 100644 index ca91d96d64d8..000000000000 --- a/sys/ofed/drivers/infiniband/core/notice.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Copyright (c) 2006 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include "sa.h" - -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("InfiniBand InformInfo & Notice event handling"); -MODULE_LICENSE("Dual BSD/GPL"); - -static void inform_add_one(struct ib_device *device); -static void inform_remove_one(struct ib_device *device); - -static struct ib_client inform_client = { - .name = "ib_notice", - .add = inform_add_one, - .remove = inform_remove_one -}; - -static struct ib_sa_client sa_client; -static struct workqueue_struct *inform_wq; - -struct inform_device; - -struct inform_port { - struct inform_device *dev; - spinlock_t lock; - struct rb_root table; - atomic_t refcount; - struct completion comp; - u8 port_num; -}; - -struct inform_device { - struct ib_device *device; - struct ib_event_handler event_handler; - int start_port; - int end_port; - struct inform_port port[0]; -}; - -enum inform_state { - INFORM_IDLE, - INFORM_REGISTERING, - INFORM_MEMBER, - INFORM_BUSY, - INFORM_ERROR -}; - -struct inform_member; - -struct inform_group { - u16 trap_number; - struct rb_node node; - struct inform_port *port; - spinlock_t lock; - struct work_struct work; - struct list_head pending_list; - struct list_head active_list; - struct list_head notice_list; - struct inform_member *last_join; - int members; - enum inform_state join_state; /* State relative to SA */ - atomic_t refcount; - enum inform_state state; - struct ib_sa_query *query; - int query_id; -}; - -struct inform_member { - struct ib_inform_info info; - struct ib_sa_client *client; - struct inform_group *group; - struct list_head list; - enum inform_state state; - atomic_t refcount; - struct completion comp; -}; - -struct inform_notice { - struct list_head list; - struct ib_sa_notice notice; -}; - -static void reg_handler(int status, struct ib_sa_inform *inform, - void *context); -static void unreg_handler(int status, struct ib_sa_inform *inform, - void *context); - -static struct inform_group *inform_find(struct inform_port *port, - u16 trap_number) -{ - struct rb_node *node = port->table.rb_node; - struct inform_group *group; - - while (node) { - group = rb_entry(node, struct inform_group, node); - if (trap_number < group->trap_number) - node = node->rb_left; - else if (trap_number > group->trap_number) - node = node->rb_right; - else - return group; - } - return NULL; -} - -static struct inform_group *inform_insert(struct inform_port *port, - struct inform_group *group) -{ - struct rb_node **link = &port->table.rb_node; - struct rb_node *parent = NULL; - struct inform_group *cur_group; - - while (*link) { - parent = *link; - cur_group = rb_entry(parent, struct inform_group, node); - if (group->trap_number < cur_group->trap_number) - link = &(*link)->rb_left; - else if (group->trap_number > cur_group->trap_number) - link = &(*link)->rb_right; - else - return cur_group; - } - rb_link_node(&group->node, parent, link); - rb_insert_color(&group->node, &port->table); - return NULL; -} - -static void deref_port(struct inform_port *port) -{ - if (atomic_dec_and_test(&port->refcount)) - complete(&port->comp); -} - -static void release_group(struct inform_group *group) -{ - struct inform_port *port = group->port; - unsigned long flags; - - spin_lock_irqsave(&port->lock, flags); - if (atomic_dec_and_test(&group->refcount)) { - rb_erase(&group->node, &port->table); - spin_unlock_irqrestore(&port->lock, flags); - kfree(group); - deref_port(port); - } else - spin_unlock_irqrestore(&port->lock, flags); -} - -static void 
deref_member(struct inform_member *member) -{ - if (atomic_dec_and_test(&member->refcount)) - complete(&member->comp); -} - -static void queue_reg(struct inform_member *member) -{ - struct inform_group *group = member->group; - unsigned long flags; - - spin_lock_irqsave(&group->lock, flags); - list_add(&member->list, &group->pending_list); - if (group->state == INFORM_IDLE) { - group->state = INFORM_BUSY; - atomic_inc(&group->refcount); - queue_work(inform_wq, &group->work); - } - spin_unlock_irqrestore(&group->lock, flags); -} - -static int send_reg(struct inform_group *group, struct inform_member *member) -{ - struct inform_port *port = group->port; - struct ib_sa_inform inform; - int ret; - - memset(&inform, 0, sizeof inform); - inform.lid_range_begin = cpu_to_be16(0xFFFF); - inform.is_generic = 1; - inform.subscribe = 1; - inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL); - inform.trap.generic.trap_num = cpu_to_be16(member->info.trap_number); - inform.trap.generic.resp_time = 19; - inform.trap.generic.producer_type = - cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL); - - group->last_join = member; - ret = ib_sa_informinfo_query(&sa_client, port->dev->device, - port->port_num, &inform, 3000, GFP_KERNEL, - reg_handler, group,&group->query); - if (ret >= 0) { - group->query_id = ret; - ret = 0; - } - return ret; -} - -static int send_unreg(struct inform_group *group) -{ - struct inform_port *port = group->port; - struct ib_sa_inform inform; - int ret; - - memset(&inform, 0, sizeof inform); - inform.lid_range_begin = cpu_to_be16(0xFFFF); - inform.is_generic = 1; - inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL); - inform.trap.generic.trap_num = cpu_to_be16(group->trap_number); - inform.trap.generic.qpn = IB_QP1; - inform.trap.generic.resp_time = 19; - inform.trap.generic.producer_type = - cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL); - - ret = ib_sa_informinfo_query(&sa_client, port->dev->device, - port->port_num, &inform, 3000, GFP_KERNEL, - unreg_handler, group, &group->query); - if (ret >= 0) { - group->query_id = ret; - ret = 0; - } - return ret; -} - -static void join_group(struct inform_group *group, struct inform_member *member) -{ - member->state = INFORM_MEMBER; - group->members++; - list_move(&member->list, &group->active_list); -} - -static int fail_join(struct inform_group *group, struct inform_member *member, - int status) -{ - spin_lock_irq(&group->lock); - list_del_init(&member->list); - spin_unlock_irq(&group->lock); - return member->info.callback(status, &member->info, NULL); -} - -static void process_group_error(struct inform_group *group) -{ - struct inform_member *member; - int ret; - - spin_lock_irq(&group->lock); - while (!list_empty(&group->active_list)) { - member = list_entry(group->active_list.next, - struct inform_member, list); - atomic_inc(&member->refcount); - list_del_init(&member->list); - group->members--; - member->state = INFORM_ERROR; - spin_unlock_irq(&group->lock); - - ret = member->info.callback(-ENETRESET, &member->info, NULL); - deref_member(member); - if (ret) - ib_sa_unregister_inform_info(&member->info); - spin_lock_irq(&group->lock); - } - - group->join_state = INFORM_IDLE; - group->state = INFORM_BUSY; - spin_unlock_irq(&group->lock); -} - -/* - * Report a notice to all active subscribers. We use a temporary list to - * handle unsubscription requests while the notice is being reported, which - * avoids holding the group lock while in the user's callback. 
- */ -static void process_notice(struct inform_group *group, - struct inform_notice *info_notice) -{ - struct inform_member *member; - struct list_head list; - int ret; - - INIT_LIST_HEAD(&list); - - spin_lock_irq(&group->lock); - list_splice_init(&group->active_list, &list); - while (!list_empty(&list)) { - - member = list_entry(list.next, struct inform_member, list); - atomic_inc(&member->refcount); - list_move(&member->list, &group->active_list); - spin_unlock_irq(&group->lock); - - ret = member->info.callback(0, &member->info, - &info_notice->notice); - deref_member(member); - if (ret) - ib_sa_unregister_inform_info(&member->info); - spin_lock_irq(&group->lock); - } - spin_unlock_irq(&group->lock); -} - -static void inform_work_handler(struct work_struct *work) -{ - struct inform_group *group; - struct inform_member *member; - struct ib_inform_info *info; - struct inform_notice *info_notice; - int status, ret; - - group = container_of(work, typeof(*group), work); -retest: - spin_lock_irq(&group->lock); - while (!list_empty(&group->pending_list) || - !list_empty(&group->notice_list) || - (group->state == INFORM_ERROR)) { - - if (group->state == INFORM_ERROR) { - spin_unlock_irq(&group->lock); - process_group_error(group); - goto retest; - } - - if (!list_empty(&group->notice_list)) { - info_notice = list_entry(group->notice_list.next, - struct inform_notice, list); - list_del(&info_notice->list); - spin_unlock_irq(&group->lock); - process_notice(group, info_notice); - kfree(info_notice); - goto retest; - } - - member = list_entry(group->pending_list.next, - struct inform_member, list); - info = &member->info; - atomic_inc(&member->refcount); - - if (group->join_state == INFORM_MEMBER) { - join_group(group, member); - spin_unlock_irq(&group->lock); - ret = info->callback(0, info, NULL); - } else { - spin_unlock_irq(&group->lock); - status = send_reg(group, member); - if (!status) { - deref_member(member); - return; - } - ret = fail_join(group, member, status); - } - - deref_member(member); - if (ret) - ib_sa_unregister_inform_info(&member->info); - spin_lock_irq(&group->lock); - } - - if (!group->members && (group->join_state == INFORM_MEMBER)) { - group->join_state = INFORM_IDLE; - spin_unlock_irq(&group->lock); - if (send_unreg(group)) - goto retest; - } else { - group->state = INFORM_IDLE; - spin_unlock_irq(&group->lock); - release_group(group); - } -} - -/* - * Fail a join request if it is still active - at the head of the pending queue. 
- */ -static void process_join_error(struct inform_group *group, int status) -{ - struct inform_member *member; - int ret; - - spin_lock_irq(&group->lock); - member = list_entry(group->pending_list.next, - struct inform_member, list); - if (group->last_join == member) { - atomic_inc(&member->refcount); - list_del_init(&member->list); - spin_unlock_irq(&group->lock); - ret = member->info.callback(status, &member->info, NULL); - deref_member(member); - if (ret) - ib_sa_unregister_inform_info(&member->info); - } else - spin_unlock_irq(&group->lock); -} - -static void reg_handler(int status, struct ib_sa_inform *inform, void *context) -{ - struct inform_group *group = context; - - if (status) - process_join_error(group, status); - else - group->join_state = INFORM_MEMBER; - - inform_work_handler(&group->work); -} - -static void unreg_handler(int status, struct ib_sa_inform *rec, void *context) -{ - struct inform_group *group = context; - - inform_work_handler(&group->work); -} - -int notice_dispatch(struct ib_device *device, u8 port_num, - struct ib_sa_notice *notice) -{ - struct inform_device *dev; - struct inform_port *port; - struct inform_group *group; - struct inform_notice *info_notice; - - dev = ib_get_client_data(device, &inform_client); - if (!dev) - return 0; /* No one to give notice to. */ - - port = &dev->port[port_num - dev->start_port]; - spin_lock_irq(&port->lock); - group = inform_find(port, __be16_to_cpu(notice->trap. - generic.trap_num)); - if (!group) { - spin_unlock_irq(&port->lock); - return 0; - } - - atomic_inc(&group->refcount); - spin_unlock_irq(&port->lock); - - info_notice = kmalloc(sizeof *info_notice, GFP_KERNEL); - if (!info_notice) { - release_group(group); - return -ENOMEM; - } - - info_notice->notice = *notice; - - spin_lock_irq(&group->lock); - list_add(&info_notice->list, &group->notice_list); - if (group->state == INFORM_IDLE) { - group->state = INFORM_BUSY; - spin_unlock_irq(&group->lock); - inform_work_handler(&group->work); - } else { - spin_unlock_irq(&group->lock); - release_group(group); - } - - return 0; -} - -static struct inform_group *acquire_group(struct inform_port *port, - u16 trap_number, gfp_t gfp_mask) -{ - struct inform_group *group, *cur_group; - unsigned long flags; - - spin_lock_irqsave(&port->lock, flags); - group = inform_find(port, trap_number); - if (group) - goto found; - spin_unlock_irqrestore(&port->lock, flags); - - group = kzalloc(sizeof *group, gfp_mask); - if (!group) - return NULL; - - group->port = port; - group->trap_number = trap_number; - INIT_LIST_HEAD(&group->pending_list); - INIT_LIST_HEAD(&group->active_list); - INIT_LIST_HEAD(&group->notice_list); - INIT_WORK(&group->work, inform_work_handler); - spin_lock_init(&group->lock); - - spin_lock_irqsave(&port->lock, flags); - cur_group = inform_insert(port, group); - if (cur_group) { - kfree(group); - group = cur_group; - } else - atomic_inc(&port->refcount); -found: - atomic_inc(&group->refcount); - spin_unlock_irqrestore(&port->lock, flags); - return group; -} - -/* - * We serialize all join requests to a single group to make our lives much - * easier. Otherwise, two users could try to join the same group - * simultaneously, with different configurations, one could leave while the - * join is in progress, etc., which makes locking around error recovery - * difficult. 
- */ -struct ib_inform_info * -ib_sa_register_inform_info(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u16 trap_number, gfp_t gfp_mask, - int (*callback)(int status, - struct ib_inform_info *info, - struct ib_sa_notice *notice), - void *context) -{ - struct inform_device *dev; - struct inform_member *member; - struct ib_inform_info *info; - int ret; - - dev = ib_get_client_data(device, &inform_client); - if (!dev) - return ERR_PTR(-ENODEV); - - member = kzalloc(sizeof *member, gfp_mask); - if (!member) - return ERR_PTR(-ENOMEM); - - ib_sa_client_get(client); - member->client = client; - member->info.trap_number = trap_number; - member->info.callback = callback; - member->info.context = context; - init_completion(&member->comp); - atomic_set(&member->refcount, 1); - member->state = INFORM_REGISTERING; - - member->group = acquire_group(&dev->port[port_num - dev->start_port], - trap_number, gfp_mask); - if (!member->group) { - ret = -ENOMEM; - goto err; - } - - /* - * The user will get the info structure in their callback. They - * could then free the info structure before we can return from - * this routine. So we save the pointer to return before queuing - * any callback. - */ - info = &member->info; - queue_reg(member); - return info; - -err: - ib_sa_client_put(member->client); - kfree(member); - return ERR_PTR(ret); -} -EXPORT_SYMBOL(ib_sa_register_inform_info); - -void ib_sa_unregister_inform_info(struct ib_inform_info *info) -{ - struct inform_member *member; - struct inform_group *group; - - member = container_of(info, struct inform_member, info); - group = member->group; - - spin_lock_irq(&group->lock); - if (member->state == INFORM_MEMBER) - group->members--; - - list_del_init(&member->list); - - if (group->state == INFORM_IDLE) { - group->state = INFORM_BUSY; - spin_unlock_irq(&group->lock); - /* Continue to hold reference on group until callback */ - queue_work(inform_wq, &group->work); - } else { - spin_unlock_irq(&group->lock); - release_group(group); - } - - deref_member(member); - wait_for_completion(&member->comp); - ib_sa_client_put(member->client); - kfree(member); -} -EXPORT_SYMBOL(ib_sa_unregister_inform_info); - -static void inform_groups_lost(struct inform_port *port) -{ - struct inform_group *group; - struct rb_node *node; - unsigned long flags; - - spin_lock_irqsave(&port->lock, flags); - for (node = rb_first(&port->table); node; node = rb_next(node)) { - group = rb_entry(node, struct inform_group, node); - spin_lock(&group->lock); - if (group->state == INFORM_IDLE) { - atomic_inc(&group->refcount); - queue_work(inform_wq, &group->work); - } - group->state = INFORM_ERROR; - spin_unlock(&group->lock); - } - spin_unlock_irqrestore(&port->lock, flags); -} - -static void inform_event_handler(struct ib_event_handler *handler, - struct ib_event *event) -{ - struct inform_device *dev; - - dev = container_of(handler, struct inform_device, event_handler); - - switch (event->event) { - case IB_EVENT_PORT_ERR: - case IB_EVENT_LID_CHANGE: - case IB_EVENT_SM_CHANGE: - case IB_EVENT_CLIENT_REREGISTER: - inform_groups_lost(&dev->port[event->element.port_num - - dev->start_port]); - break; - default: - break; - } -} - -static void inform_add_one(struct ib_device *device) -{ - struct inform_device *dev; - struct inform_port *port; - int i; - - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - - dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, - GFP_KERNEL); - if (!dev) - return; - - if (device->node_type == 
RDMA_NODE_IB_SWITCH) - dev->start_port = dev->end_port = 0; - else { - dev->start_port = 1; - dev->end_port = device->phys_port_cnt; - } - - for (i = 0; i <= dev->end_port - dev->start_port; i++) { - port = &dev->port[i]; - port->dev = dev; - port->port_num = dev->start_port + i; - spin_lock_init(&port->lock); - port->table = RB_ROOT; - init_completion(&port->comp); - atomic_set(&port->refcount, 1); - } - - dev->device = device; - ib_set_client_data(device, &inform_client, dev); - - INIT_IB_EVENT_HANDLER(&dev->event_handler, device, inform_event_handler); - ib_register_event_handler(&dev->event_handler); -} - -static void inform_remove_one(struct ib_device *device) -{ - struct inform_device *dev; - struct inform_port *port; - int i; - - dev = ib_get_client_data(device, &inform_client); - if (!dev) - return; - - ib_unregister_event_handler(&dev->event_handler); - flush_workqueue(inform_wq); - - for (i = 0; i <= dev->end_port - dev->start_port; i++) { - port = &dev->port[i]; - deref_port(port); - wait_for_completion(&port->comp); - } - - kfree(dev); -} - -int notice_init(void) -{ - int ret; - - inform_wq = create_singlethread_workqueue("ib_inform"); - if (!inform_wq) - return -ENOMEM; - - ib_sa_register_client(&sa_client); - - ret = ib_register_client(&inform_client); - if (ret) - goto err; - return 0; - -err: - ib_sa_unregister_client(&sa_client); - destroy_workqueue(inform_wq); - return ret; -} - -void notice_cleanup(void) -{ - ib_unregister_client(&inform_client); - ib_sa_unregister_client(&sa_client); - destroy_workqueue(inform_wq); -} diff --git a/sys/ofed/drivers/infiniband/core/packer.c b/sys/ofed/drivers/infiniband/core/packer.c index 019bd4b0863e..9f42595a165c 100644 --- a/sys/ofed/drivers/infiniband/core/packer.c +++ b/sys/ofed/drivers/infiniband/core/packer.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include diff --git a/sys/ofed/drivers/infiniband/core/peer_mem.c b/sys/ofed/drivers/infiniband/core/peer_mem.c new file mode 100644 index 000000000000..cd716a461ecc --- /dev/null +++ b/sys/ofed/drivers/infiniband/core/peer_mem.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2013, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include
+#include
+#include
+
+static DEFINE_MUTEX(peer_memory_mutex);
+static LIST_HEAD(peer_memory_list);
+
+static int num_registered_peers;
+
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * Will be added in future to the sysctl */
+
+#if 0
+static struct kobject *peers_kobj;
+static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj);
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%s\n", ib_peer_client->peer_mem->version);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_alloc_mrs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_alloc_mrs);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_reg_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_reg_pages);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_dereg_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_dereg_pages);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static ssize_t num_free_callbacks_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
+
+ if (ib_peer_client) {
+ sprintf(buf, "%lu\n", ib_peer_client->stats.num_free_callbacks);
+ return strlen(buf);
+ }
+ /* not found - nothing is returned */
+ return 0;
+}
+
+static struct kobj_attribute version_attr = __ATTR_RO(version);
+static struct kobj_attribute num_alloc_mrs = __ATTR_RO(num_alloc_mrs);
+static struct kobj_attribute num_reg_pages = __ATTR_RO(num_reg_pages);
+static struct kobj_attribute num_dereg_pages = __ATTR_RO(num_dereg_pages);
+static struct kobj_attribute num_free_callbacks = __ATTR_RO(num_free_callbacks);
+
+static struct attribute *peer_mem_attrs[] = {
+ &version_attr.attr,
+ &num_alloc_mrs.attr,
+ &num_reg_pages.attr,
+ &num_dereg_pages.attr,
+ &num_free_callbacks.attr,
+ NULL,
+};
+#endif
+
+#if 0
+static void destroy_peer_sysfs(struct ib_peer_memory_client *ib_peer_client)
+{
+ kobject_put(ib_peer_client->kobj);
+ if (!num_registered_peers)
+ kobject_put(peers_kobj);
+
+ return;
+}
+
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * Will be added in future to the sysctl */
+
+static int create_peer_sysfs(struct ib_peer_memory_client *ib_peer_client)
+{
+ int ret;
+
+ if (!num_registered_peers) {
+ /* creating under /sys/kernel/mm */
+ peers_kobj = kobject_create_and_add("memory_peers", mm_kobj);
+ if (!peers_kobj)
+ return -ENOMEM;
+ }
+
+ ib_peer_client->peer_mem_attr_group.attrs = peer_mem_attrs;
+ /* Dir was already created explicitly to get its kernel object for further usage */
+ ib_peer_client->peer_mem_attr_group.name = NULL;
+ ib_peer_client->kobj = kobject_create_and_add(ib_peer_client->peer_mem->name,
+ peers_kobj);
+
+ if (!ib_peer_client->kobj) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ /* Create the files associated with this kobject */
+ ret = sysfs_create_group(ib_peer_client->kobj,
+ &ib_peer_client->peer_mem_attr_group);
+ if (ret)
+ goto peer_free;
+
+ return 0;
+
+peer_free:
+ kobject_put(ib_peer_client->kobj);
+
+free:
+ if (!num_registered_peers)
+ kobject_put(peers_kobj);
+
+ return ret;
+}
+#endif
+
+static int ib_invalidate_peer_memory(void *reg_handle,
+ void *core_context)
+{
+ struct ib_peer_memory_client *ib_peer_client =
+ (struct ib_peer_memory_client *)reg_handle;
+ struct invalidation_ctx *invalidation_ctx;
+ struct core_ticket *core_ticket;
+ int need_unlock = 1;
+
+ mutex_lock(&ib_peer_client->lock);
+ ib_peer_client->stats.num_free_callbacks += 1;
+ core_ticket = ib_peer_search_context(ib_peer_client,
+ (unsigned long)core_context);
+ if (!core_ticket)
+ goto out;
+
+ invalidation_ctx = (struct invalidation_ctx *)core_ticket->context;
+ /* If the context is not ready yet, mark it to be invalidated */
+ if (!invalidation_ctx->func) {
+ invalidation_ctx->peer_invalidated = 1;
+ goto out;
+ }
+
+ invalidation_ctx->func(invalidation_ctx->cookie,
+ invalidation_ctx->umem, 0, 0);
+ if (invalidation_ctx->inflight_invalidation) {
+
+ /* init the completion to wait on before letting the other thread run */
+ init_completion(&invalidation_ctx->comp);
+ mutex_unlock(&ib_peer_client->lock);
+ need_unlock = 0;
+ wait_for_completion(&invalidation_ctx->comp);
+ }
+
+ kfree(invalidation_ctx);
+
+out:
+ if (need_unlock)
+ mutex_unlock(&ib_peer_client->lock);
+
+ return 0;
+}
+
+/* access to that peer client is under its lock - no extra lock is needed */
+unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client,
+ void *context)
+{
+ struct core_ticket *core_ticket = kzalloc(sizeof(*core_ticket), GFP_KERNEL);
+
+ ib_peer_client->last_ticket++;
+ core_ticket->context = context;
+ core_ticket->key = ib_peer_client->last_ticket;
+
+ list_add_tail(&core_ticket->ticket_list,
+ &ib_peer_client->core_ticket_list);
+
+ return core_ticket->key;
+}
+
+int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client,
+ unsigned long key)
+{
+ struct core_ticket *core_ticket, *tmp;
+
+ list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list,
+ ticket_list) {
+ if (core_ticket->key == key) {
+ list_del(&core_ticket->ticket_list);
+ kfree(core_ticket);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client,
+ unsigned long key)
+{
+ struct core_ticket *core_ticket, *tmp;
+ list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list,
+ ticket_list) {
+ if (core_ticket->key == key)
+ return core_ticket;
+ }
+
+ return NULL;
+}
+
+
+static int ib_memory_peer_check_mandatory(struct peer_memory_client
+ *peer_client)
+{
+#define PEER_MEM_MANDATORY_FUNC(x) {\
+ offsetof(struct peer_memory_client, x), #x }
+
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ PEER_MEM_MANDATORY_FUNC(acquire),
+ PEER_MEM_MANDATORY_FUNC(get_pages),
+ PEER_MEM_MANDATORY_FUNC(put_pages),
+ PEER_MEM_MANDATORY_FUNC(get_page_size),
+ PEER_MEM_MANDATORY_FUNC(dma_map),
+ PEER_MEM_MANDATORY_FUNC(dma_unmap)
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **) ((void *) peer_client + mandatory_table[i].offset)) {
+ printk(KERN_WARNING "Peer memory %s is missing mandatory function %s\n",
+ peer_client->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+
+
+void *ib_register_peer_memory_client(struct peer_memory_client *peer_client,
+ invalidate_peer_memory *invalidate_callback)
+{
+ int ret = 0;
+ struct ib_peer_memory_client *ib_peer_client = NULL;
+
+ mutex_lock(&peer_memory_mutex);
+ if (ib_memory_peer_check_mandatory(peer_client)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
+ if (!ib_peer_client)
+ goto out;
+ ib_peer_client->peer_mem = peer_client;
+
+ INIT_LIST_HEAD(&ib_peer_client->core_ticket_list);
+ mutex_init(&ib_peer_client->lock);
+#ifdef __FreeBSD__
+ ib_peer_client->holdcount = 0;
+ ib_peer_client->needwakeup = 0;
+ cv_init(&ib_peer_client->peer_cv, "ibprcl");
+#else
+ ret = init_srcu_struct(&ib_peer_client->peer_srcu);
+ if (ret)
+ goto free;
+#endif
+#if 0
+ if (create_peer_sysfs(ib_peer_client))
+ goto free;
+#endif
+ *invalidate_callback = ib_invalidate_peer_memory;
+ list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
+ num_registered_peers++;
+ goto out;
+#if 0
+free:
+ kfree(ib_peer_client);
+ ib_peer_client = NULL;
+#endif
+out:
+ mutex_unlock(&peer_memory_mutex);
+ return ib_peer_client;
+}
+EXPORT_SYMBOL(ib_register_peer_memory_client);
+
+void ib_unregister_peer_memory_client(void *reg_handle)
+{
+ struct ib_peer_memory_client *ib_peer_client =
+ (struct ib_peer_memory_client *)reg_handle;
+
+ mutex_lock(&peer_memory_mutex);
+ /* remove from list to prevent future core client usage as it goes down */
+ list_del(&ib_peer_client->core_peer_list);
+#ifdef __FreeBSD__
+ while (ib_peer_client->holdcount != 0) {
+ ib_peer_client->needwakeup = 1;
+ cv_wait(&ib_peer_client->peer_cv, &peer_memory_mutex.sx);
+ }
+ cv_destroy(&ib_peer_client->peer_cv);
+#else
+ mutex_unlock(&peer_memory_mutex);
+ /* peer memory can't go down while there are active clients */
+ synchronize_srcu(&ib_peer_client->peer_srcu);
+ cleanup_srcu_struct(&ib_peer_client->peer_srcu);
+ mutex_lock(&peer_memory_mutex);
+#endif
+ num_registered_peers--;
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * Will be added in future to the sysctl */
+#if 0
+ destroy_peer_sysfs(ib_peer_client);
+#endif
+ mutex_unlock(&peer_memory_mutex);
+
+ kfree(ib_peer_client);
+}
+EXPORT_SYMBOL(ib_unregister_peer_memory_client);
+
+/* This code uses sysfs, which is not supported by FreeBSD.
+ * Will be added in future to the sysctl */ + +#if 0 +static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj) +{ + struct ib_peer_memory_client *ib_peer_client; + + mutex_lock(&peer_memory_mutex); + list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) { + if (ib_peer_client->kobj == kobj) + goto found; + } + + ib_peer_client = NULL; + +found: + + mutex_unlock(&peer_memory_mutex); + return ib_peer_client; +} +#endif + +struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr, + size_t size, void **peer_client_context, + int *srcu_key) +{ + struct ib_peer_memory_client *ib_peer_client; + int ret; + + mutex_lock(&peer_memory_mutex); + list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) { + ret = ib_peer_client->peer_mem->acquire(addr, size, + context->peer_mem_private_data, + context->peer_mem_name, + peer_client_context); + if (ret == 1) + goto found; + } + + ib_peer_client = NULL; + +found: + if (ib_peer_client) { +#ifdef __FreeBSD__ + ib_peer_client->holdcount++; +#else + *srcu_key = srcu_read_lock(&ib_peer_client->peer_srcu); +#endif + } + + mutex_unlock(&peer_memory_mutex); + return ib_peer_client; + +} +EXPORT_SYMBOL(ib_get_peer_client); + +void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, + void *peer_client_context, + int srcu_key) +{ + + if (ib_peer_client->peer_mem->release) + ib_peer_client->peer_mem->release(peer_client_context); + +#ifdef __FreeBSD__ + ib_peer_client->holdcount--; + if (ib_peer_client->holdcount == 0 && ib_peer_client->needwakeup) { + cv_signal(&ib_peer_client->peer_cv); + } +#else + srcu_read_unlock(&ib_peer_client->peer_srcu, srcu_key); +#endif + return; +} +EXPORT_SYMBOL(ib_put_peer_client); + diff --git a/sys/ofed/drivers/infiniband/core/sa.h b/sys/ofed/drivers/infiniband/core/sa.h index b8abdd767b6c..b1d4bbf4ce5c 100644 --- a/sys/ofed/drivers/infiniband/core/sa.h +++ b/sys/ofed/drivers/infiniband/core/sa.h @@ -48,29 +48,6 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) complete(&client->comp); } -int ib_sa_check_selector(ib_sa_comp_mask comp_mask, - ib_sa_comp_mask selector_mask, - ib_sa_comp_mask value_mask, - u8 selector, u8 src_value, u8 dst_value); - -int ib_sa_pack_attr(void *dst, void *src, int attr_id); - -int ib_sa_unpack_attr(void *dst, void *src, int attr_id); - -int ib_sa_path_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_path_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); - -int sa_db_init(void); -void sa_db_cleanup(void); - int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, @@ -86,20 +63,4 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, int mcast_init(void); void mcast_cleanup(void); -int ib_sa_informinfo_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_inform *rec, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_inform *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); - -int notice_dispatch(struct ib_device *device, u8 port_num, - struct ib_sa_notice *notice); - -int notice_init(void); -void notice_cleanup(void); - #endif /* SA_H */ diff --git a/sys/ofed/drivers/infiniband/core/sa_query.c b/sys/ofed/drivers/infiniband/core/sa_query.c index 
9c6b4f70a9ca..a0c04f5f5228 100644 --- a/sys/ofed/drivers/infiniband/core/sa_query.c +++ b/sys/ofed/drivers/infiniband/core/sa_query.c @@ -59,12 +59,10 @@ struct ib_sa_sm_ah { struct ib_sa_port { struct ib_mad_agent *agent; - struct ib_mad_agent *notice_agent; struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; spinlock_t ah_lock; u8 port_num; - struct ib_device *device; }; struct ib_sa_device { @@ -95,14 +93,14 @@ struct ib_sa_path_query { struct ib_sa_query sa_query; }; -struct ib_sa_mcmember_query { - void (*callback)(int, struct ib_sa_mcmember_rec *, void *); +struct ib_sa_guidinfo_query { + void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); void *context; struct ib_sa_query sa_query; }; -struct ib_sa_inform_query { - void (*callback)(int, struct ib_sa_inform *, void *); +struct ib_sa_mcmember_query { + void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; struct ib_sa_query sa_query; }; @@ -116,10 +114,10 @@ static struct ib_client sa_client = { .remove = ib_sa_remove_one }; -static spinlock_t idr_lock; +static DEFINE_SPINLOCK(idr_lock); static DEFINE_IDR(query_idr); -static spinlock_t tid_lock; +static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ @@ -354,162 +352,34 @@ static const struct ib_field service_rec_table[] = { .size_bits = 2*64 }, }; -#define INFORM_FIELD(field) \ - .struct_offset_bytes = offsetof(struct ib_sa_inform, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_inform *) 0)->field, \ - .field_name = "sa_inform:" #field +#define GUIDINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ + .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .field_name = "sa_guidinfo_rec:" #field -static const struct ib_field inform_table[] = { - { INFORM_FIELD(gid), +static const struct ib_field guidinfo_rec_table[] = { + { GUIDINFO_REC_FIELD(lid), .offset_words = 0, .offset_bits = 0, - .size_bits = 128 }, - { INFORM_FIELD(lid_range_begin), - .offset_words = 4, - .offset_bits = 0, .size_bits = 16 }, - { INFORM_FIELD(lid_range_end), - .offset_words = 4, - .offset_bits = 16, - .size_bits = 16 }, - { RESERVED, - .offset_words = 5, - .offset_bits = 0, - .size_bits = 16 }, - { INFORM_FIELD(is_generic), - .offset_words = 5, + { GUIDINFO_REC_FIELD(block_num), + .offset_words = 0, .offset_bits = 16, .size_bits = 8 }, - { INFORM_FIELD(subscribe), - .offset_words = 5, + { GUIDINFO_REC_FIELD(res1), + .offset_words = 0, .offset_bits = 24, .size_bits = 8 }, - { INFORM_FIELD(type), - .offset_words = 6, - .offset_bits = 0, - .size_bits = 16 }, - { INFORM_FIELD(trap.generic.trap_num), - .offset_words = 6, - .offset_bits = 16, - .size_bits = 16 }, - { INFORM_FIELD(trap.generic.qpn), - .offset_words = 7, - .offset_bits = 0, - .size_bits = 24 }, - { RESERVED, - .offset_words = 7, - .offset_bits = 24, - .size_bits = 3 }, - { INFORM_FIELD(trap.generic.resp_time), - .offset_words = 7, - .offset_bits = 27, - .size_bits = 5 }, - { RESERVED, - .offset_words = 8, - .offset_bits = 0, - .size_bits = 8 }, - { INFORM_FIELD(trap.generic.producer_type), - .offset_words = 8, - .offset_bits = 8, - .size_bits = 24 }, -}; - -#define NOTICE_FIELD(field) \ - .struct_offset_bytes = offsetof(struct ib_sa_notice, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_notice *) 0)->field, \ - .field_name = "sa_notice:" #field - -static const struct ib_field notice_table[] = { - { NOTICE_FIELD(is_generic), - .offset_words = 0, - .offset_bits = 0, - .size_bits = 1 }, - { NOTICE_FIELD(type), - 
.offset_words = 0, - .offset_bits = 1, - .size_bits = 7 }, - { NOTICE_FIELD(trap.generic.producer_type), - .offset_words = 0, - .offset_bits = 8, - .size_bits = 24 }, - { NOTICE_FIELD(trap.generic.trap_num), + { GUIDINFO_REC_FIELD(res2), .offset_words = 1, .offset_bits = 0, - .size_bits = 16 }, - { NOTICE_FIELD(issuer_lid), - .offset_words = 1, - .offset_bits = 16, - .size_bits = 16 }, - { NOTICE_FIELD(notice_toggle), + .size_bits = 32 }, + { GUIDINFO_REC_FIELD(guid_info_list), .offset_words = 2, .offset_bits = 0, - .size_bits = 1 }, - { NOTICE_FIELD(notice_count), - .offset_words = 2, - .offset_bits = 1, - .size_bits = 15 }, - { NOTICE_FIELD(data_details), - .offset_words = 2, - .offset_bits = 16, - .size_bits = 432 }, - { NOTICE_FIELD(issuer_gid), - .offset_words = 16, - .offset_bits = 0, - .size_bits = 128 }, + .size_bits = 512 }, }; -int ib_sa_check_selector(ib_sa_comp_mask comp_mask, - ib_sa_comp_mask selector_mask, - ib_sa_comp_mask value_mask, - u8 selector, u8 src_value, u8 dst_value) -{ - int err; - - if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) - return 0; - - switch (selector) { - case IB_SA_GT: - err = (src_value <= dst_value); - break; - case IB_SA_LT: - err = (src_value >= dst_value); - break; - case IB_SA_EQ: - err = (src_value != dst_value); - break; - default: - err = 0; - break; - } - - return err; -} - -int ib_sa_pack_attr(void *dst, void *src, int attr_id) -{ - switch (attr_id) { - case IB_SA_ATTR_PATH_REC: - ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst); - break; - default: - return -EINVAL; - } - return 0; -} - -int ib_sa_unpack_attr(void *dst, void *src, int attr_id) -{ - switch (attr_id) { - case IB_SA_ATTR_PATH_REC: - ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst); - break; - default: - return -EINVAL; - } - return 0; -} - static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -588,7 +458,7 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event port->sm_ah = NULL; spin_unlock_irqrestore(&port->ah_lock, flags); - schedule_work(&sa_dev->port[event->element.port_num - + queue_work(ib_wq, &sa_dev->port[event->element.port_num - sa_dev->start_port].update_task); } } @@ -685,6 +555,14 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; } + if (force_grh) { + memcpy(ah_attr->dmac, rec->dmac, 6); + ah_attr->vlan_id = rec->vlan_id; + } else { + memset(ah_attr->dmac, 0, 6); + ah_attr->vlan_id = 0xffff; + } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -791,6 +669,10 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); + rec.vlan_id = 0xffff; + memset(rec.dmac, 0, ETH_ALEN); + memset(rec.smac, 0, ETH_ALEN); + query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); @@ -801,7 +683,33 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); } -int ib_sa_path_rec_query(struct ib_sa_client *client, + +/** + * ib_sa_path_rec_get - Start a Path get query + * @client:SA client + * @device:device to send query on + * @port_num: port number to send query on + * @rec:Path Record to send in query + * @comp_mask:component mask to send in query + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal 
allocations + * @callback:function called when query completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:query context, used to cancel query + * + * Send a Path Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_path_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. + */ +int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, ib_sa_comp_mask comp_mask, @@ -867,6 +775,7 @@ int ib_sa_path_rec_query(struct ib_sa_client *client, kfree(query); return ret; } +EXPORT_SYMBOL(ib_sa_path_rec_get); static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, @@ -1082,26 +991,27 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, return ret; } -static void ib_sa_inform_callback(struct ib_sa_query *sa_query, +/* Support GuidInfoRecord */ +static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { - struct ib_sa_inform_query *query = - container_of(sa_query, struct ib_sa_inform_query, sa_query); + struct ib_sa_guidinfo_query *query = + container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); if (mad) { - struct ib_sa_inform rec; + struct ib_sa_guidinfo_rec rec; - ib_unpack(inform_table, ARRAY_SIZE(inform_table), + ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } -static void ib_sa_inform_release(struct ib_sa_query *sa_query) +static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) { - kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query)); + kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); } int ib_sa_guid_info_rec_query(struct ib_sa_client *client, @@ -1115,52 +1025,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void *context, struct ib_sa_query **sa_query) { - // stub function - - // called originally from mad.c under mlx4_ib_init_sriov() - // which calls mlx4_ib_init_alias_guid_service() in alias_GUID.c - // which goes down to this function - - printk("ERROR: function should be called only in SRIOV flow!!!"); - - return 0; -} - -/** - * ib_sa_informinfo_query - Start an InformInfo registration. - * @client:SA client - * @device:device to send query on - * @port_num: port number to send query on - * @rec:Inform record to send in query - * @timeout_ms:time to wait for response - * @gfp_mask:GFP mask to use for internal allocations - * @callback:function called when notice handler registration completes, - * times out or is canceled - * @context:opaque user context passed to callback - * @sa_query:query context, used to cancel query - * - * This function sends inform info to register with SA to receive - * in-service notice. - * The callback function will be called when the query completes (or - * fails); status is 0 for a successful response, -EINTR if the query - * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error - * occurred sending the query. 
The resp parameter of the callback is - * only valid if status is 0. - * - * If the return value of ib_sa_inform_query() is negative, it is an - * error code. Otherwise it is a query ID that can be used to cancel - * the query. - */ -int ib_sa_informinfo_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_inform *rec, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_inform *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) -{ - struct ib_sa_inform_query *query; + struct ib_sa_guidinfo_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; @@ -1170,6 +1035,12 @@ int ib_sa_informinfo_query(struct ib_sa_client *client, if (!sa_dev) return -ENODEV; + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) { + return -EINVAL; + } + port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; @@ -1190,15 +1061,18 @@ int ib_sa_informinfo_query(struct ib_sa_client *client, mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); - query->sa_query.callback = callback ? ib_sa_inform_callback : NULL; - query->sa_query.release = ib_sa_inform_release; - query->sa_query.port = port; - mad->mad_hdr.method = IB_MGMT_METHOD_SET; - mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_INFORM_INFO); + query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; + query->sa_query.release = ib_sa_guidinfo_rec_release; - ib_pack(inform_table, ARRAY_SIZE(inform_table), rec, mad->data); + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, + mad->data); *sa_query = &query->sa_query; + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; @@ -1209,49 +1083,12 @@ int ib_sa_informinfo_query(struct ib_sa_client *client, *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); + err1: kfree(query); return ret; } - -static void ib_sa_notice_resp(struct ib_sa_port *port, - struct ib_mad_recv_wc *mad_recv_wc) -{ - struct ib_mad_send_buf *mad_buf; - struct ib_sa_mad *mad; - int ret; - unsigned long flags; - - mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0, - IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, - GFP_KERNEL); - if (IS_ERR(mad_buf)) - return; - - mad = mad_buf->mad; - memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad); - mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP; - - spin_lock_irqsave(&port->ah_lock, flags); - if (!port->sm_ah) { - spin_unlock_irqrestore(&port->ah_lock, flags); - ib_free_send_mad(mad_buf); - return; - } - kref_get(&port->sm_ah->ref); - mad_buf->context[0] = &port->sm_ah->ref; - mad_buf->ah = port->sm_ah->ah; - spin_unlock_irqrestore(&port->ah_lock, flags); - - ret = ib_post_send_mad(mad_buf, NULL); - if (ret) - goto err; - - return; -err: - kref_put(mad_buf->context[0], free_sm_ah); - ib_free_send_mad(mad_buf); -} +EXPORT_SYMBOL(ib_sa_guid_info_rec_query); static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) @@ -1306,36 +1143,9 @@ static void recv_handler(struct ib_mad_agent *mad_agent, ib_free_recv_mad(mad_recv_wc); } -static void notice_resp_handler(struct ib_mad_agent *agent, - struct ib_mad_send_wc *mad_send_wc) -{ - kref_put(mad_send_wc->send_buf->context[0], free_sm_ah); - ib_free_send_mad(mad_send_wc->send_buf); 
-} - -static void notice_handler(struct ib_mad_agent *mad_agent, - struct ib_mad_recv_wc *mad_recv_wc) -{ - struct ib_sa_port *port; - struct ib_sa_mad *mad; - struct ib_sa_notice notice; - - port = mad_agent->context; - mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad; - ib_unpack(notice_table, ARRAY_SIZE(notice_table), mad->data, ¬ice); - - if (!notice_dispatch(port->device, port->port_num, ¬ice)) - ib_sa_notice_resp(port, mad_recv_wc); - ib_free_recv_mad(mad_recv_wc); -} - static void ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; - struct ib_mad_reg_req reg_req = { - .mgmt_class = IB_MGMT_CLASS_SUBN_ADM, - .mgmt_class_version = 2 - }; int s, e, i; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) @@ -1372,16 +1182,6 @@ static void ib_sa_add_one(struct ib_device *device) if (IS_ERR(sa_dev->port[i].agent)) goto err; - sa_dev->port[i].device = device; - set_bit(IB_MGMT_METHOD_REPORT, reg_req.method_mask); - sa_dev->port[i].notice_agent = - ib_register_mad_agent(device, i + s, IB_QPT_GSI, - ®_req, 0, notice_resp_handler, - notice_handler, &sa_dev->port[i]); - - if (IS_ERR(sa_dev->port[i].notice_agent)) - goto err; - INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); } @@ -1396,7 +1196,7 @@ static void ib_sa_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); if (ib_register_event_handler(&sa_dev->event_handler)) - goto err; + goto reg_err; for (i = 0; i <= e - s; ++i) if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) @@ -1404,14 +1204,14 @@ static void ib_sa_add_one(struct ib_device *device) return; +reg_err: + ib_set_client_data(device, &sa_client, NULL); + i = e - s; err: - while (--i >= 0) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { - if (!IS_ERR(sa_dev->port[i].notice_agent)) - ib_unregister_mad_agent(sa_dev->port[i].notice_agent); - if (!IS_ERR(sa_dev->port[i].agent)) + for (; i >= 0; --i) + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND && + !IS_ERR(sa_dev->port[i].agent)) ib_unregister_mad_agent(sa_dev->port[i].agent); - } kfree(sa_dev); @@ -1428,11 +1228,10 @@ static void ib_sa_remove_one(struct ib_device *device) ib_unregister_event_handler(&sa_dev->event_handler); - flush_scheduled_work(); + flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { - ib_unregister_mad_agent(sa_dev->port[i].notice_agent); ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); @@ -1447,9 +1246,6 @@ static int __init ib_sa_init(void) { int ret; - spin_lock_init(&idr_lock); - spin_lock_init(&tid_lock); - get_random_bytes(&tid, sizeof tid); ret = ib_register_client(&sa_client); @@ -1464,23 +1260,7 @@ static int __init ib_sa_init(void) goto err2; } - ret = notice_init(); - if (ret) { - printk(KERN_ERR "Couldn't initialize notice handling\n"); - goto err3; - } - - ret = sa_db_init(); - if (ret) { - printk(KERN_ERR "Couldn't initialize local SA\n"); - goto err4; - } - return 0; -err4: - notice_cleanup(); -err3: - mcast_cleanup(); err2: ib_unregister_client(&sa_client); err1: @@ -1489,9 +1269,7 @@ static int __init ib_sa_init(void) static void __exit ib_sa_cleanup(void) { - sa_db_cleanup(); mcast_cleanup(); - notice_cleanup(); ib_unregister_client(&sa_client); idr_destroy(&query_idr); } diff --git a/sys/ofed/drivers/infiniband/core/smi.c 
b/sys/ofed/drivers/infiniband/core/smi.c index 87236753bce9..5855e4405d9b 100644 --- a/sys/ofed/drivers/infiniband/core/smi.c +++ b/sys/ofed/drivers/infiniband/core/smi.c @@ -52,6 +52,10 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, hop_cnt = smp->hop_cnt; /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + if (!ib_get_smp_direction(smp)) { /* C14-9:1 */ if (hop_cnt && hop_ptr == 0) { @@ -133,6 +137,10 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, hop_cnt = smp->hop_cnt; /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + if (!ib_get_smp_direction(smp)) { /* C14-9:1 -- sender should have incremented hop_ptr */ if (hop_cnt && hop_ptr == 0) diff --git a/sys/ofed/drivers/infiniband/core/sysfs.c b/sys/ofed/drivers/infiniband/core/sysfs.c index 4cd5560e10b5..6bcbfb9d9a1d 100644 --- a/sys/ofed/drivers/infiniband/core/sysfs.c +++ b/sys/ofed/drivers/infiniband/core/sysfs.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -105,7 +106,7 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, return ret; return sprintf(buf, "%d: %s\n", attr.state, - attr.state < ARRAY_SIZE(state_name) ? + attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? state_name[attr.state] : "UNKNOWN"); } @@ -180,19 +181,18 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, { struct ib_port_attr attr; char *speed = ""; - int rate; + int rate; /* in deci-Gb/sec */ ssize_t ret; ret = ib_query_port(p->ibdev, p->port_num, &attr); if (ret) return ret; - switch (attr.active_speed) { - case 2: speed = " DDR"; break; - case 4: speed = " QDR"; break; - } + ib_active_speed_enum_to_rate(attr.active_speed, + &rate, + &speed); - rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed; + rate *= ib_width_enum_to_int(attr.active_width); if (rate < 0) return -EINVAL; @@ -229,9 +229,11 @@ static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, { switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { case IB_LINK_LAYER_INFINIBAND: - return sprintf(buf, "%s\n", "IB"); + return sprintf(buf, "%s\n", "InfiniBand"); case IB_LINK_LAYER_ETHERNET: return sprintf(buf, "%s\n", "Ethernet"); + case IB_LINK_LAYER_SCIF: + return sprintf(buf, "%s\n", "SCIF"); default: return sprintf(buf, "%s\n", "Unknown"); } @@ -267,16 +269,12 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, container_of(attr, struct port_table_attribute, attr); union ib_gid gid; ssize_t ret; - u16 *raw; ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); if (ret) return ret; - raw = (u16 *)gid.raw; - return sprintf(buf, "%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x\n", - htons(raw[0]), htons(raw[1]), htons(raw[2]), htons(raw[3]), - htons(raw[4]), htons(raw[5]), htons(raw[6]), htons(raw[7])); + return sprintf(buf, GID_PRINT_FMT"\n",GID_PRINT_ARGS(gid.raw)); } static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, @@ -351,8 +349,8 @@ static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr, be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); break; case 64: - ret = sprintf(buf, "%llu\n", (unsigned long long) - be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); + ret = sprintf(buf, "%llu\n", + (unsigned 
long long)be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); break; default: ret = 0; @@ -536,6 +534,7 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *, element->attr.attr.mode = S_IRUGO; element->attr.show = show; element->index = i; + sysfs_attr_init(&element->attr.attr); tab_attr[i] = &element->attr.attr; } @@ -570,7 +569,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - kobject_get(device->ports_parent), + device->ports_parent, "%d", port_num); if (ret) goto err_put; @@ -609,7 +608,6 @@ static int add_port(struct ib_device *device, int port_num, } list_add_tail(&p->kobj.entry, &device->port_list); - #ifdef __linux__ kobject_uevent(&p->kobj, KOBJ_ADD); #endif @@ -655,6 +653,7 @@ static ssize_t show_node_type(struct device *device, case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); + case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type); default: return sprintf(buf, "%d: \n", dev->node_type); } } @@ -716,16 +715,75 @@ static ssize_t set_node_desc(struct device *device, return count; } +static ssize_t show_cmd_perf(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%d\n", dev->cmd_perf); +} + +static ssize_t set_cmd_perf(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + u32 val; + + if (sscanf(buf, "0x%x", &val) != 1) + return -EINVAL; + + dev->cmd_perf = val; + + return count; +} + +static ssize_t show_cmd_avg(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%llu\n", (unsigned long long)dev->cmd_avg); +} + +static ssize_t set_cmd_avg(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + spin_lock(&dev->cmd_perf_lock); + dev->cmd_avg = 0; + dev->cmd_n = 0; + spin_unlock(&dev->cmd_perf_lock); + + return count; +} + +static ssize_t show_cmd_n(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%d\n", dev->cmd_n); +} + static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); +static DEVICE_ATTR(cmd_perf, S_IRUGO | S_IWUSR, show_cmd_perf, set_cmd_perf); +static DEVICE_ATTR(cmd_avg, S_IRUGO | S_IWUSR, show_cmd_avg, set_cmd_avg); +static DEVICE_ATTR(cmd_n, S_IRUGO, show_cmd_n, NULL); static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_type, &dev_attr_sys_image_guid, &dev_attr_node_guid, - &dev_attr_node_desc + &dev_attr_node_desc, + &dev_attr_cmd_perf, + &dev_attr_cmd_avg, + &dev_attr_cmd_n, }; static struct class ib_class = { @@ -851,7 +909,8 @@ static struct attribute_group iw_stats_group = { }; int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, u8, 
struct kobject *)) + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) { struct device *class_dev = &device->dev; int ret; @@ -874,8 +933,7 @@ int ib_device_register_sysfs(struct ib_device *device, goto err_unregister; } - device->ports_parent = kobject_create_and_add("ports", - kobject_get(&class_dev->kobj)); + device->ports_parent = kobject_create_and_add("ports",&class_dev->kobj); if (!device->ports_parent) { ret = -ENOMEM; goto err_put; @@ -919,6 +977,11 @@ int ib_device_register_sysfs(struct ib_device *device, kobject_put(&class_dev->kobj); err_unregister: + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { + device_remove_file(class_dev, ib_class_attributes[i]); + } + device_unregister(class_dev); err: @@ -927,15 +990,16 @@ int ib_device_register_sysfs(struct ib_device *device, void ib_device_unregister_sysfs(struct ib_device *device) { + int i; struct kobject *p, *t; struct ib_port *port; - int i; + struct device *class_dev = &device->dev; /* Hold kobject until ib_dealloc_device() */ kobject_get(&device->dev.kobj); for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - device_remove_file(&device->dev, ib_class_attributes[i]); + device_remove_file(class_dev, ib_class_attributes[i]); } list_for_each_entry_safe(p, t, &device->port_list, entry) { @@ -960,22 +1024,3 @@ void ib_sysfs_cleanup(void) { class_unregister(&ib_class); } - -/*int ib_sysfs_create_port_files(struct ib_device *device, - int (*create)(struct ib_device *dev, u8 port_num, - struct kobject *kobj)) -{ - struct kobject *p; - struct ib_port *port; - int ret = 0; - - list_for_each_entry(p, &device->port_list, entry) { - port = container_of(p, struct ib_port, kobj); - ret = create(device, port->port_num, &port->kobj); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL(ib_sysfs_create_port_files);*/ diff --git a/sys/ofed/drivers/infiniband/core/ucm.c b/sys/ofed/drivers/infiniband/core/ucm.c index b912ebeb427a..8f20e89da0d5 100644 --- a/sys/ofed/drivers/infiniband/core/ucm.c +++ b/sys/ofed/drivers/infiniband/core/ucm.c @@ -37,10 +37,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -396,7 +398,6 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, struct ib_ucm_event_get cmd; struct ib_ucm_event *uevent; int result = 0; - DEFINE_WAIT(wait); if (out_len < sizeof(struct ib_ucm_event_resp)) return -ENOSPC; @@ -1123,7 +1124,7 @@ static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; - if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) + if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) @@ -1163,7 +1164,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp) { struct ib_ucm_file *file; - file = kzalloc(sizeof(*file), GFP_KERNEL); + file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) return -ENOMEM; @@ -1177,7 +1178,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp) file->filp = filp; file->device = container_of(inode->i_cdev->si_drv1, struct ib_ucm_device, cdev); - return 0; + return nonseekable_open(inode, filp); } static int ib_ucm_close(struct inode *inode, struct file *filp) @@ -1212,7 +1213,10 @@ static void ib_ucm_release_dev(struct device *dev) ucm_dev = container_of(dev, struct ib_ucm_device, dev); cdev_del(&ucm_dev->cdev); + if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) clear_bit(ucm_dev->devnum, dev_map); + else + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); 
kfree(ucm_dev); } @@ -1222,6 +1226,7 @@ static const struct file_operations ucm_fops = { .release = ib_ucm_close, .write = ib_ucm_write, .poll = ib_ucm_poll, + .llseek = no_llseek, }; static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, @@ -1234,8 +1239,32 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static dev_t overflow_maj; +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); +static int find_overflow_devnum(void) +{ + int ret; + + if (!overflow_maj) { + ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, + "infiniband_cm"); + if (ret) { + printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); + return ret; + } + } + + ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); + if (ret >= IB_UCM_MAX_DEVICES) + return -1; + + return ret; +} + static void ib_ucm_add_one(struct ib_device *device) { + int devnum; + dev_t base; struct ib_ucm_device *ucm_dev; if (!device->alloc_ucontext || @@ -1248,16 +1277,25 @@ static void ib_ucm_add_one(struct ib_device *device) ucm_dev->ib_dev = device; - ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES) + devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); + if (devnum >= IB_UCM_MAX_DEVICES) { + devnum = find_overflow_devnum(); + if (devnum < 0) goto err; - set_bit(ucm_dev->devnum, dev_map); + ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; + base = devnum + overflow_maj; + set_bit(devnum, overflow_map); + } else { + ucm_dev->devnum = devnum; + base = devnum + IB_UCM_BASE_DEV; + set_bit(devnum, dev_map); + } cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); - if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1)) + if (cdev_add(&ucm_dev->cdev, base, 1)) goto err; ucm_dev->dev.class = &cm_class; @@ -1278,7 +1316,10 @@ static void ib_ucm_add_one(struct ib_device *device) device_unregister(&ucm_dev->dev); err_cdev: cdev_del(&ucm_dev->cdev); - clear_bit(ucm_dev->devnum, dev_map); + if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) + clear_bit(devnum, dev_map); + else + clear_bit(devnum, overflow_map); err: kfree(ucm_dev); return; @@ -1298,6 +1339,7 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *att { return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION); } + static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); static int __init ib_ucm_init(void) @@ -1337,6 +1379,8 @@ static void __exit ib_ucm_cleanup(void) ib_unregister_client(&ucm_client); class_remove_file(&cm_class, &class_attr_abi_version); unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); + if (overflow_maj) + unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); idr_destroy(&ctx_id_table); } diff --git a/sys/ofed/drivers/infiniband/core/ucma.c b/sys/ofed/drivers/infiniband/core/ucma.c index 23cbf7b5ac09..5f73b40b6405 100644 --- a/sys/ofed/drivers/infiniband/core/ucma.c +++ b/sys/ofed/drivers/infiniband/core/ucma.c @@ -34,10 +34,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include @@ -48,9 +51,7 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); -enum { - UCMA_MAX_BACKLOG = 1024 -}; +static unsigned int max_backlog = 1024; struct ucma_file { struct mutex mut; @@ -253,17 +254,17 @@ static int 
ucma_event_handler(struct rdma_cm_id *cm_id, if (!uevent) return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; + mutex_lock(&ctx->file->mut); uevent->cm_id = cm_id; ucma_set_event_context(ctx, event, uevent); uevent->resp.event = event->event; uevent->resp.status = event->status; - if (cm_id->ps == RDMA_PS_UDP || cm_id->ps == RDMA_PS_IPOIB) + if (cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); - mutex_lock(&ctx->file->mut); if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { if (!ctx->backlog) { ret = -ENOMEM; @@ -298,7 +299,6 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, struct rdma_ucm_get_event cmd; struct ucma_event *uevent; int ret = 0; - DEFINE_WAIT(wait); if (out_len < sizeof uevent->resp) return -ENOSPC; @@ -332,6 +332,7 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, ctx->cm_id = uevent->cm_id; ctx->cm_id->context = ctx; uevent->resp.id = ctx->id; + ctx->cm_id->ucontext = ctx; } if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -350,13 +351,31 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, return ret; } -static ssize_t ucma_create_id(struct ucma_file *file, - const char __user *inbuf, +static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) +{ + switch (cmd->ps) { + case RDMA_PS_TCP: + *qp_type = IB_QPT_RC; + return 0; + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + *qp_type = IB_QPT_UD; + return 0; + case RDMA_PS_IB: + *qp_type = cmd->qp_type; + return 0; + default: + return -EINVAL; + } +} + +static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; + enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) @@ -365,6 +384,10 @@ static ssize_t ucma_create_id(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + ret = ucma_get_qp_type(&cmd, &qp_type); + if (ret) + return ret; + mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); mutex_unlock(&file->mut); @@ -372,11 +395,12 @@ static ssize_t ucma_create_id(struct ucma_file *file, return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps); + ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; } + ctx->cm_id->ucontext = ctx; resp.id = ctx->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -409,24 +433,6 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) mutex_unlock(&mut); } -static void ucma_cleanup_events(struct ucma_context *ctx) -{ - struct ucma_event *uevent, *tmp; - - list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx != ctx) - continue; - - list_del(&uevent->list); - - /* clear incoming connections. */ - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) - rdma_destroy_id(uevent->cm_id); - - kfree(uevent); - } -} - static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; @@ -440,9 +446,16 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) } } +/* + * We cannot hold file->mut when calling rdma_destroy_id() or we can + * deadlock. 
We also acquire file->mut in ucma_event_handler(), and + * rdma_destroy_id() will wait until all callbacks have completed. + */ static int ucma_free_ctx(struct ucma_context *ctx) { int events_reported; + struct ucma_event *uevent, *tmp; + LIST_HEAD(list); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); @@ -451,10 +464,20 @@ static int ucma_free_ctx(struct ucma_context *ctx) /* Cleanup events not yet reported to the user. */ mutex_lock(&ctx->file->mut); - ucma_cleanup_events(ctx); + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &list); + } list_del(&ctx->list); mutex_unlock(&ctx->file->mut); + list_for_each_entry_safe(uevent, tmp, &list, list) { + list_del(&uevent->list); + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + rdma_destroy_id(uevent->cm_id); + kfree(uevent); + } + events_reported = ctx->events_reported; kfree(ctx); return events_reported; @@ -586,24 +609,14 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { - struct rdma_dev_addr *dev_addr; - struct net_device *dev; - u16 vid = 0; resp->num_paths = route->num_paths; switch (route->num_paths) { case 0: - dev_addr = &route->addr.dev_addr; - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (dev) { - vid = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - } - - iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid, - dev_addr->dst_dev_addr, vid); - iboe_addr_get_sgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].sgid); + rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, + (union ib_gid *)&resp->ib_route[0].dgid); + rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, + (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: @@ -619,6 +632,16 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, } } +static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); +} + static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -653,8 +676,10 @@ static ssize_t ucma_query_route(struct ucma_file *file, resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) { - switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) { + switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { + case RDMA_TRANSPORT_IB: + switch (rdma_port_get_link_layer(ctx->cm_id->device, + ctx->cm_id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ucma_copy_ib_route(&resp, &ctx->cm_id->route); break; @@ -664,6 +689,12 @@ static ssize_t ucma_query_route(struct ucma_file *file, default: break; } + break; + case RDMA_TRANSPORT_IWARP: + ucma_copy_iw_route(&resp, &ctx->cm_id->route); + break; + default: + break; } out: @@ -727,8 +758,8 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG ? 
- cmd.backlog : UCMA_MAX_BACKLOG; + ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? + cmd.backlog : max_backlog; ret = rdma_listen(ctx->cm_id, ctx->backlog); ucma_put_ctx(ctx); return ret; @@ -750,9 +781,12 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); if (cmd.conn_param.valid) { - ctx->uid = cmd.uid; ucma_copy_conn_param(&conn_param, &cmd.conn_param); + mutex_lock(&file->mut); ret = rdma_accept(ctx->cm_id, &conn_param); + if (!ret) + ctx->uid = cmd.uid; + mutex_unlock(&file->mut); } else ret = rdma_accept(ctx->cm_id, NULL); @@ -848,6 +882,20 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; + case RDMA_OPTION_ID_REUSEADDR: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; + case RDMA_OPTION_ID_AFONLY: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; default: ret = -ENOSYS; } @@ -887,12 +935,22 @@ static int ucma_set_ib_path(struct ucma_context *ctx, static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { - int ret; + int ret = 0; switch (optname) { case RDMA_OPTION_IB_PATH: ret = ucma_set_ib_path(ctx, optval, optlen); break; + + case RDMA_OPTION_IB_APM: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + if (*(u8 *)optval) + ret = rdma_enable_apm(ctx->cm_id, RDMA_ALT_PATH_BEST); + break; + default: ret = -ENOSYS; } @@ -937,20 +995,21 @@ static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, optval = kmalloc(cmd.optlen, GFP_KERNEL); if (!optval) { ret = -ENOMEM; - goto out1; + goto err_ucma_put_ctx; } - if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval, + if (copy_from_user(optval, (void __user *)(unsigned long)cmd.optval, cmd.optlen)) { ret = -EFAULT; - goto out2; + goto err_kfree; } ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, cmd.optlen); -out2: + +err_kfree: kfree(optval); -out1: +err_ucma_put_ctx: ucma_put_ctx(ctx); return ret; } @@ -1121,7 +1180,7 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; struct ucma_context *ctx; - struct file *filp; + struct fd f; struct ucma_file *cur_file; int ret = 0; @@ -1129,12 +1188,12 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, return -EFAULT; /* Get current fd to protect against it being closed */ - filp = fget(cmd.fd); - if (!filp) + f = fdget(cmd.fd); + if (!f.file) return -ENOENT; /* Validate current fd and prevent destruction of id. 
*/ - ctx = ucma_get_ctx(filp->private_data, cmd.id); + ctx = ucma_get_ctx(f.file->private_data, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; @@ -1168,7 +1227,7 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, ucma_put_ctx(ctx); file_put: - fput(filp); + fdput(f); return ret; } @@ -1209,7 +1268,7 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; - if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) @@ -1261,7 +1320,8 @@ static int ucma_open(struct inode *inode, struct file *filp) filp->private_data = file; file->filp = filp; - return 0; + + return nonseekable_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) @@ -1291,11 +1351,14 @@ static const struct file_operations ucma_fops = { .release = ucma_close, .write = ucma_write, .poll = ucma_poll, + .llseek = no_llseek, }; static struct miscdevice ucma_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "rdma_cm", + .nodename = "infiniband/rdma_cm", + .mode = 0666, .fops = &ucma_fops, }; @@ -1318,10 +1381,11 @@ static int __init ucma_init(void) ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); - goto err; + goto err1; } + return 0; -err: +err1: misc_deregister(&ucma_misc); return ret; } diff --git a/sys/ofed/drivers/infiniband/core/ud_header.c b/sys/ofed/drivers/infiniband/core/ud_header.c index 09fc1ffe0553..051d3bd3d4b6 100644 --- a/sys/ofed/drivers/infiniband/core/ud_header.c +++ b/sys/ofed/drivers/infiniband/core/ud_header.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -230,32 +231,28 @@ void ib_ud_header_init(int payload_bytes, int immediate_present, struct ib_ud_header *header) { - u16 packet_length = 0; - memset(header, 0, sizeof *header); if (lrh_present) { + u16 packet_length = 0; + header->lrh.link_version = 0; header->lrh.link_next_header = grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; - packet_length = IB_LRH_BYTES; + packet_length = (IB_LRH_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + (grh_present ? IB_GRH_BYTES : 0) + + payload_bytes + + 4 + /* ICRC */ + 3) / 4; /* round up */ + header->lrh.packet_length = cpu_to_be16(packet_length); } - if (eth_present) { - if (vlan_present) { + if (vlan_present) header->eth.type = cpu_to_be16(ETH_P_8021Q); - packet_length += IB_VLAN_BYTES; - } - packet_length += IB_ETH_BYTES; - } - - packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes + - 4 + /* ICRC */ - 3; /* round up */ - packet_length /= 4; if (grh_present) { - packet_length += IB_GRH_BYTES / 4; header->grh.ip_version = 6; header->grh.payload_length = cpu_to_be16((IB_BTH_BYTES + @@ -266,9 +263,6 @@ void ib_ud_header_init(int payload_bytes, header->grh.next_header = 0x1b; } - if (lrh_present) - header->lrh.packet_length = cpu_to_be16(packet_length); - if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; else @@ -284,36 +278,6 @@ void ib_ud_header_init(int payload_bytes, } EXPORT_SYMBOL(ib_ud_header_init); -/** - * ib_lrh_header_pack - Pack LRH header struct into wire format - * @lrh:unpacked LRH header struct - * @buf:Buffer to pack into - * - * ib_lrh_header_pack() packs the LRH header structure @lrh into - * wire format in the buffer @buf. 
- */ -int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf) -{ - ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf); - return 0; -} -EXPORT_SYMBOL(ib_lrh_header_pack); - -/** - * ib_lrh_header_unpack - Unpack LRH structure from wire format - * @lrh:unpacked LRH header struct - * @buf:Buffer to pack into - * - * ib_lrh_header_unpack() unpacks the LRH header structure from - * wire format (in buf) into @lrh. - */ -int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh) -{ - ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh); - return 0; -} -EXPORT_SYMBOL(ib_lrh_header_unpack); - /** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct @@ -337,14 +301,11 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->eth, buf + len); len += IB_ETH_BYTES; } - - if (header->vlan_present) { ib_pack(vlan_table, ARRAY_SIZE(vlan_table), &header->vlan, buf + len); len += IB_VLAN_BYTES; } - if (header->grh_present) { ib_pack(grh_table, ARRAY_SIZE(grh_table), &header->grh, buf + len); diff --git a/sys/ofed/drivers/infiniband/core/umem.c b/sys/ofed/drivers/infiniband/core/umem.c index 7695a2149670..cdd2e67fa248 100644 --- a/sys/ofed/drivers/infiniband/core/umem.c +++ b/sys/ofed/drivers/infiniband/core/umem.c @@ -35,109 +35,168 @@ #include #include #include -#ifdef __linux__ -#include -#endif #include - +#include +#include #include -#include #include - -#include -#include -#include #include - +#include #include "uverbs.h" +#define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *)) + static int allow_weak_ordering; -module_param(allow_weak_ordering, bool, 0444); -MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory"); +module_param_named(weak_ordering, allow_weak_ordering, int, 0444); +MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory"); -#define IB_UMEM_MAX_PAGE_CHUNK \ - ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ - ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ - (void *) &((struct ib_umem_chunk *) 0)->page_list[0])) - -#ifdef __ia64__ -extern int dma_map_sg_hp_wa; - -static int dma_map_sg_ia64(struct ib_device *ibdev, - struct scatterlist *sg, - int nents, - enum dma_data_direction dir) +static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem, + struct ib_umem *umem, unsigned long addr, + int dmasync, int invalidation_supported) { - int i, rc, j, lents = 0; - struct device *dev; + int ret; + const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; + struct invalidation_ctx *invalidation_ctx = NULL; - if (!dma_map_sg_hp_wa) - return ib_dma_map_sg(ibdev, sg, nents, dir); - - dev = ibdev->dma_device; - for (i = 0; i < nents; ++i) { - rc = dma_map_sg(dev, sg + i, 1, dir); - if (rc <= 0) { - for (j = 0; j < i; ++j) - dma_unmap_sg(dev, sg + j, 1, dir); - - return 0; + umem->ib_peer_mem = ib_peer_mem; + if (invalidation_supported) { + invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL); + if (!invalidation_ctx) { + ret = -ENOMEM; + goto out; } - lents += rc; + umem->invalidation_ctx = invalidation_ctx; + invalidation_ctx->umem = umem; + mutex_lock(&ib_peer_mem->lock); + invalidation_ctx->context_ticket = + ib_peer_insert_context(ib_peer_mem, invalidation_ctx); + /* unlock before calling get pages to prevent a dead-lock from the callback */ + mutex_unlock(&ib_peer_mem->lock); } - return lents; + ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1, + &umem->sg_head, + umem->peer_mem_client_context, 
+ invalidation_ctx ? + (void *)invalidation_ctx->context_ticket : NULL); + + if (invalidation_ctx) { + /* taking the lock back, checking that wasn't invalidated at that time */ + mutex_lock(&ib_peer_mem->lock); + if (invalidation_ctx->peer_invalidated) { + printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n"); + ret = -EINVAL; + } + } + + if (ret) + goto out; + + umem->page_size = peer_mem->get_page_size + (umem->peer_mem_client_context); + if (umem->page_size <= 0) + goto put_pages; + + umem->offset = addr & ((unsigned long)umem->page_size - 1); + ret = peer_mem->dma_map(&umem->sg_head, + umem->peer_mem_client_context, + umem->context->device->dma_device, + dmasync, + &umem->nmap); + if (ret) + goto put_pages; + + ib_peer_mem->stats.num_reg_pages += + umem->nmap * (umem->page_size >> PAGE_SHIFT); + ib_peer_mem->stats.num_alloc_mrs += 1; + return umem; + +put_pages: + + peer_mem->put_pages(umem->peer_mem_client_context, + &umem->sg_head); +out: + if (invalidation_ctx) { + ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); + mutex_unlock(&umem->ib_peer_mem->lock); + kfree(invalidation_ctx); + } + + ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, + umem->peer_mem_srcu_key); + kfree(umem); + return ERR_PTR(ret); } -static void dma_unmap_sg_ia64(struct ib_device *ibdev, - struct scatterlist *sg, - int nents, - enum dma_data_direction dir) +static void peer_umem_release(struct ib_umem *umem) { - int i; - struct device *dev; + struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem; + const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; + struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; - if (!dma_map_sg_hp_wa) - return ib_dma_unmap_sg(ibdev, sg, nents, dir); + if (invalidation_ctx) { + + int peer_callback; + int inflight_invalidation; + /* If we are not under peer callback we must take the lock before removing + * core ticket from the tree and releasing its umem. + * It will let any inflight callbacks to be ended safely. + * If we are under peer callback or under error flow of reg_mr so that context + * wasn't activated yet lock was already taken. + */ + if (invalidation_ctx->func && !invalidation_ctx->peer_callback) + mutex_lock(&ib_peer_mem->lock); + ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); + /* make sure to check inflight flag after took the lock and remove from tree. + * in addition, from that point using local variables for peer_callback and + * inflight_invalidation as after the complete invalidation_ctx can't be accessed + * any more as it may be freed by the callback. 
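The comment above describes a lock-ordering rule rather than an API, so what follows is only a userspace pthread sketch of that shape, with invented names: the registration lock is dropped before calling into the peer client, whose invalidation callback may need the same lock, and the invalidated flag is re-checked once the lock is re-acquired. The kernel code additionally keeps the lock held on success until the invalidation notifier is armed.

#include <pthread.h>
#include <stdbool.h>

struct reg_state {
	pthread_mutex_t	lock;		/* protects the ticket and the flag */
	bool		invalidated;	/* set by the peer's callback */
};

static int register_with_peer(struct reg_state *st,
    int (*peer_get_pages)(void *cookie), void *cookie)
{
	int ret;

	pthread_mutex_lock(&st->lock);
	/* publish a ticket the callback can look up */
	pthread_mutex_unlock(&st->lock);	/* callback may take the lock */

	ret = peer_get_pages(cookie);		/* may invoke the callback */

	pthread_mutex_lock(&st->lock);
	if (st->invalidated)			/* pages were yanked meanwhile */
		ret = -1;
	pthread_mutex_unlock(&st->lock);
	return ret;
}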
+ */ + peer_callback = invalidation_ctx->peer_callback; + inflight_invalidation = invalidation_ctx->inflight_invalidation; + if (inflight_invalidation) + complete(&invalidation_ctx->comp); + /* On peer callback lock is handled externally */ + if (!peer_callback) + /* unlocking before put_pages */ + mutex_unlock(&ib_peer_mem->lock); + /* in case under callback context or callback is pending let it free the invalidation context */ + if (!peer_callback && !inflight_invalidation) + kfree(invalidation_ctx); + } + + peer_mem->dma_unmap(&umem->sg_head, + umem->peer_mem_client_context, + umem->context->device->dma_device); + peer_mem->put_pages(&umem->sg_head, + umem->peer_mem_client_context); + + ib_peer_mem->stats.num_dereg_pages += + umem->nmap * (umem->page_size >> PAGE_SHIFT); + ib_peer_mem->stats.num_dealloc_mrs += 1; + ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, + umem->peer_mem_srcu_key); + kfree(umem); + + return; - dev = ibdev->dma_device; - for (i = 0; i < nents; ++i) - dma_unmap_sg(dev, sg + i, 1, dir); } -#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir) -#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir) - -#endif - static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { -#ifdef __linux__ - struct ib_umem_chunk *chunk, *tmp; - int i; - list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { - ib_dma_unmap_sg_attrs(dev, chunk->page_list, - chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs); - for (i = 0; i < chunk->nents; ++i) { - struct page *page = sg_page(&chunk->page_list[i]); - if (umem->writable && dirty) - set_page_dirty_lock(page); - put_page(page); - } - kfree(chunk); - } -#else - struct ib_umem_chunk *chunk, *tmp; vm_object_t object; + struct scatterlist *sg; + struct page *page; int i; object = NULL; - list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { - ib_dma_unmap_sg_attrs(dev, chunk->page_list, - chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs); - for (i = 0; i < chunk->nents; ++i) { - struct page *page = sg_page(&chunk->page_list[i]); + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->nmap, + DMA_BIDIRECTIONAL); + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + page = sg_page(sg); if (umem->writable && dirty) { if (object && object != page->object) VM_OBJECT_WUNLOCK(object); @@ -148,14 +207,26 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d vm_page_dirty(page); } } - kfree(chunk); - } + sg_free_table(&umem->sg_head); if (object) VM_OBJECT_WUNLOCK(object); -#endif } +void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *cookie) +{ + struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; + + invalidation_ctx->func = func; + invalidation_ctx->cookie = cookie; + + /* from that point any pending invalidations can be called */ + mutex_unlock(&umem->ib_peer_mem->lock); + return; +} +EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); /** * ib_umem_get - Pin and DMA map userspace memory. 
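With the chunk list gone, every consumer of an ib_umem now walks the single scatterlist in umem->sg_head, as __ib_umem_release() does above. A kernel-style sketch of the typical walk that provider drivers perform when expanding the mapped segments into page-sized entries; it assumes the Linux scatterlist API that this tree provides through its compat headers, and the helper name is illustrative:

#include <linux/errno.h>
#include <linux/scatterlist.h>
#include <linux/types.h>

/* Expand each DMA segment of one sg_table into page-sized entries,
 * as drivers do when building a physical buffer list; nmap is the
 * value returned by ib_dma_map_sg(). */
static int fill_page_array(struct sg_table *table, int nmap,
    unsigned long page_size, u64 *pages, int max_pages)
{
	struct scatterlist *sg;
	int i, k, n = 0;

	for_each_sg(table->sgl, sg, nmap, i) {
		int len = sg_dma_len(sg) / page_size;

		for (k = 0; k < len; ++k) {
			if (n == max_pages)
				return -ENOMEM;
			pages[n++] = sg_dma_address(sg) + k * page_size;
		}
	}
	return n;
}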
* @context: userspace context to pin memory for @@ -164,163 +235,23 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync) +struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync, + int invalidation_supported) { -#ifdef __linux__ + struct ib_umem *umem; - struct page **page_list; - struct vm_area_struct **vma_list; - struct ib_umem_chunk *chunk; - unsigned long locked; - unsigned long lock_limit; - unsigned long cur_base; - unsigned long npages; - int ret; - int off; - int i; - DEFINE_DMA_ATTRS(attrs); - - if (dmasync) - dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); - else if (allow_weak_ordering) - dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs); - - if (!can_do_mlock()) - return ERR_PTR(-EPERM); - - umem = kmalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - - umem->context = context; - umem->length = size; - umem->offset = addr & ~PAGE_MASK; - umem->page_size = PAGE_SIZE; - /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" - * obviously require write access. "Remote atomic" can do - * things like fetch and add, which will modify memory, and - * "MW bind" can change permissions by binding a window. - */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); - - /* We assume the memory is from hugetlb until proved otherwise */ - umem->hugetlb = 1; - - INIT_LIST_HEAD(&umem->chunk_list); - - page_list = (struct page **) __get_free_page(GFP_KERNEL); - if (!page_list) { - kfree(umem); - return ERR_PTR(-ENOMEM); - } - - /* - * if we can't alloc the vma_list, it's not so bad; - * just assume the memory is not hugetlb memory - */ - vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); - if (!vma_list) - umem->hugetlb = 0; - - npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; - - down_write(¤t->mm->mmap_sem); - - locked = npages + current->mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { - ret = -ENOMEM; - goto out; - } - - cur_base = addr & PAGE_MASK; - - ret = 0; - - while (npages) { - ret = get_user_pages(current, current->mm, cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof (struct page *)), - 1, !umem->writable, page_list, vma_list); - - if (ret < 0) - goto out; - - cur_base += ret * PAGE_SIZE; - npages -= ret; - - off = 0; - - while (ret) { - chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) * - min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK), - GFP_KERNEL); - if (!chunk) { - ret = -ENOMEM; - goto out; - } - - chunk->attrs = attrs; - chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK); - sg_init_table(chunk->page_list, chunk->nents); - for (i = 0; i < chunk->nents; ++i) { - if (vma_list && - !is_vm_hugetlb_page(vma_list[i + off])) - umem->hugetlb = 0; - sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0); - } - - chunk->nmap = ib_dma_map_sg_attrs(context->device, - &chunk->page_list[0], - chunk->nents, - DMA_BIDIRECTIONAL, - &attrs); - if (chunk->nmap <= 0) { - for (i = 0; i < chunk->nents; ++i) - put_page(sg_page(&chunk->page_list[i])); - kfree(chunk); - - ret = -ENOMEM; - goto out; - } - - ret -= chunk->nents; - 
off += chunk->nents; - list_add_tail(&chunk->list, &umem->chunk_list); - } - - ret = 0; - } - -out: - if (ret < 0) { - __ib_umem_release(context->device, umem, 0); - kfree(umem); - } else - current->mm->locked_vm = locked; - - up_write(¤t->mm->mmap_sem); - if (vma_list) - free_page((unsigned long) vma_list); - free_page((unsigned long) page_list); - - return ret < 0 ? ERR_PTR(ret) : umem; -#else - struct ib_umem *umem; - struct ib_umem_chunk *chunk; struct proc *proc; pmap_t pmap; vm_offset_t end, last, start; vm_size_t npages; int error; - int ents; int ret; + int ents; int i; DEFINE_DMA_ATTRS(attrs); + struct scatterlist *sg, *sg_list_start; + int need_release = 0; error = priv_check(curthread, PRIV_VM_MLOCK); if (error) @@ -372,134 +303,115 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, * "MW bind" can change permissions by binding a window. */ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + + if (invalidation_supported || context->peer_mem_private_data) { + + struct ib_peer_memory_client *peer_mem_client; + + peer_mem_client = ib_get_peer_client(context, addr, size, + &umem->peer_mem_client_context, + &umem->peer_mem_srcu_key); + if (peer_mem_client) + return peer_umem_get(peer_mem_client, umem, addr, + dmasync, invalidation_supported); + } + umem->hugetlb = 0; - INIT_LIST_HEAD(&umem->chunk_list); pmap = vm_map_pmap(&proc->p_vmspace->vm_map); - ret = 0; - while (npages) { - ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); - chunk = kmalloc(sizeof(*chunk) + - (sizeof(struct scatterlist) * ents), - GFP_KERNEL); - if (!chunk) { - ret = -ENOMEM; + + if (npages == 0) { + ret = -EINVAL; goto out; } - chunk->attrs = attrs; - chunk->nents = ents; - sg_init_table(&chunk->page_list[0], ents); - for (i = 0; i < chunk->nents; ++i) { + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + + while (npages) { + + ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); + umem->npages += ents; + + for_each_sg(sg_list_start, sg, ents, i) { vm_paddr_t pa; pa = pmap_extract(pmap, start); if (pa == 0) { ret = -ENOMEM; - kfree(chunk); goto out; } - sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa), + sg_set_page(sg, PHYS_TO_VM_PAGE(pa), PAGE_SIZE, 0); npages--; start += PAGE_SIZE; } - chunk->nmap = ib_dma_map_sg_attrs(context->device, - &chunk->page_list[0], - chunk->nents, + /* preparing for next loop */ + sg_list_start = sg; + } + + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, DMA_BIDIRECTIONAL, &attrs); - if (chunk->nmap != chunk->nents) { - kfree(chunk); + if (umem->nmap != umem->npages) { ret = -ENOMEM; goto out; } - list_add_tail(&chunk->list, &umem->chunk_list); - } - out: if (ret < 0) { + if (need_release) __ib_umem_release(context->device, umem, 0); kfree(umem); } return ret < 0 ? 
ERR_PTR(ret) : umem; -#endif +} +EXPORT_SYMBOL(ib_umem_get_ex); + +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync) +{ + return ib_umem_get_ex(context, addr, + size, access, dmasync, 0); } EXPORT_SYMBOL(ib_umem_get); -#ifdef __linux__ -static void ib_umem_account(struct work_struct *work) -{ - struct ib_umem *umem = container_of(work, struct ib_umem, work); - - down_write(&umem->mm->mmap_sem); - umem->mm->locked_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); -} -#endif - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ void ib_umem_release(struct ib_umem *umem) { -#ifdef __linux__ - struct ib_ucontext *context = umem->context; - struct mm_struct *mm; - unsigned long diff; - __ib_umem_release(umem->context->device, umem, 1); - - mm = get_task_mm(current); - if (!mm) { - kfree(umem); - return; - } - - diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; - - /* - * We may be called with the mm's mmap_sem already held. This - * can happen when a userspace munmap() is the call that drops - * the last reference to our file and calls our release - * method. If there are memory regions to destroy, we'll end - * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting to the system workqueue. - */ - if (context->closing) { - if (!down_write_trylock(&mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_account); - umem->mm = mm; - umem->diff = diff; - - schedule_work(&umem->work); - return; - } - } else - down_write(&mm->mmap_sem); - - current->mm->locked_vm -= diff; - up_write(&mm->mmap_sem); - mmput(mm); -#else vm_offset_t addr, end, last, start; vm_size_t size; int error; + if (umem->ib_peer_mem) { + peer_umem_release(umem); + return; + } + __ib_umem_release(umem->context->device, umem, 1); + if (umem->context->closing) { kfree(umem); return; } + error = priv_check(curthread, PRIV_VM_MUNLOCK); + if (error) return; + addr = umem->start; size = umem->length; last = addr + size; @@ -507,69 +419,24 @@ void ib_umem_release(struct ib_umem *umem) end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); - -#endif kfree(umem); + } EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { - struct ib_umem_chunk *chunk; int shift; int i; int n; + struct scatterlist *sg; shift = ilog2(umem->page_size); n = 0; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (i = 0; i < chunk->nmap; ++i) - n += sg_dma_len(&chunk->page_list[i]) >> shift; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> shift; return n; } EXPORT_SYMBOL(ib_umem_page_count); - -/**********************************************/ -/* - * Stub functions for contiguous pages - - * We currently do not support this feature - */ -/**********************************************/ - -/** - * ib_cmem_release_contiguous_pages - release memory allocated by - * ib_cmem_alloc_contiguous_pages. - * @cmem: cmem struct to release - */ -void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem) -{ -} -EXPORT_SYMBOL(ib_cmem_release_contiguous_pages); - -/** - * * ib_cmem_alloc_contiguous_pages - allocate contiguous pages - * * @context: userspace context to allocate memory for - * * @total_size: total required size for that allocation. - * * @page_size_order: order of one contiguous page. 
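A hedged sketch of how a provider driver consumes this pair after the change: pin with ib_umem_get(), size its physical buffer list with ib_umem_page_count(), and drop the pin with ib_umem_release() on teardown. The helper name and the zero dmasync argument are illustrative, not taken from this tree.

#include <linux/err.h>
#include <rdma/ib_umem.h>

/* Pin a user region and report how many pages the PBL must hold;
 * on success the caller owns *umemp and must ib_umem_release() it. */
static int pin_and_count(struct ib_ucontext *ucontext, unsigned long addr,
    size_t len, int access, struct ib_umem **umemp)
{
	struct ib_umem *umem;

	umem = ib_umem_get(ucontext, addr, len, access, 0 /* dmasync */);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	*umemp = umem;
	return ib_umem_page_count(umem);
}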
- * */ -struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context, - unsigned long total_size, - unsigned long page_size_order) -{ - return NULL; -} -EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages); - -/** - * * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA - * * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages - * * @vma: VMA to inject pages into. - * */ -int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem, - struct vm_area_struct *vma) -{ - return 0; -} -EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma); diff --git a/sys/ofed/drivers/infiniband/core/user_mad.c b/sys/ofed/drivers/infiniband/core/user_mad.c index 161c65f7472b..cc4a65913b98 100644 --- a/sys/ofed/drivers/infiniband/core/user_mad.c +++ b/sys/ofed/drivers/infiniband/core/user_mad.c @@ -43,7 +43,9 @@ #include #include #include +#include #include +#include #include @@ -63,12 +65,9 @@ enum { }; /* - * Our lifetime rules for these structs are the following: each time a - * device special file is opened, we look up the corresponding struct - * ib_umad_port by minor in the umad_port[] table while holding the - * port_lock. If this lookup succeeds, we take a reference on the - * ib_umad_port's struct ib_umad_device while still holding the - * port_lock; if the lookup fails, we fail the open(). We drop these + * Our lifetime rules for these structs are the following: + * device special file is opened, we take a reference on the + * ib_umad_port's struct ib_umad_device. We drop these * references in the corresponding close(). * * In addition to references coming from open character devices, there @@ -76,12 +75,7 @@ enum { * module's reference taken when allocating the ib_umad_device in * ib_umad_add_one(). * - * When destroying an ib_umad_device, we clear all of its - * ib_umad_ports from umad_port[] while holding port_lock before - * dropping the module's reference to the ib_umad_device. This is - * always safe because any open() calls will either succeed and obtain - * a reference before we clear the umad_port[] entries, or fail after - * we clear the umad_port[] entries. + * When destroying an ib_umad_device, we drop the module's reference. 
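The lifetime rule spelled out above is the standard kref pattern: the device starts with the module's reference taken in add_one(), every open() takes another, and the final kref_put() frees the structure. A compressed, hypothetical sketch (names invented, details such as port teardown omitted):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_umad_dev {
	struct kref ref;
	/* ports, cdevs, ... */
};

static void my_umad_dev_release(struct kref *ref)
{
	kfree(container_of(ref, struct my_umad_dev, ref));
}

static struct my_umad_dev *my_umad_dev_alloc(void)
{
	struct my_umad_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (dev)
		kref_init(&dev->ref);	/* the module's reference */
	return dev;
}

static void my_umad_dev_open(struct my_umad_dev *dev)
{
	kref_get(&dev->ref);		/* one reference per open file */
}

static void my_umad_dev_close(struct my_umad_dev *dev)
{
	kref_put(&dev->ref, my_umad_dev_release);	/* last put frees */
}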
*/ struct ib_umad_port { @@ -99,6 +93,7 @@ struct ib_umad_port { struct ib_umad_device *umad_dev; int dev_num; u8 port_num; + struct list_head port_lst; }; struct ib_umad_device { @@ -135,18 +130,85 @@ static struct class *umad_class; static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); static DEFINE_SPINLOCK(port_lock); -static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS]; static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); +static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device); -static void ib_umad_release_dev(struct kref *ref) +static DEFINE_SPINLOCK(ports_list_lock); +static struct list_head ports_list; + + +static void remove_ports(struct kref *ref) { + int i; + struct ib_umad_port *p, *p1; struct ib_umad_device *dev = container_of(ref, struct ib_umad_device, ref); + for (i = 0; i <= dev->end_port - dev->start_port; ++i) { + struct ib_umad_port *port = &dev->port[i]; + + list_for_each_entry_safe(p, p1, &ports_list, port_lst) + if (p == port) { + list_del(&p->port_lst); + break; + } + } +} + +static void put_umad_dev(struct kref *ref) +{ + int ret, i; + struct ib_umad_device *dev = + container_of(ref, struct ib_umad_device, ref); + + spin_lock(&ports_list_lock); + ret = (kref_put(ref, remove_ports)); + spin_unlock(&ports_list_lock); + if (ret) { + for (i = 0; i <= dev->end_port - dev->start_port; ++i) { + if (dev->port[i].dev_num < IB_UMAD_MAX_PORTS) + clear_bit(dev->port[i].dev_num, dev_map); + else + clear_bit(dev->port[i].dev_num - IB_UMAD_MAX_PORTS, overflow_map); + cdev_del(dev->port[i].cdev); + cdev_del(dev->port[i].sm_cdev); + } kfree(dev); + } +} + +static void release_port(struct ib_umad_port *port) +{ + put_umad_dev(&port->umad_dev->ref); +} + + +static struct ib_umad_port *get_port(struct cdev *cdev) +{ + struct ib_umad_port *port; + + spin_lock(&ports_list_lock); + list_for_each_entry(port, &ports_list, port_lst) { + if (port->cdev == cdev || port->sm_cdev == cdev) { + kref_get(&port->umad_dev->ref); + spin_unlock(&ports_list_lock); + + return port; + } + } + spin_unlock(&ports_list_lock); + + return NULL; +} + +static void insert_port(struct ib_umad_port *port) +{ + spin_lock(&ports_list_lock); + list_add(&port->port_lst, &ports_list); + spin_unlock(&ports_list_lock); } static int hdr_size(struct ib_umad_file *file) @@ -466,8 +528,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err; } - if (packet->mad.hdr.id < 0 || - packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { + if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { ret = -EINVAL; goto err; } @@ -679,7 +740,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, file->already_used = 1; if (!file->use_pkey_index) { printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", curproc->p_comm); + "P_Key index support.\n", curthread->td_proc->p_comm); printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " "has info on the new ABI.\n"); } @@ -711,7 +772,7 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); - if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { ret = -EINVAL; goto out; } @@ -779,41 +840,33 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, /* * ib_umad_open() does not need the BKL: * - * - umad_port[] 
accesses are protected by port_lock, the - * ib_umad_port structures are properly reference counted, and + * - the ib_umad_port structures are properly reference counted, and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - the ioctl method does not affect any global state outside of the * file structure being operated on; - * - the port is added to umad_port[] as the last part of module - * initialization so the open method will either immediately run - * -ENXIO, or all required initialization will be done. */ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret = 0; - - spin_lock(&port_lock); - port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE]; - if (port) - kref_get(&port->umad_dev->ref); - spin_unlock(&port_lock); + int ret; + port = get_port(inode->i_cdev->si_drv1); if (!port) return -ENXIO; mutex_lock(&port->file_mutex); if (!port->ib_dev) { + release_port(port); ret = -ENXIO; goto out; } file = kzalloc(sizeof *file, GFP_KERNEL); if (!file) { - kref_put(&port->umad_dev->ref, ib_umad_release_dev); + release_port(port); ret = -ENOMEM; goto out; } @@ -830,6 +883,8 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); + ret = nonseekable_open(inode, filp); + out: mutex_unlock(&port->file_mutex); return ret; @@ -838,7 +893,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_device *dev = file->port->umad_dev; + struct ib_umad_port *port = file->port; struct ib_umad_packet *packet, *tmp; int already_dead; int i; @@ -867,7 +922,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) mutex_unlock(&file->port->file_mutex); kfree(file); - kref_put(&dev->ref, ib_umad_release_dev); + release_port(port); return 0; } @@ -882,7 +937,8 @@ static const struct file_operations umad_fops = { .compat_ioctl = ib_umad_compat_ioctl, #endif .open = ib_umad_open, - .release = ib_umad_close + .release = ib_umad_close, + .llseek = no_llseek, }; static int ib_umad_sm_open(struct inode *inode, struct file *filp) @@ -893,12 +949,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) }; int ret; - spin_lock(&port_lock); - port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS]; - if (port) - kref_get(&port->umad_dev->ref); - spin_unlock(&port_lock); - + port = get_port(inode->i_cdev->si_drv1); if (!port) return -ENXIO; @@ -922,10 +973,10 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) filp->private_data = port; - return 0; + return nonseekable_open(inode, filp); fail: - kref_put(&port->umad_dev->ref, ib_umad_release_dev); + release_port(port); return ret; } @@ -944,7 +995,7 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - kref_put(&port->umad_dev->ref, ib_umad_release_dev); + release_port(port); return ret; } @@ -952,7 +1003,8 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) static const struct file_operations umad_sm_fops = { .owner = THIS_MODULE, .open = ib_umad_sm_open, - .release = ib_umad_sm_close + .release = ib_umad_sm_close, + .llseek = no_llseek, }; static struct ib_client umad_client = { @@ -991,31 +1043,66 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *att } static 
CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static dev_t overflow_maj; +static int find_overflow_devnum(void) +{ + int ret; + + if (!overflow_maj) { + ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, + "infiniband_mad"); + if (ret) { + printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); + return ret; + } + } + + ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); + if (ret >= IB_UMAD_MAX_PORTS) + return -1; + + return ret; +} + static int ib_umad_init_port(struct ib_device *device, int port_num, struct ib_umad_port *port) { + int devnum; + dev_t base; + spin_lock(&port_lock); - port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (port->dev_num >= IB_UMAD_MAX_PORTS) { + devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); + if (devnum >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); + devnum = find_overflow_devnum(); + if (devnum < 0) return -1; + + spin_lock(&port_lock); + port->dev_num = devnum + IB_UMAD_MAX_PORTS; + base = devnum + overflow_maj; + set_bit(devnum, overflow_map); + } else { + port->dev_num = devnum; + base = devnum + base_dev; + set_bit(devnum, dev_map); } - set_bit(port->dev_num, dev_map); spin_unlock(&port_lock); port->ib_dev = device; port->port_num = port_num; - init_MUTEX(&port->sm_sem); + sema_init(&port->sm_sem, 1); mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); port->cdev = cdev_alloc(); if (!port->cdev) - return -1; - port->cdev->owner = THIS_MODULE; + goto err_cdev_c; + port->cdev->ops = &umad_fops; + port->cdev->owner = THIS_MODULE; kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num); - if (cdev_add(port->cdev, base_dev + port->dev_num, 1)) + if (cdev_add(port->cdev, base, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dma_device, @@ -1029,13 +1116,15 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->dev, &dev_attr_port)) goto err_dev; + base += IB_UMAD_MAX_PORTS; port->sm_cdev = cdev_alloc(); if (!port->sm_cdev) goto err_dev; - port->sm_cdev->owner = THIS_MODULE; + port->sm_cdev->ops = &umad_sm_fops; + port->sm_cdev->owner = THIS_MODULE; kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num); - if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1)) + if (cdev_add(port->sm_cdev, base, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dma_device, @@ -1049,10 +1138,6 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->sm_dev, &dev_attr_port)) goto err_sm_dev; - spin_lock(&port_lock); - umad_port[port->dev_num] = port; - spin_unlock(&port_lock); - return 0; err_sm_dev: @@ -1066,7 +1151,11 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, err_cdev: cdev_del(port->cdev); - clear_bit(port->dev_num, dev_map); +err_cdev_c: + if (port->dev_num < IB_UMAD_MAX_PORTS) + clear_bit(devnum, dev_map); + else + clear_bit(devnum, overflow_map); return -1; } @@ -1074,7 +1163,6 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; - int already_dead; int id; dev_set_drvdata(port->dev, NULL); @@ -1083,20 +1171,12 @@ static void ib_umad_kill_port(struct ib_umad_port *port) device_destroy(umad_class, port->cdev->dev); device_destroy(umad_class, port->sm_cdev->dev); - cdev_del(port->cdev); - cdev_del(port->sm_cdev); - - spin_lock(&port_lock); - umad_port[port->dev_num] = NULL; - 
spin_unlock(&port_lock); - mutex_lock(&port->file_mutex); port->ib_dev = NULL; list_for_each_entry(file, &port->file_list, port_list) { mutex_lock(&file->mutex); - already_dead = file->agents_dead; file->agents_dead = 1; mutex_unlock(&file->mutex); @@ -1106,8 +1186,6 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - - clear_bit(port->dev_num, dev_map); } static void ib_umad_add_one(struct ib_device *device) @@ -1136,10 +1214,12 @@ static void ib_umad_add_one(struct ib_device *device) umad_dev->start_port = s; umad_dev->end_port = e; + for (i = 0; i <= e - s; ++i) + insert_port(&umad_dev->port[i]); + for (i = s; i <= e; ++i) { umad_dev->port[i - s].umad_dev = umad_dev; - if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND) if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) goto err; } @@ -1150,10 +1230,9 @@ static void ib_umad_add_one(struct ib_device *device) err: while (--i >= s) - if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND) ib_umad_kill_port(&umad_dev->port[i - s]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + put_umad_dev(&umad_dev->ref); } static void ib_umad_remove_one(struct ib_device *device) @@ -1165,16 +1244,22 @@ static void ib_umad_remove_one(struct ib_device *device) return; for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) ib_umad_kill_port(&umad_dev->port[i]); - kref_put(&umad_dev->ref, ib_umad_release_dev); + put_umad_dev(&umad_dev->ref); +} + +static char *umad_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } static int __init ib_umad_init(void) { int ret; + INIT_LIST_HEAD(&ports_list); + ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { @@ -1189,6 +1274,8 @@ static int __init ib_umad_init(void) goto out_chrdev; } + umad_class->devnode = umad_devnode; + ret = class_create_file(umad_class, &class_attr_abi_version); if (ret) { printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); @@ -1218,6 +1305,8 @@ static void __exit ib_umad_cleanup(void) ib_unregister_client(&umad_client); class_destroy(umad_class); unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); + if (overflow_maj) + unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); } module_init(ib_umad_init); diff --git a/sys/ofed/drivers/infiniband/core/uverbs.h b/sys/ofed/drivers/infiniband/core/uverbs.h index fa64da542b95..8ca6498f8f67 100644 --- a/sys/ofed/drivers/infiniband/core/uverbs.h +++ b/sys/ofed/drivers/infiniband/core/uverbs.h @@ -41,10 +41,14 @@ #include #include #include +#include +#include #include +#include #include #include +#include /* * Our lifetime rules for these structs are the following: @@ -69,24 +73,26 @@ struct ib_uverbs_device { struct kref ref; + int num_comp_vectors; struct completion comp; - int devnum; - struct cdev *cdev; struct device *dev; struct ib_device *ib_dev; - int num_comp_vectors; + int devnum; + struct cdev cdev; + struct rb_root xrcd_tree; + struct mutex xrcd_tree_mutex; }; struct ib_uverbs_event_file { struct kref ref; struct file *filp; + int is_async; struct ib_uverbs_file *uverbs_file; spinlock_t lock; + int is_closed; wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; - int is_async; - int is_closed; }; struct ib_uverbs_file { @@ -120,9 +126,20 @@ struct ib_uevent_object { u32 events_reported; }; +struct 
ib_uxrcd_object { + struct ib_uobject uobject; + atomic_t refcnt; +}; + +struct ib_usrq_object { + struct ib_uevent_object uevent; + struct ib_uxrcd_object *uxrcd; +}; + struct ib_uqp_object { struct ib_uevent_object uevent; struct list_head mcast_list; + struct ib_uxrcd_object *uxrcd; }; struct ib_ucq_object { @@ -134,9 +151,8 @@ struct ib_ucq_object { u32 async_events_reported; }; -struct ib_uxrcd_object { +struct ib_udct_object { struct ib_uobject uobject; - struct list_head xrc_reg_qp_list; }; extern spinlock_t ib_uverbs_idr_lock; @@ -147,12 +163,14 @@ extern struct idr ib_uverbs_ah_idr; extern struct idr ib_uverbs_cq_idr; extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; -extern struct idr ib_uverbs_xrc_domain_idr; +extern struct idr ib_uverbs_xrcd_idr; +extern struct idr ib_uverbs_rule_idr; +extern struct idr ib_uverbs_dct_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, - int is_async, int *fd); + int is_async); struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, @@ -167,12 +185,24 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); -void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event, - void *context_ptr); -void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev, - struct ib_xrcd *xrcd); -int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file, - struct ib_xrcd *xrcd, u32 qp_num); +void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); + +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ib ib; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ @@ -186,6 +216,8 @@ IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); IB_UVERBS_DECLARE_CMD(dereg_mr); +IB_UVERBS_DECLARE_CMD(alloc_mw); +IB_UVERBS_DECLARE_CMD(dealloc_mw); IB_UVERBS_DECLARE_CMD(create_comp_channel); IB_UVERBS_DECLARE_CMD(create_cq); IB_UVERBS_DECLARE_CMD(resize_cq); @@ -193,6 +225,7 @@ IB_UVERBS_DECLARE_CMD(poll_cq); IB_UVERBS_DECLARE_CMD(req_notify_cq); IB_UVERBS_DECLARE_CMD(destroy_cq); IB_UVERBS_DECLARE_CMD(create_qp); +IB_UVERBS_DECLARE_CMD(open_qp); IB_UVERBS_DECLARE_CMD(query_qp); IB_UVERBS_DECLARE_CMD(modify_qp); IB_UVERBS_DECLARE_CMD(destroy_qp); @@ -207,14 +240,30 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); -IB_UVERBS_DECLARE_CMD(create_xrc_srq); -IB_UVERBS_DECLARE_CMD(open_xrc_domain); -IB_UVERBS_DECLARE_CMD(close_xrc_domain); -IB_UVERBS_DECLARE_CMD(create_xrc_rcv_qp); -IB_UVERBS_DECLARE_CMD(modify_xrc_rcv_qp); -IB_UVERBS_DECLARE_CMD(query_xrc_rcv_qp); -IB_UVERBS_DECLARE_CMD(reg_xrc_rcv_qp); -IB_UVERBS_DECLARE_CMD(unreg_xrc_rcv_qp); +IB_UVERBS_DECLARE_CMD(create_xsrq); +IB_UVERBS_DECLARE_CMD(open_xrcd); +IB_UVERBS_DECLARE_CMD(close_xrcd); +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\ + struct ib_udata 
*ucore, \ + struct ib_udata *uhw) + +#define IB_UVERBS_DECLARE_EXP_CMD(name) \ + ssize_t ib_uverbs_exp_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); + +IB_UVERBS_DECLARE_EXP_CMD(create_qp); +IB_UVERBS_DECLARE_EXP_CMD(modify_cq); +IB_UVERBS_DECLARE_EXP_CMD(modify_qp); +IB_UVERBS_DECLARE_EXP_CMD(create_cq); +IB_UVERBS_DECLARE_EXP_CMD(query_device); +IB_UVERBS_DECLARE_EXP_CMD(create_dct); +IB_UVERBS_DECLARE_EXP_CMD(destroy_dct); +IB_UVERBS_DECLARE_EXP_CMD(query_dct); #endif /* UVERBS_H */ diff --git a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c index a34b344e5caf..5eef3f770d07 100644 --- a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c +++ b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c @@ -35,28 +35,68 @@ #include #include +#include +#include +#include #include +#include #include #include +#include #include "uverbs.h" -static struct lock_class_key pd_lock_key; -static struct lock_class_key mr_lock_key; -static struct lock_class_key cq_lock_key; -static struct lock_class_key qp_lock_key; -static struct lock_class_key ah_lock_key; -static struct lock_class_key srq_lock_key; +static int disable_raw_qp_enforcement; +module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int, + 0444); +MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for " + "being opened by root (default: 0)"); + +struct uverbs_lock_class { + struct lock_class_key key; + char name[16]; +}; + +static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" }; +static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" }; +static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" }; +static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" }; +static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; +static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; +static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; +static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; +static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" }; + +static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) +{ + return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; +} + +static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len) +{ + return copy_to_user(udata->outbuf, src, len) ? 
-EFAULT : 0; +} + +static struct ib_udata_ops uverbs_copy = { + .copy_from = uverbs_copy_from_udata, + .copy_to = uverbs_copy_to_udata +}; #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ do { \ + (udata)->ops = &uverbs_copy; \ (udata)->inbuf = (void __user *) (ibuf); \ (udata)->outbuf = (void __user *) (obuf); \ (udata)->inlen = (ilen); \ (udata)->outlen = (olen); \ } while (0) +enum uverbs_cmd_type { + IB_USER_VERBS_CMD_BASIC, + IB_USER_VERBS_CMD_EXTENDED +}; + /* * The ib_uobject locking scheme is as follows: * @@ -83,13 +123,13 @@ static struct lock_class_key srq_lock_key; */ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, - struct ib_ucontext *context, struct lock_class_key *key) + struct ib_ucontext *context, struct uverbs_lock_class *c) { uobj->user_handle = user_handle; uobj->context = context; kref_init(&uobj->ref); init_rwsem(&uobj->mutex); - lockdep_set_class(&uobj->mutex, key); + lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name); uobj->live = 0; } @@ -241,11 +281,34 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); } +static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context); + return uobj ? uobj->object : NULL; +} + static void put_qp_read(struct ib_qp *qp) { put_uobj_read(qp->uobject); } +static void put_qp_write(struct ib_qp *qp) +{ + put_uobj_write(qp->uobject); +} + +static struct ib_dct *idr_read_dct(int dct_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_dct_idr, dct_handle, context, 0); +} + +static void put_dct_read(struct ib_dct *dct) +{ + put_uobj_read(dct->uobject); +} + static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0); @@ -256,12 +319,10 @@ static void put_srq_read(struct ib_srq *srq) put_uobj_read(srq->uobject); } -static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, - struct ib_ucontext *context, +static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context, struct ib_uobject **uobj) { - *uobj = idr_read_uobj(&ib_uverbs_xrc_domain_idr, xrcd_handle, - context, 0); + *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0); return *uobj ? 
(*uobj)->object : NULL; } @@ -301,7 +362,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext = ibdev->alloc_ucontext(ibdev, &udata); if (IS_ERR(ucontext)) { - ret = PTR_ERR(file->ucontext); + ret = PTR_ERR(ucontext); goto err; } @@ -314,20 +375,23 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); INIT_LIST_HEAD(&ucontext->xrcd_list); + INIT_LIST_HEAD(&ucontext->rule_list); + INIT_LIST_HEAD(&ucontext->dct_list); ucontext->closing = 0; + ucontext->peer_mem_private_data = NULL; + ucontext->peer_mem_name = NULL; resp.num_comp_vectors = file->device->num_comp_vectors; - filp = ib_uverbs_alloc_event_file(file, 1, &resp.async_fd); + ret = get_unused_fd(); + if (ret < 0) + goto err_free; + resp.async_fd = ret; + + filp = ib_uverbs_alloc_event_file(file, 1); if (IS_ERR(filp)) { ret = PTR_ERR(filp); - goto err_free; - } - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_file; + goto err_fd; } file->async_file = filp->private_data; @@ -338,6 +402,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, if (ret) goto err_file; + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_file; + } kref_get(&file->async_file->ref); kref_get(&file->ref); file->ucontext = ucontext; @@ -349,9 +418,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, return in_len; err_file: - put_unused_fd(resp.async_fd); fput(filp); +err_fd: + put_unused_fd(resp.async_fd); + err_free: ibdev->dealloc_ucontext(ucontext); @@ -360,6 +431,55 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, return ret; } +static void ib_uverbs_query_device_assign( + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr, + struct ib_uverbs_file *file) +{ + memset(resp, 0, sizeof(*resp)); + + resp->fw_ver = attr->fw_ver; + resp->node_guid = file->device->ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = attr->device_cap_flags; + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = 
attr->local_ca_ack_delay; + resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; +} + ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -379,51 +499,10 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, if (ret) return ret; - memset(&resp, 0, sizeof resp); + ib_uverbs_query_device_assign(&resp, &attr, file); - resp.fw_ver = attr.fw_ver; - resp.node_guid = file->device->ib_dev->node_guid; - resp.sys_image_guid = attr.sys_image_guid; - resp.max_mr_size = attr.max_mr_size; - resp.page_size_cap = attr.page_size_cap; - resp.vendor_id = attr.vendor_id; - resp.vendor_part_id = attr.vendor_part_id; - resp.hw_ver = attr.hw_ver; - resp.max_qp = attr.max_qp; - resp.max_qp_wr = attr.max_qp_wr; - resp.device_cap_flags = attr.device_cap_flags; - resp.max_sge = attr.max_sge; - resp.max_sge_rd = attr.max_sge_rd; - resp.max_cq = attr.max_cq; - resp.max_cqe = attr.max_cqe; - resp.max_mr = attr.max_mr; - resp.max_pd = attr.max_pd; - resp.max_qp_rd_atom = attr.max_qp_rd_atom; - resp.max_ee_rd_atom = attr.max_ee_rd_atom; - resp.max_res_rd_atom = attr.max_res_rd_atom; - resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; - resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; - resp.atomic_cap = attr.atomic_cap; - resp.max_ee = attr.max_ee; - resp.max_rdd = attr.max_rdd; - resp.max_mw = attr.max_mw; - resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; - resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; - resp.max_mcast_grp = attr.max_mcast_grp; - resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; - resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; - resp.max_ah = attr.max_ah; - resp.max_fmr = attr.max_fmr; - resp.max_map_per_fmr = attr.max_map_per_fmr; - resp.max_srq = attr.max_srq; - resp.max_srq_wr = attr.max_srq_wr; - resp.max_srq_sge = attr.max_srq_sge; - resp.max_pkeys = attr.max_pkeys; - resp.local_ca_ack_delay = attr.local_ca_ack_delay; - resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) + if (copy_to_user((void __user *)(unsigned long) cmd.response, + &resp, sizeof(resp))) return -EFAULT; return in_len; @@ -469,7 +548,8 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; - resp.link_layer = attr.link_layer; + resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, + cmd.port_num); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -503,7 +583,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - init_uobj(uobj, 0, file->ucontext, &pd_lock_key); + init_uobj(uobj, 0, file->ucontext, &pd_lock_class); down_write(&uobj->mutex); pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, @@ -587,17 +667,316 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, return in_len; } -ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, +struct xrcd_table_entry { + struct rb_node node; + struct ib_xrcd *xrcd; + struct inode *inode; +}; + +static int xrcd_table_insert(struct ib_uverbs_device *dev, + struct inode *inode, + struct ib_xrcd *xrcd) +{ + struct xrcd_table_entry *entry, *scan; + struct rb_node **p = &dev->xrcd_tree.rb_node; + struct rb_node *parent = NULL; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->xrcd = xrcd; + entry->inode = inode; + + while (*p) { + parent = *p; + scan = 
rb_entry(parent, struct xrcd_table_entry, node); + + if (inode < scan->inode) { + p = &(*p)->rb_left; + } else if (inode > scan->inode) { + p = &(*p)->rb_right; + } else { + kfree(entry); + return -EEXIST; + } + } + + rb_link_node(&entry->node, parent, p); + rb_insert_color(&entry->node, &dev->xrcd_tree); + igrab(inode); + return 0; +} + +static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + struct rb_node *p = dev->xrcd_tree.rb_node; + + while (p) { + entry = rb_entry(p, struct xrcd_table_entry, node); + + if (inode < entry->inode) + p = p->rb_left; + else if (inode > entry->inode) + p = p->rb_right; + else + return entry; + } + + return NULL; +} + +static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (!entry) + return NULL; + + return entry->xrcd; +} + +static void xrcd_table_delete(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (entry) { + iput(inode); + rb_erase(&entry->node, &dev->xrcd_tree); + kfree(entry); + } +} + +ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_open_xrcd cmd; + struct ib_uverbs_open_xrcd_resp resp; + struct ib_udata udata; + struct ib_uxrcd_object *obj; + struct ib_xrcd *xrcd = NULL; + struct fd f = {NULL}; + struct inode *inode = NULL; + int ret = 0; + int new_xrcd = 0; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + mutex_lock(&file->device->xrcd_tree_mutex); + + if (cmd.fd != -1) { + /* search for file descriptor */ + f = fdget(cmd.fd); + if (!f.file) { + ret = -EBADF; + goto err_tree_mutex_unlock; + } + + inode = f.file->f_dentry->d_inode; + xrcd = find_xrcd(file->device, inode); + if (!xrcd && !(cmd.oflags & O_CREAT)) { + /* no file descriptor. 
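The fd/O_CREAT handling above matches the libibverbs XRC interface, where an XRC domain is shared by keying it on the inode behind a user-supplied file descriptor, and fd == -1 asks for a private domain. A hedged userspace sketch, assuming a libibverbs build that provides ibv_open_xrcd(); the path argument and helper name are illustrative:

#include <fcntl.h>
#include <infiniband/verbs.h>

/* Open (or create) an XRC domain shared by every process that opens
 * the same file; the file is only a rendezvous key.  Caller owns the
 * fd and is expected to keep it open for the life of the xrcd. */
static struct ibv_xrcd *open_shared_xrcd(struct ibv_context *ctx,
    const char *path)
{
	struct ibv_xrcd_init_attr attr = {
		.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS,
		.fd = open(path, O_RDONLY | O_CREAT, 0600),
		.oflags = O_CREAT,
	};

	if (attr.fd < 0)
		return NULL;
	return ibv_open_xrcd(ctx, &attr);
}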
Need CREATE flag */ + ret = -EAGAIN; + goto err_tree_mutex_unlock; + } + + if (xrcd && cmd.oflags & O_EXCL) { + ret = -EINVAL; + goto err_tree_mutex_unlock; + } + } + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) { + ret = -ENOMEM; + goto err_tree_mutex_unlock; + } + + init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class); + + down_write(&obj->uobject.mutex); + + if (!xrcd) { + xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, + file->ucontext, &udata); + if (IS_ERR(xrcd)) { + ret = PTR_ERR(xrcd); + goto err; + } + + xrcd->inode = inode; + xrcd->device = file->device->ib_dev; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + new_xrcd = 1; + } + + atomic_set(&obj->refcnt, 0); + obj->uobject.object = xrcd; + ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); + if (ret) + goto err_idr; + + memset(&resp, 0, sizeof resp); + resp.xrcd_handle = obj->uobject.id; + + if (inode) { + if (new_xrcd) { + /* create new inode/xrcd table entry */ + ret = xrcd_table_insert(file->device, inode, xrcd); + if (ret) + goto err_insert_xrcd; + } + atomic_inc(&xrcd->usecnt); + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (f.file) + fdput(f); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); + mutex_unlock(&file->mutex); + + obj->uobject.live = 1; + up_write(&obj->uobject.mutex); + + mutex_unlock(&file->device->xrcd_tree_mutex); + return in_len; + +err_copy: + if (inode) { + if (new_xrcd) + xrcd_table_delete(file->device, inode); + atomic_dec(&xrcd->usecnt); + } + +err_insert_xrcd: + idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); + +err_idr: + ib_dealloc_xrcd(xrcd); + +err: + put_uobj_write(&obj->uobject); + +err_tree_mutex_unlock: + if (f.file) + fdput(f); + + mutex_unlock(&file->device->xrcd_tree_mutex); + + return ret; +} + +ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_close_xrcd cmd; + struct ib_uobject *uobj; + struct ib_xrcd *xrcd = NULL; + struct inode *inode = NULL; + struct ib_uxrcd_object *obj; + int live; + int ret = 0; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + mutex_lock(&file->device->xrcd_tree_mutex); + uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext); + if (!uobj) { + ret = -EINVAL; + goto out; + } + + xrcd = uobj->object; + inode = xrcd->inode; + obj = container_of(uobj, struct ib_uxrcd_object, uobject); + if (atomic_read(&obj->refcnt)) { + put_uobj_write(uobj); + ret = -EBUSY; + goto out; + } + + if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { + ret = ib_dealloc_xrcd(uobj->object); + if (!ret) + uobj->live = 0; + } + + live = uobj->live; + if (inode && ret) + atomic_inc(&xrcd->usecnt); + + put_uobj_write(uobj); + + if (ret) + goto out; + + if (inode && !live) + xrcd_table_delete(file->device, inode); + + idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + ret = in_len; + +out: + mutex_unlock(&file->device->xrcd_tree_mutex); + return ret; +} + +void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, + struct ib_xrcd *xrcd) +{ + struct inode *inode; + + inode = xrcd->inode; + if (inode && !atomic_dec_and_test(&xrcd->usecnt)) + return; + + ib_dealloc_xrcd(xrcd); + + if (inode) + xrcd_table_delete(dev, inode); +} + +ssize_t 
ib_uverbs_reg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; - int ret; + int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -612,32 +991,34 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - /* - * Local write permission is required if remote write or - * remote atomic permission is also requested. - */ - if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && - !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) - return -EINVAL; + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; - init_uobj(uobj, 0, file->ucontext, &mr_lock_key); + init_uobj(uobj, 0, file->ucontext, &mr_lock_class); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { - ret = -EINVAL; + ret = -EINVAL; goto err_free; } + /* We first get a new "obj id" to be passed later to reg mr for + further use as mr_id. + */ + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); + if (ret) + goto err_put; mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, &udata, 0); + cmd.access_flags, &udata, uobj->id); if (IS_ERR(mr)) { ret = PTR_ERR(mr); - goto err_put; + goto err_remove_uobj; } mr->device = pd->device; @@ -647,9 +1028,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, atomic_set(&mr->usecnt, 0); uobj->object = mr; - ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); - if (ret) - goto err_unreg; memset(&resp, 0, sizeof resp); resp.lkey = mr->lkey; @@ -675,11 +1053,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, return in_len; err_copy: - idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - -err_unreg: ib_dereg_mr(mr); +err_remove_uobj: + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + err_put: put_pd_read(pd); @@ -689,13 +1067,13 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, } ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; struct ib_uobject *uobj; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -726,13 +1104,134 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, return in_len; } +ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mw *mw; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, 0, file->ucontext, &mw_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + mw = pd->device->alloc_mw(pd, cmd.mw_type); + if (IS_ERR(mw)) { + ret = PTR_ERR(mw); + goto err_put; + } + + mw->device = pd->device; + mw->pd = pd; + mw->uobject = uobj; + atomic_inc(&pd->usecnt); + + uobj->object = mw; + ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj); + if (ret) + goto err_unalloc; + + memset(&resp, 0, 
sizeof(resp)); + resp.rkey = mw->rkey; + resp.mw_handle = uobj->id; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err_copy; + } + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->mw_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + +err_unalloc: + ib_dealloc_mw(mw); + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dealloc_mw cmd; + struct ib_mw *mw; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + mw = uobj->object; + + ret = ib_dealloc_mw(mw); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_comp_channel cmd; struct ib_uverbs_create_comp_channel_resp resp; struct file *filp; + int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -740,9 +1239,16 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - filp = ib_uverbs_alloc_event_file(file, 0, &resp.fd); - if (IS_ERR(filp)) + ret = get_unused_fd(); + if (ret < 0) + return ret; + resp.fd = ret; + + filp = ib_uverbs_alloc_event_file(file, 0); + if (IS_ERR(filp)) { + put_unused_fd(resp.fd); return PTR_ERR(filp); + } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { @@ -755,40 +1261,44 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, return in_len; } -ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +static ssize_t create_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len, void *vcmd, int ex, + void __user *response) { - struct ib_uverbs_create_cq cmd; + struct ib_uverbs_create_cq *cmd; + struct ib_uverbs_create_cq_ex *cmd_e; struct ib_uverbs_create_cq_resp resp; struct ib_udata udata; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file = NULL; struct ib_cq *cq; + struct ib_cq_init_attr attr; + int cmd_sz; int ret; if (out_len < sizeof resp) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + cmd = vcmd; + cmd_e = vcmd; + cmd_sz = ex ? 
sizeof(*cmd_e) : sizeof(*cmd); + INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); - - if (cmd.comp_vector >= file->device->num_comp_vectors) + if (cmd->comp_vector >= file->device->num_comp_vectors) return -EINVAL; obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key); + init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, + &cq_lock_class); down_write(&obj->uobject.mutex); - if (cmd.comp_channel >= 0) { - ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel); + if (cmd->comp_channel >= 0) { + ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel); if (!ev_file) { ret = -EINVAL; goto err; @@ -801,8 +1311,12 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe, - cmd.comp_vector, + memset(&attr, 0, sizeof(attr)); + attr.cqe = cmd->cqe; + attr.comp_vector = cmd->comp_vector; + if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS)) + attr.flags = cmd_e->create_flags; + cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, file->ucontext, &udata); if (IS_ERR(cq)) { ret = PTR_ERR(cq); @@ -825,8 +1339,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, resp.cq_handle = obj->uobject.id; resp.cqe = cq->cqe; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { + if (copy_to_user(response, &resp, sizeof(resp))) { ret = -EFAULT; goto err_copy; } @@ -856,6 +1369,19 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, return ret; } +ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_cq cmd; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + return create_cq(file, buf, in_len, out_len, &cmd, + IB_USER_VERBS_CMD_BASIC, (void __user *)cmd.response); +} + ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -893,68 +1419,81 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, return ret ? 
ret : in_len; } +static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +{ + struct ib_uverbs_wc tmp; + + tmp.wr_id = wc->wr_id; + tmp.status = wc->status; + tmp.opcode = wc->opcode; + tmp.vendor_err = wc->vendor_err; + tmp.byte_len = wc->byte_len; + tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; + tmp.qp_num = wc->qp->qp_num; + tmp.src_qp = wc->src_qp; + tmp.wc_flags = wc->wc_flags; + tmp.pkey_index = wc->pkey_index; + tmp.slid = wc->slid; + tmp.sl = wc->sl; + tmp.dlid_path_bits = wc->dlid_path_bits; + tmp.port_num = wc->port_num; + tmp.reserved = 0; + + if (copy_to_user(dest, &tmp, sizeof tmp)) + return -EFAULT; + + return 0; +} + ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_poll_cq cmd; - struct ib_uverbs_poll_cq_resp *resp; + struct ib_uverbs_poll_cq_resp resp; + u8 __user *header_ptr; + u8 __user *data_ptr; struct ib_cq *cq; - struct ib_wc *wc; - int ret = 0; - int i; - int rsize; + struct ib_wc wc; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - wc = kmalloc(cmd.ne * sizeof *wc, GFP_KERNEL); - if (!wc) - return -ENOMEM; - - rsize = sizeof *resp + cmd.ne * sizeof(struct ib_uverbs_wc); - resp = kmalloc(rsize, GFP_KERNEL); - if (!resp) { - ret = -ENOMEM; - goto out_wc; - } - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!cq) { - ret = -EINVAL; - goto out; + if (!cq) + return -EINVAL; + + /* we copy a struct ib_uverbs_poll_cq_resp to user space */ + header_ptr = (void __user *)(unsigned long) cmd.response; + data_ptr = header_ptr + sizeof resp; + + memset(&resp, 0, sizeof resp); + while (resp.count < cmd.ne) { + ret = ib_poll_cq(cq, 1, &wc); + if (ret < 0) + goto out_put; + if (!ret) + break; + + ret = copy_wc_to_user(data_ptr, &wc); + if (ret) + goto out_put; + + data_ptr += sizeof(struct ib_uverbs_wc); + ++resp.count; } - resp->count = ib_poll_cq(cq, cmd.ne, wc); - - put_cq_read(cq); - - for (i = 0; i < resp->count; i++) { - resp->wc[i].wr_id = wc[i].wr_id; - resp->wc[i].status = wc[i].status; - resp->wc[i].opcode = wc[i].opcode; - resp->wc[i].vendor_err = wc[i].vendor_err; - resp->wc[i].byte_len = wc[i].byte_len; - resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data; - resp->wc[i].qp_num = wc[i].qp->qp_num; - resp->wc[i].src_qp = wc[i].src_qp; - resp->wc[i].wc_flags = wc[i].wc_flags; - resp->wc[i].pkey_index = wc[i].pkey_index; - resp->wc[i].slid = wc[i].slid; - resp->wc[i].sl = wc[i].sl; - resp->wc[i].dlid_path_bits = wc[i].dlid_path_bits; - resp->wc[i].port_num = wc[i].port_num; - } - - if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, rsize)) + if (copy_to_user(header_ptr, &resp, sizeof resp)) { ret = -EFAULT; + goto out_put; + } -out: - kfree(resp); + ret = in_len; -out_wc: - kfree(wc); - return ret ? 
ret : in_len; +out_put: + put_cq_read(cq); + return ret; } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, @@ -1035,124 +1574,181 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { - struct ib_uverbs_create_qp cmd; - struct ib_uverbs_create_qp_resp resp; + void __user *response; struct ib_udata udata; struct ib_uqp_object *obj; - struct ib_pd *pd; - struct ib_cq *scq, *rcq; - struct ib_srq *srq; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_qp_init_attr attr; - struct ib_xrcd *xrcd; - struct ib_uobject *xrcd_uobj; int ret; + union { + struct ib_uverbs_create_qp basic; + } cmd_obj; + struct ib_uverbs_create_qp *cmd; + size_t cmd_size = 0; + union { + struct ib_uverbs_create_qp_resp basic; + } resp_obj; + struct ib_uverbs_create_qp_resp *resp; + size_t resp_size = 0; - if (out_len < sizeof resp) + cmd_size = sizeof(cmd_obj.basic); + cmd = &cmd_obj.basic; + + resp_size = sizeof(resp_obj.basic); + resp = &resp_obj.basic; + + if (out_len < resp_size) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof cmd)) + if (copy_from_user(&cmd_obj, buf, cmd_size)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + response = (void __user *)cmd->response; - obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!disable_raw_qp_enforcement && + cmd->qp_type == IB_QPT_RAW_PACKET && !priv_check(curthread, PRIV_NET_RAW)) + return -EPERM; + + INIT_UDATA(&udata, buf + cmd_size, response + resp_size, + in_len - cmd_size, out_len - resp_size); + + obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; - init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); - srq = (cmd.is_srq && cmd.qp_type != IB_QPT_XRC) ? - idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; - xrcd = cmd.qp_type == IB_QPT_XRC ? - idr_read_xrcd(cmd.srq_handle, file->ucontext, &xrcd_uobj) : NULL; - pd = idr_read_pd(cmd.pd_handle, file->ucontext); - scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0); - rcq = cmd.recv_cq_handle == cmd.send_cq_handle ? 
- scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); + if (cmd->qp_type == IB_QPT_XRC_TGT) { + xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + device = xrcd->device; + } else { + if (cmd->qp_type == IB_QPT_XRC_INI) { + cmd->max_recv_wr = 0; + cmd->max_recv_sge = 0; + } else { + if (cmd->is_srq) { + srq = idr_read_srq(cmd->srq_handle, file->ucontext); + if (!srq || srq->srq_type != IB_SRQT_BASIC) { + ret = -EINVAL; + goto err_put; + } + } - if (!pd || !scq || !rcq || (cmd.is_srq && !srq) || - (cmd.qp_type == IB_QPT_XRC && !xrcd)) { - ret = -EINVAL; - goto err_put; + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } + } + + scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); + rcq = rcq ?: scq; + pd = idr_read_pd(cmd->pd_handle, file->ucontext); + if (!pd || !scq) { + ret = -EINVAL; + goto err_put; } - attr.create_flags = 0; + device = pd->device; + } + + memset(&attr, 0, sizeof attr); attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; - attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - attr.qp_type = cmd.qp_type; - attr.xrcd = xrcd; + attr.xrcd = xrcd; + attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr.qp_type = cmd->qp_type; attr.create_flags = 0; - attr.cap.max_send_wr = cmd.max_send_wr; - attr.cap.max_recv_wr = cmd.max_recv_wr; - attr.cap.max_send_sge = cmd.max_send_sge; - attr.cap.max_recv_sge = cmd.max_recv_sge; - attr.cap.max_inline_data = cmd.max_inline_data; + attr.cap.max_send_wr = cmd->max_send_wr; + attr.cap.max_recv_wr = cmd->max_recv_wr; + attr.cap.max_send_sge = cmd->max_send_sge; + attr.cap.max_recv_sge = cmd->max_recv_sge; + attr.cap.max_inline_data = cmd->max_inline_data; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); - qp = pd->device->create_qp(pd, &attr, &udata); + if (cmd->qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, &attr); + else + qp = device->create_qp(pd, &attr, &udata); + if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } - qp->device = pd->device; - qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->uobject = &obj->uevent.uobject; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; - qp->xrcd = attr.xrcd; - atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); - else if (attr.xrcd) - atomic_inc(&attr.xrcd->usecnt); + if (cmd->qp_type != IB_QPT_XRC_TGT) { + qp->real_qp = qp; + qp->device = device; + qp->pd = pd; + qp->send_cq = attr.send_cq; + qp->recv_cq = attr.recv_cq; + qp->srq = attr.srq; + qp->event_handler = attr.event_handler; + qp->qp_context = attr.qp_context; + qp->qp_type = attr.qp_type; + atomic_set(&qp->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + } + qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; - memset(&resp, 0, sizeof resp); - resp.qpn = qp->qp_num; - resp.qp_handle = obj->uevent.uobject.id; - 
resp.max_recv_sge = attr.cap.max_recv_sge; - resp.max_send_sge = attr.cap.max_send_sge; - resp.max_recv_wr = attr.cap.max_recv_wr; - resp.max_send_wr = attr.cap.max_send_wr; - resp.max_inline_data = attr.cap.max_inline_data; + memset(&resp_obj, 0, sizeof(resp_obj)); + resp->qpn = qp->qp_num; + resp->qp_handle = obj->uevent.uobject.id; + resp->max_recv_sge = attr.cap.max_recv_sge; + resp->max_send_sge = attr.cap.max_send_sge; + resp->max_recv_wr = attr.cap.max_recv_wr; + resp->max_send_wr = attr.cap.max_send_wr; + resp->max_inline_data = attr.cap.max_inline_data; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; + if (copy_to_user(response, &resp_obj, resp_size)) { + ret = -EFAULT; goto err_copy; - } + } - put_pd_read(pd); - put_cq_read(scq); - if (rcq != scq) + if (xrcd) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + put_xrcd_read(xrcd_uobj); + } + + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); + if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); - if (xrcd) - put_xrcd_read(xrcd_uobj); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); @@ -1171,6 +1767,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, ib_destroy_qp(qp); err_put: + if (xrcd) + put_xrcd_read(xrcd_uobj); if (pd) put_pd_read(pd); if (scq) @@ -1179,16 +1777,107 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, put_cq_read(rcq); if (srq) put_srq_read(srq); - if (xrcd) - put_xrcd_read(xrcd_uobj); put_uobj_write(&obj->uevent.uobject); return ret; } +ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_open_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ib_udata udata; + struct ib_uqp_object *obj; + struct ib_xrcd *xrcd; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_qp *qp; + struct ib_qp_open_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); + down_write(&obj->uevent.uobject.mutex); + + xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.qp_num = cmd.qpn; + attr.qp_type = cmd.qp_type; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + qp = ib_open_qp(xrcd, &attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + qp->uobject = &obj->uevent.uobject; + + obj->uevent.uobject.object = qp; + ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_remove; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + put_xrcd_read(xrcd_uobj); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, 
&file->ucontext->qp_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + up_write(&obj->uevent.uobject.mutex); + + return in_len; + +err_remove: + idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_qp(qp); + +err_put: + put_xrcd_read(xrcd_uobj); + put_uobj_write(&obj->uevent.uobject); + return ret; +} + ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_query_qp cmd; struct ib_uverbs_query_qp_resp resp; @@ -1286,30 +1975,59 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, return ret ? ret : in_len; } -ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +/* Remove ignored fields set in the attribute mask */ +static int modify_qp_mask(enum ib_qp_type qp_type, int mask) { - struct ib_uverbs_modify_qp cmd; - struct ib_udata udata; - struct ib_qp *qp; - struct ib_qp_attr *attr; - int ret; + switch (qp_type) { + case IB_QPT_XRC_INI: + return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); + case IB_QPT_XRC_TGT: + return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY); + default: + return mask; + } +} - if (copy_from_user(&cmd, buf, sizeof cmd)) +static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len, + enum uverbs_cmd_type cmd_type) +{ + struct ib_uverbs_modify_qp_ex cmd; + struct ib_udata udata; + struct ib_qp *qp; + struct ib_qp_attr *attr; + struct ib_qp_attr_ex *attrx; + int ret; + void *p; + union ib_gid sgid; + union ib_gid *dgid; + u8 port_num; + + if (cmd_type == IB_USER_VERBS_CMD_BASIC) { + p = &cmd; + p += sizeof(cmd.comp_mask); + if (copy_from_user(p, buf, + sizeof(struct ib_uverbs_modify_qp))) return -EFAULT; + } else { + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + } INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) + attrx = kzalloc(sizeof(*attrx), GFP_KERNEL); + if (!attrx) return -ENOMEM; + attr = (struct ib_qp_attr *)attrx; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { - ret = -EINVAL; - goto out; + kfree(attrx); + return -EINVAL; } attr->qp_state = cmd.qp_state; @@ -1357,10 +2075,49 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; + port_num = (cmd.attr_mask & IB_QP_PORT) ? 
cmd.port_num : qp->port_num; + if ((cmd.attr_mask & IB_QP_AV) && port_num && + (rdma_port_get_link_layer(qp->device, port_num) == + IB_LINK_LAYER_ETHERNET)) { + ret = ib_query_gid(qp->device, port_num, + attr->ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + dgid = &attr->ah_attr.grh.dgid; + if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) { + rdma_get_ll_mac((struct in6_addr *)dgid->raw, + attr->ah_attr.dmac); + rdma_get_ll_mac((struct in6_addr *)sgid.raw, + attr->smac); + attr->vlan_id = rdma_get_vlan_id(&sgid); + } else { + ret = rdma_addr_find_dmac_by_grh(&sgid, dgid, + attr->ah_attr.dmac, + &attr->vlan_id); + if (ret) + goto out; + ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac, + NULL); + if (ret) + goto out; + } + cmd.attr_mask |= IB_QP_SMAC; + if (attr->vlan_id < 0xFFFF) + cmd.attr_mask |= IB_QP_VID; + } + if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) { + if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY) + attrx->dct_key = cmd.dct_key; + } - ret = qp->device->modify_qp(qp, attr, cmd.attr_mask, &udata); - - put_qp_read(qp); + if (qp->real_qp == qp) { + ret = qp->device->modify_qp(qp, attr, + modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); + if (!ret && (cmd.attr_mask & IB_QP_PORT)) + qp->port_num = attr->port_num; + } else { + ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); + } if (ret) goto out; @@ -1368,18 +2125,27 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, ret = in_len; out: - kfree(attr); + put_qp_read(qp); + kfree(attrx); return ret; } +ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + return __uverbs_modify_qp(file, buf, in_len, out_len, + IB_USER_VERBS_CMD_BASIC); +} + ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_qp *qp; struct ib_uqp_object *obj; int ret = -EINVAL; @@ -1409,6 +2175,9 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, if (ret) return ret; + if (obj->uxrcd) + atomic_dec(&obj->uxrcd->refcnt); + idr_remove_uobj(&ib_uverbs_qp_idr, uobj); mutex_lock(&file->mutex); @@ -1429,14 +2198,14 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, } ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_post_send cmd; struct ib_uverbs_post_send_resp resp; struct ib_uverbs_send_wr *user_wr; struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; - struct ib_qp *qp; + struct ib_qp *qp; int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; @@ -1479,13 +2248,13 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); if (!next) { - ret = -ENOMEM; - goto out_put; - } + ret = -ENOMEM; + goto out_put; + } if (!last) wr = next; - else + else last->next = next; last = next; @@ -1500,7 +2269,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, file->ucontext); if (!next->wr.ud.ah) { ret = -EINVAL; - goto out_put; + goto out_put; } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; @@ -1555,12 +2324,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } resp.bad_wr = 0; - ret = qp->device->post_send(qp, wr, &bad_wr); + ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); if (ret) for (next = wr; 
next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) - break; + break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -1594,7 +2363,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, struct ib_recv_wr *wr = NULL, *last, *next; int sg_ind; int i; - int ret; + int ret; if (in_len < wqe_size * wr_count + sge_count * sizeof (struct ib_uverbs_sge)) @@ -1617,9 +2386,9 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, } if (user_wr->num_sge + sg_ind > sge_count) { - ret = -EINVAL; - goto err; - } + ret = -EINVAL; + goto err; + } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), @@ -1627,7 +2396,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, if (!next) { ret = -ENOMEM; goto err; - } + } if (!last) wr = next; @@ -1693,7 +2462,7 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, goto out; resp.bad_wr = 0; - ret = qp->device->post_recv(qp, wr, &bad_wr); + ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); put_qp_read(qp); @@ -1768,8 +2537,8 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_ah cmd; struct ib_uverbs_create_ah_resp resp; @@ -1789,10 +2558,10 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key); + init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); down_write(&uobj->mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; @@ -1863,7 +2632,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, struct ib_uverbs_destroy_ah cmd; struct ib_ah *ah; struct ib_uobject *uobj; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -1906,7 +2675,7 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = idr_read_qp(cmd.qp_handle, file->ucontext); + qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; @@ -1935,25 +2704,25 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, kfree(mcast); out_put: - put_qp_read(qp); + put_qp_write(qp); return ret ? ret : in_len; } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_detach_mcast cmd; struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = idr_read_qp(cmd.qp_handle, file->ucontext); + qp = idr_write_qp(cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; @@ -1972,102 +2741,122 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, } out_put: - put_qp_read(qp); + put_qp_write(qp); return ret ? 
ret : in_len; } -ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +static int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) { - struct ib_uverbs_create_srq cmd; struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; - struct ib_uevent_object *obj; + struct ib_usrq_object *obj; struct ib_pd *pd; struct ib_srq *srq; + struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_srq_init_attr attr; int ret; - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); - - obj = kmalloc(sizeof *obj, GFP_KERNEL); + obj = kmalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key); - down_write(&obj->uobject.mutex); + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); + down_write(&obj->uevent.uobject.mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (cmd->srq_type == IB_SRQT_XRC) { + attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + + attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); + if (!attr.ext.xrc.cq) { + ret = -EINVAL; + goto err_put_xrcd; + } + } + + pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; - goto err; - } + goto err_put_cq; + } attr.event_handler = ib_uverbs_srq_event_handler; attr.srq_context = file; - attr.attr.max_wr = cmd.max_wr; - attr.attr.max_sge = cmd.max_sge; - attr.attr.srq_limit = cmd.srq_limit; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; - obj->events_reported = 0; - INIT_LIST_HEAD(&obj->event_list); + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); - srq = pd->device->create_srq(pd, &attr, &udata); + srq = pd->device->create_srq(pd, &attr, udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); goto err_put; } - srq->device = pd->device; - srq->pd = pd; - srq->uobject = &obj->uobject; + srq->device = pd->device; + srq->pd = pd; + srq->srq_type = cmd->srq_type; + srq->uobject = &obj->uevent.uobject; srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; - srq->ext.xrc.cq = NULL; - srq->ext.xrc.xrcd = NULL; + + if (cmd->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.cq = attr.ext.xrc.cq; + srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; + atomic_inc(&attr.ext.xrc.cq->usecnt); + atomic_inc(&attr.ext.xrc.xrcd->usecnt); + } + atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); - obj->uobject.object = srq; - ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject); + obj->uevent.uobject.object = srq; + ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); if (ret) goto err_destroy; memset(&resp, 0, sizeof resp); - resp.srq_handle = obj->uobject.id; + resp.srq_handle = obj->uevent.uobject.id; resp.max_wr = attr.attr.max_wr; resp.max_sge = attr.attr.max_sge; + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; - if (copy_to_user((void __user *) (unsigned long) cmd.response, + if (copy_to_user((void __user *) (unsigned long) cmd->response, &resp, 
sizeof resp)) { ret = -EFAULT; goto err_copy; } + if (cmd->srq_type == IB_SRQT_XRC) { + put_uobj_read(xrcd_uobj); + put_cq_read(attr.ext.xrc.cq); + } put_pd_read(pd); mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->srq_list); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); mutex_unlock(&file->mutex); - obj->uobject.live = 1; + obj->uevent.uobject.live = 1; - up_write(&obj->uobject.mutex); + up_write(&obj->uevent.uobject.mutex); - return in_len; + return 0; err_copy: - idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject); + idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); err_destroy: ib_destroy_srq(srq); @@ -2075,25 +2864,62 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, err_put: put_pd_read(pd); +err_put_cq: + if (cmd->srq_type == IB_SRQT_XRC) + put_cq_read(attr.ext.xrc.cq); + +err_put_xrcd: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + put_uobj_read(xrcd_uobj); + } + err: - put_uobj_write(&obj->uobject); + put_uobj_write(&obj->uevent.uobject); return ret; } -ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) { - struct ib_uverbs_create_xsrq cmd; + struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; - struct ib_uevent_object *obj; - struct ib_pd *pd; - struct ib_srq *srq; - struct ib_cq *xrc_cq; - struct ib_xrcd *xrcd; - struct ib_srq_init_attr attr; - struct ib_uobject *xrcd_uobj; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, &xcmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_create_xsrq cmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; int ret; if (out_len < sizeof resp) @@ -2106,109 +2932,11 @@ ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; - - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, - &srq_lock_key); - down_write(&obj->uobject.mutex); - - pd = idr_read_pd(cmd.pd_handle, file->ucontext); - if (!pd) { - ret = -EINVAL; - goto err; - } - - xrc_cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!xrc_cq) { - ret = -EINVAL; - goto err_put_pd; - } - - xrcd = idr_read_xrcd(cmd.xrcd_handle, file->ucontext, &xrcd_uobj); - if (!xrcd) { - ret = -EINVAL; - goto err_put_cq; - } - - - attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = file; - attr.attr.max_wr = cmd.max_wr; - attr.attr.max_sge = cmd.max_sge; - attr.attr.srq_limit = cmd.srq_limit; - - obj->events_reported = 0; - INIT_LIST_HEAD(&obj->event_list); - - srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, 
&attr, &udata); - if (IS_ERR(srq)) { - ret = PTR_ERR(srq); - goto err_put; - } - - srq->device = pd->device; - srq->pd = pd; - srq->uobject = &obj->uobject; - srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; - srq->ext.xrc.cq = xrc_cq; - srq->ext.xrc.xrcd = xrcd; - atomic_inc(&pd->usecnt); - atomic_inc(&xrc_cq->usecnt); - atomic_inc(&xrcd->usecnt); - - atomic_set(&srq->usecnt, 0); - - obj->uobject.object = srq; - ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject); + ret = __uverbs_create_xsrq(file, &cmd, &udata); if (ret) - goto err_destroy; - - memset(&resp, 0, sizeof resp); - resp.srq_handle = obj->uobject.id; - resp.max_wr = attr.attr.max_wr; - resp.max_sge = attr.attr.max_sge; - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_copy; - } - - put_xrcd_read(xrcd_uobj); - put_cq_read(xrc_cq); - put_pd_read(pd); - - mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->srq_list); - mutex_unlock(&file->mutex); - - obj->uobject.live = 1; - - up_write(&obj->uobject.mutex); + return ret; return in_len; - -err_copy: - idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject); - -err_destroy: - ib_destroy_srq(srq); - -err_put: - put_xrcd_read(xrcd_uobj); - -err_put_cq: - put_cq_read(xrc_cq); - -err_put_pd: - put_pd_read(pd); - -err: - put_uobj_write(&obj->uobject); - return ret; } ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, @@ -2266,7 +2994,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, put_srq_read(srq); if (ret) - return ret; + return ret; memset(&resp, 0, sizeof resp); @@ -2282,8 +3010,8 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_destroy_srq cmd; struct ib_uverbs_destroy_srq_resp resp; @@ -2291,6 +3019,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, struct ib_srq *srq; struct ib_uevent_object *obj; int ret = -EINVAL; + struct ib_usrq_object *us; + enum ib_srq_type srq_type; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2300,6 +3030,7 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return -EINVAL; srq = uobj->object; obj = container_of(uobj, struct ib_uevent_object, uobject); + srq_type = srq->srq_type; ret = ib_destroy_srq(srq); if (!ret) @@ -2310,6 +3041,11 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, if (ret) return ret; + if (srq_type == IB_SRQT_XRC) { + us = container_of(obj, struct ib_usrq_object, uevent); + atomic_dec(&us->uxrcd->refcnt); + } + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); mutex_lock(&file->mutex); @@ -2330,313 +3066,467 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return ret ? 
ret : in_len; } -static struct inode *xrc_file2inode(struct file *f) +ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - return f->f_dentry->d_inode; -} + int in_len = ucore->inlen + uhw->inlen; + int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_create_dct cmd; + struct ib_uverbs_create_dct_resp resp; + struct ib_udata udata; + struct ib_udct_object *obj; + struct ib_dct *dct; + int ret; + struct ib_dct_init_attr attr; + struct ib_pd *pd = NULL; + struct ib_cq *cq = NULL; + struct ib_srq *srq = NULL; -struct xrcd_table_entry { - struct rb_node node; - struct inode *inode; - struct ib_xrcd *xrcd; -}; - -static int xrcd_table_insert(struct ib_device *dev, - struct inode *i_n, - struct ib_xrcd *xrcd) -{ - struct xrcd_table_entry *entry, *scan; - struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node; - struct rb_node *parent = NULL; - - entry = kmalloc(sizeof(struct xrcd_table_entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - entry->inode = i_n; - entry->xrcd = xrcd; - - while (*p) { - parent = *p; - scan = rb_entry(parent, struct xrcd_table_entry, node); - - if (i_n < scan->inode) - p = &(*p)->rb_left; - else if (i_n > scan->inode) - p = &(*p)->rb_right; - else { - kfree(entry); - return -EEXIST; - } - } - - rb_link_node(&entry->node, parent, p); - rb_insert_color(&entry->node, &dev->ib_uverbs_xrcd_table); - igrab(i_n); - return 0; -} - -static struct xrcd_table_entry *xrcd_table_search(struct ib_device *dev, - struct inode *i_n) -{ - struct xrcd_table_entry *scan; - struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node; - struct rb_node *parent = NULL; - - while (*p) { - parent = *p; - scan = rb_entry(parent, struct xrcd_table_entry, node); - - if (i_n < scan->inode) - p = &(*p)->rb_left; - else if (i_n > scan->inode) - p = &(*p)->rb_right; - else - return scan; - } - return NULL; -} - -static int find_xrcd(struct ib_device *dev, struct inode *i_n, - struct ib_xrcd **xrcd) -{ - struct xrcd_table_entry *entry; - - entry = xrcd_table_search(dev, i_n); - if (!entry) - return -EINVAL; - - *xrcd = entry->xrcd; - return 0; -} - - -static void xrcd_table_delete(struct ib_device *dev, - struct inode *i_n) -{ - struct xrcd_table_entry *entry = xrcd_table_search(dev, i_n); - - if (entry) { - iput(i_n); - rb_erase(&entry->node, &dev->ib_uverbs_xrcd_table); - kfree(entry); - } -} - -ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - struct ib_uverbs_open_xrc_domain cmd; - struct ib_uverbs_open_xrc_domain_resp resp; - struct ib_udata udata; - struct ib_uobject *uobj; - struct ib_uxrcd_object *xrcd_uobj; - struct ib_xrcd *xrcd = NULL; - struct file *f = NULL; - struct inode *inode = NULL; - int ret = 0; - int new_xrcd = 0; - - if (out_len < sizeof resp) + if (out_len < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); - - mutex_lock(&file->device->ib_dev->xrcd_table_mutex); - if (cmd.fd != (u32) (-1)) { - /* search for file descriptor */ - f = fget(cmd.fd); - if (!f) { - ret = -EBADF; - goto err_table_mutex_unlock; - } - - inode = xrc_file2inode(f); - if (!inode) { - ret = -EBADF; - goto err_table_mutex_unlock; - } - - ret = find_xrcd(file->device->ib_dev, inode, &xrcd); - if (ret && !(cmd.oflags & O_CREAT)) { - /* no file descriptor. 
Need CREATE flag */ - ret = -EAGAIN; - goto err_table_mutex_unlock; - } - - if (xrcd && cmd.oflags & O_EXCL) { - ret = -EINVAL; - goto err_table_mutex_unlock; - } - } - - xrcd_uobj = kmalloc(sizeof *xrcd_uobj, GFP_KERNEL); - if (!xrcd_uobj) { - ret = -ENOMEM; - goto err_table_mutex_unlock; - } - - uobj = &xrcd_uobj->uobject; - init_uobj(uobj, 0, file->ucontext, &pd_lock_key); - down_write(&uobj->mutex); - - if (!xrcd) { - xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, - file->ucontext, &udata); - if (IS_ERR(xrcd)) { - ret = PTR_ERR(xrcd); - goto err; - } - xrcd->uobject = (cmd.fd == -1) ? uobj : NULL; - xrcd->inode = inode; - xrcd->device = file->device->ib_dev; - atomic_set(&xrcd->usecnt, 0); - new_xrcd = 1; - } - - uobj->object = xrcd; - ret = idr_add_uobj(&ib_uverbs_xrc_domain_idr, uobj); + ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); if (ret) - goto err_idr; + return ret; - memset(&resp, 0, sizeof resp); - resp.xrcd_handle = uobj->id; + obj = kmalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; - if (inode) { - if (new_xrcd) { - /* create new inode/xrcd table entry */ - ret = xrcd_table_insert(file->device->ib_dev, inode, xrcd); - if (ret) - goto err_insert_xrcd; - } - atomic_inc(&xrcd->usecnt); + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, + &dct_lock_class); + down_write(&obj->uobject.mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_pd; } - if (f) - fput(f); - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) { + ret = -EINVAL; + goto err_put; + } + + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) { + ret = -EINVAL; + goto err_put; + } + + attr.cq = cq; + attr.access_flags = cmd.access_flags; + attr.min_rnr_timer = cmd.min_rnr_timer; + attr.srq = srq; + attr.tclass = cmd.tclass; + attr.flow_label = cmd.flow_label; + attr.dc_key = cmd.dc_key; + attr.mtu = cmd.mtu; + attr.port = cmd.port; + attr.pkey_index = cmd.pkey_index; + attr.gid_index = cmd.gid_index; + attr.hop_limit = cmd.hop_limit; + attr.create_flags = cmd.create_flags; + + dct = ib_create_dct(pd, &attr, &udata); + if (IS_ERR(dct)) { + ret = PTR_ERR(dct); + goto err_put; + } + + dct->device = file->device->ib_dev; + dct->uobject = &obj->uobject; + + obj->uobject.object = dct; + ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject); + if (ret) + goto err_dct; + + memset(&resp, 0, sizeof(resp)); + resp.dct_handle = obj->uobject.id; + resp.dctn = dct->dct_num; + + ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); + if (ret) goto err_copy; - } - - INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list); mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->xrcd_list); + list_add_tail(&obj->uobject.list, &file->ucontext->dct_list); mutex_unlock(&file->mutex); - uobj->live = 1; + obj->uobject.live = 1; - up_write(&uobj->mutex); + put_srq_read(srq); + put_cq_read(cq); + put_pd_read(pd); + + up_write(&obj->uobject.mutex); - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); return in_len; err_copy: + idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject); - if (inode) { - if (new_xrcd) - xrcd_table_delete(file->device->ib_dev, inode); - atomic_dec(&xrcd->usecnt); - } +err_dct: + ib_destroy_dct(dct); -err_insert_xrcd: - idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj); +err_put: + if (srq) + put_srq_read(srq); -err_idr: - ib_dealloc_xrcd(xrcd); + if (cq) + put_cq_read(cq); -err: - 
put_uobj_write(uobj); + put_pd_read(pd); -err_table_mutex_unlock: - - if (f) - fput(f); - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); +err_pd: + put_uobj_write(&obj->uobject); return ret; } -ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_close_xrc_domain cmd; - struct ib_uobject *uobj, *t_uobj; - struct ib_uxrcd_object *xrcd_uobj; - struct ib_xrcd *xrcd = NULL; - struct inode *inode = NULL; - int ret = 0; + int in_len = ucore->inlen + uhw->inlen; + int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_destroy_dct cmd; + struct ib_uverbs_destroy_dct_resp resp; + struct ib_uobject *uobj; + struct ib_dct *dct; + struct ib_udct_object *obj; + int ret; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + if (out_len < sizeof(resp)) + return -ENOSPC; - mutex_lock(&file->device->ib_dev->xrcd_table_mutex); - uobj = idr_write_uobj(&ib_uverbs_xrc_domain_idr, cmd.xrcd_handle, - file->ucontext); - if (!uobj) { - ret = -EINVAL; - goto err_unlock_mutex; - } + ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; - mutex_lock(&file->mutex); - if (!ret) { - list_for_each_entry(t_uobj, &file->ucontext->qp_list, list) { - struct ib_qp *qp = t_uobj->object; - if (qp->xrcd && qp->xrcd == uobj->object) { - ret = -EBUSY; - break; - } - } - } - if (!ret) { - list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) { - struct ib_srq *srq = t_uobj->object; - if (srq->ext.xrc.xrcd && srq->ext.xrc.xrcd == uobj->object) { - ret = -EBUSY; - break; - } - } - } - mutex_unlock(&file->mutex); - if (ret) { - put_uobj_write(uobj); - goto err_unlock_mutex; - } + uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext); + if (!uobj) + return -EINVAL; - xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); - if (!list_empty(&xrcd_uobj->xrc_reg_qp_list)) { - ret = -EBUSY; - put_uobj_write(uobj); - goto err_unlock_mutex; - } + dct = uobj->object; + obj = container_of(dct->uobject, struct ib_udct_object, uobject); - xrcd = (struct ib_xrcd *) (uobj->object); - inode = xrcd->inode; - - if (inode) - atomic_dec(&xrcd->usecnt); - - ret = ib_dealloc_xrcd(uobj->object); + ret = ib_destroy_dct(dct); if (!ret) uobj->live = 0; put_uobj_write(uobj); - if (ret && !inode) - goto err_unlock_mutex; + if (ret) + return ret; - if (!ret && inode) - xrcd_table_delete(file->device->ib_dev, inode); + idr_remove_uobj(&ib_uverbs_dct_idr, uobj); - idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj); + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + memset(&resp, 0, sizeof(resp)); + + put_uobj(uobj); + + ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + int in_len = ucore->inlen + uhw->inlen; + int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_query_dct cmd; + struct ib_uverbs_query_dct_resp resp; + struct ib_dct *dct; + struct ib_dct_attr *attr; + int err; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) { + err = -ENOMEM; + goto out; + } + + dct = idr_read_dct(cmd.dct_handle, file->ucontext); + if (!dct) { + 
err = -EINVAL; + goto out; + } + + err = ib_query_dct(dct, attr); + + put_dct_read(dct); + + if (err) + goto out; + + memset(&resp, 0, sizeof(resp)); + + resp.dc_key = attr->dc_key; + resp.access_flags = attr->access_flags; + resp.flow_label = attr->flow_label; + resp.key_violations = attr->key_violations; + resp.port = attr->port; + resp.min_rnr_timer = attr->min_rnr_timer; + resp.tclass = attr->tclass; + resp.mtu = attr->mtu; + resp.pkey_index = attr->pkey_index; + resp.gid_index = attr->gid_index; + resp.hop_limit = attr->hop_limit; + resp.state = attr->state; + + err = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); + +out: + kfree(attr); + + return err ? err : in_len; +} + +/* + * Experimental functions + */ + +static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; + +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ib_spec->type = kern_spec->type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); + memcpy(&ib_spec->eth.val, &kern_spec->eth.val, + sizeof(struct ib_flow_eth_filter)); + memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, + sizeof(struct ib_flow_eth_filter)); + break; + case IB_FLOW_SPEC_IB: + ib_spec->ib.size = sizeof(struct ib_flow_spec_ib); + memcpy(&ib_spec->ib.val, &kern_spec->ib.val, + sizeof(struct ib_flow_ib_filter)); + memcpy(&ib_spec->ib.mask, &kern_spec->ib.mask, + sizeof(struct ib_flow_ib_filter)); + break; + case IB_FLOW_SPEC_IPV4: + ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); + memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, + sizeof(struct ib_flow_ipv4_filter)); + memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, + sizeof(struct ib_flow_ipv4_filter)); + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); + memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, + sizeof(struct ib_flow_tcp_udp_filter)); + memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, + sizeof(struct ib_flow_tcp_udp_filter)); + break; + default: + return -EINVAL; + } + return 0; +} + +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_create_flow cmd; + struct ib_uverbs_create_flow_resp resp; + struct ib_uobject *uobj; + struct ib_flow *flow_id; + struct ib_uverbs_flow_attr *kern_flow_attr; + struct ib_flow_attr *flow_attr; + struct ib_qp *qp; + int err = 0; + void *kern_spec; + void *ib_spec; + int i; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); + + if (cmd.comp_mask) + return -EINVAL; + + if (!priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement) + return -EPERM; + + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + return -EINVAL; + + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs) { + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + + cmd.flow_attr.size, GFP_KERNEL); + if (!kern_flow_attr) + return -ENOMEM; + + memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) + goto err_free_attr; + } else { + kern_flow_attr = &cmd.flow_attr; + } + + uobj = kmalloc(sizeof(*uobj), 
GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto err_free_attr; + } + init_uobj(uobj, 0, file->ucontext, &rule_lock_class); + down_write(&uobj->mutex); + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { + err = -EINVAL; + goto err_uobj; + } + + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto err_put; + } + + flow_attr->type = kern_flow_attr->type; + flow_attr->priority = kern_flow_attr->priority; + flow_attr->num_of_specs = kern_flow_attr->num_of_specs; + flow_attr->port = kern_flow_attr->port; + flow_attr->flags = kern_flow_attr->flags; + flow_attr->size = sizeof(*flow_attr); + + kern_spec = kern_flow_attr + 1; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > + offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { + err = kern_spec_to_ib_spec(kern_spec, ib_spec); + if (err) + goto err_free; + flow_attr->size += + ((union ib_flow_spec *)ib_spec)->size; + cmd.flow_attr.size -= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size; + ib_spec += ((union ib_flow_spec *)ib_spec)->size; + } + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + goto err_free; + } + flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + if (IS_ERR(flow_id)) { + err = PTR_ERR(flow_id); + goto err_free; + } + flow_id->qp = qp; + flow_id->uobject = uobj; + uobj->object = flow_id; + + err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); + if (err) + goto destroy_flow; + + memset(&resp, 0, sizeof(resp)); + resp.flow_handle = uobj->id; + + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) + goto err_copy; + + put_qp_read(qp); + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rule_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + kfree(flow_attr); + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return 0; +err_copy: + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); +destroy_flow: + ib_destroy_flow(flow_id); +err_free: + kfree(flow_attr); +err_put: + put_qp_read(qp); +err_uobj: + put_uobj_write(uobj); +err_free_attr: + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return err; +} + +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_destroy_flow cmd; + struct ib_flow *flow_id; + struct ib_uobject *uobj; + int ret; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + flow_id = uobj->object; + + ret = ib_destroy_flow(flow_id); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); @@ -2644,380 +3534,378 @@ ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file, put_uobj(uobj); - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); - return in_len; - -err_unlock_mutex: - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); return ret; } -void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev, - struct ib_xrcd *xrcd) +ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { 
- struct inode *inode = NULL; - int ret = 0; + const char __user *buf = ucore->inbuf; + int in_len = ucore->inlen + uhw->inlen; + int out_len = ucore->outlen + uhw->outlen; - inode = xrcd->inode; - if (inode) - atomic_dec(&xrcd->usecnt); - - ret = ib_dealloc_xrcd(xrcd); - if (!ret && inode) - xrcd_table_delete(ib_dev, inode); + return __uverbs_modify_qp(file, buf, in_len, out_len, + IB_USER_VERBS_CMD_EXTENDED); } -ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + +ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - struct ib_uverbs_create_xrc_rcv_qp cmd; - struct ib_uverbs_create_xrc_rcv_qp_resp resp; - struct ib_uxrc_rcv_object *obj; - struct ib_qp_init_attr init_attr; - struct ib_xrcd *xrcd; - struct ib_uobject *uobj; - struct ib_uxrcd_object *xrcd_uobj; - u32 qp_num; - int err; + const char __user *buf = ucore->inbuf; + int in_len = ucore->inlen + uhw->inlen; + int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_create_cq_ex cmd; - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) + if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - obj = kzalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; - - xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); - if (!xrcd) { - err = -EINVAL; - goto err_out; - } - - init_attr.event_handler = ib_uverbs_xrc_rcv_qp_event_handler; - init_attr.qp_context = file; - init_attr.srq = NULL; - init_attr.sq_sig_type = - cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - init_attr.qp_type = IB_QPT_XRC; - init_attr.xrcd = xrcd; - - init_attr.cap.max_send_wr = 1; - init_attr.cap.max_recv_wr = 0; - init_attr.cap.max_send_sge = 1; - init_attr.cap.max_recv_sge = 0; - init_attr.cap.max_inline_data = 0; - - err = xrcd->device->create_xrc_rcv_qp(&init_attr, &qp_num); - if (err) - goto err_put; - - memset(&resp, 0, sizeof resp); - resp.qpn = qp_num; - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - err = -EFAULT; - goto err_destroy; - } - - atomic_inc(&xrcd->usecnt); - put_xrcd_read(uobj); - obj->qp_num = qp_num; - obj->domain_handle = cmd.xrc_domain_handle; - xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); - mutex_lock(&file->device->ib_dev->xrcd_table_mutex); - list_add_tail(&obj->list, &xrcd_uobj->xrc_reg_qp_list); - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); - - return in_len; - -err_destroy: - xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num); -err_put: - put_xrcd_read(uobj); -err_out: - kfree(obj); - return err; + return create_cq(file, buf, in_len, out_len, &cmd, + IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf); } -ssize_t ib_uverbs_modify_xrc_rcv_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - struct ib_uverbs_modify_xrc_rcv_qp cmd; - struct ib_qp_attr *attr; - struct ib_xrcd *xrcd; - struct ib_uobject *uobj; - int err; + const char __user *buf = ucore->inbuf; + int in_len = ucore->inlen + uhw->inlen; + struct ib_uverbs_modify_cq_ex cmd; + struct ib_cq *cq; + struct ib_cq_attr attr; + int ret; - if (copy_from_user(&cmd, buf, sizeof cmd)) + if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - attr = kzalloc(sizeof *attr, GFP_KERNEL); - if (!attr) - return -ENOMEM; - - xrcd = idr_read_xrcd(cmd.xrc_domain_handle, 
file->ucontext, &uobj); - if (!xrcd) { - kfree(attr); + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) return -EINVAL; - } - attr->qp_state = cmd.qp_state; - attr->cur_qp_state = cmd.cur_qp_state; - attr->qp_access_flags = cmd.qp_access_flags; - attr->pkey_index = cmd.pkey_index; - attr->port_num = cmd.port_num; - attr->path_mtu = cmd.path_mtu; - attr->path_mig_state = cmd.path_mig_state; - attr->qkey = cmd.qkey; - attr->rq_psn = cmd.rq_psn; - attr->sq_psn = cmd.sq_psn; - attr->dest_qp_num = cmd.dest_qp_num; - attr->alt_pkey_index = cmd.alt_pkey_index; - attr->en_sqd_async_notify = cmd.en_sqd_async_notify; - attr->max_rd_atomic = cmd.max_rd_atomic; - attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic; - attr->min_rnr_timer = cmd.min_rnr_timer; - attr->port_num = cmd.port_num; - attr->timeout = cmd.timeout; - attr->retry_cnt = cmd.retry_cnt; - attr->rnr_retry = cmd.rnr_retry; - attr->alt_port_num = cmd.alt_port_num; - attr->alt_timeout = cmd.alt_timeout; + attr.moderation.cq_count = cmd.cq_count; + attr.moderation.cq_period = cmd.cq_period; + attr.cq_cap_flags = cmd.cq_cap_flags; - memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16); - attr->ah_attr.grh.flow_label = cmd.dest.flow_label; - attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index; - attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit; - attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class; - attr->ah_attr.dlid = cmd.dest.dlid; - attr->ah_attr.sl = cmd.dest.sl; - attr->ah_attr.src_path_bits = cmd.dest.src_path_bits; - attr->ah_attr.static_rate = cmd.dest.static_rate; - attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0; - attr->ah_attr.port_num = cmd.dest.port_num; + ret = ib_modify_cq(cq, &attr, cmd.attr_mask); - memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16); - attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label; - attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index; - attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit; - attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class; - attr->alt_ah_attr.dlid = cmd.alt_dest.dlid; - attr->alt_ah_attr.sl = cmd.alt_dest.sl; - attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits; - attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; - attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; - attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; - - err = xrcd->device->modify_xrc_rcv_qp(xrcd, cmd.qp_num, attr, cmd.attr_mask); - put_xrcd_read(uobj); - kfree(attr); - return err ? 
err : in_len; -} - -ssize_t ib_uverbs_query_xrc_rcv_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - struct ib_uverbs_query_xrc_rcv_qp cmd; - struct ib_uverbs_query_qp_resp resp; - struct ib_qp_attr *attr; - struct ib_qp_init_attr *init_attr; - struct ib_xrcd *xrcd; - struct ib_uobject *uobj; - int ret; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - attr = kmalloc(sizeof *attr, GFP_KERNEL); - init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); - if (!attr || !init_attr) { - ret = -ENOMEM; - goto out; - } - - xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); - if (!xrcd) { - ret = -EINVAL; - goto out; - } - - ret = xrcd->device->query_xrc_rcv_qp(xrcd, cmd.qp_num, attr, - cmd.attr_mask, init_attr); - - put_xrcd_read(uobj); - - if (ret) - goto out; - - memset(&resp, 0, sizeof resp); - resp.qp_state = attr->qp_state; - resp.cur_qp_state = attr->cur_qp_state; - resp.path_mtu = attr->path_mtu; - resp.path_mig_state = attr->path_mig_state; - resp.qkey = attr->qkey; - resp.rq_psn = attr->rq_psn; - resp.sq_psn = attr->sq_psn; - resp.dest_qp_num = attr->dest_qp_num; - resp.qp_access_flags = attr->qp_access_flags; - resp.pkey_index = attr->pkey_index; - resp.alt_pkey_index = attr->alt_pkey_index; - resp.sq_draining = attr->sq_draining; - resp.max_rd_atomic = attr->max_rd_atomic; - resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; - resp.min_rnr_timer = attr->min_rnr_timer; - resp.port_num = attr->port_num; - resp.timeout = attr->timeout; - resp.retry_cnt = attr->retry_cnt; - resp.rnr_retry = attr->rnr_retry; - resp.alt_port_num = attr->alt_port_num; - resp.alt_timeout = attr->alt_timeout; - - memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); - resp.dest.flow_label = attr->ah_attr.grh.flow_label; - resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; - resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; - resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; - resp.dest.dlid = attr->ah_attr.dlid; - resp.dest.sl = attr->ah_attr.sl; - resp.dest.src_path_bits = attr->ah_attr.src_path_bits; - resp.dest.static_rate = attr->ah_attr.static_rate; - resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); - resp.dest.port_num = attr->ah_attr.port_num; - - memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); - resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; - resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; - resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; - resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; - resp.alt_dest.dlid = attr->alt_ah_attr.dlid; - resp.alt_dest.sl = attr->alt_ah_attr.sl; - resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; - resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; - resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); - resp.alt_dest.port_num = attr->alt_ah_attr.port_num; - - resp.max_send_wr = init_attr->cap.max_send_wr; - resp.max_recv_wr = init_attr->cap.max_recv_wr; - resp.max_send_sge = init_attr->cap.max_send_sge; - resp.max_recv_sge = init_attr->cap.max_recv_sge; - resp.max_inline_data = init_attr->cap.max_inline_data; - resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) - ret = -EFAULT; - -out: - kfree(attr); - kfree(init_attr); + put_cq_read(cq); return ret ? 
ret : in_len; } -ssize_t ib_uverbs_reg_xrc_rcv_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + +ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - struct ib_uverbs_reg_xrc_rcv_qp cmd; - struct ib_uxrc_rcv_object *qp_obj, *tmp; - struct ib_xrcd *xrcd; - struct ib_uobject *uobj; - struct ib_uxrcd_object *xrcd_uobj; - int ret; + struct ib_uverbs_exp_query_device_resp resp; + struct ib_exp_device_attr exp_attr; + int ret; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + if (ucore->outlen + uhw->outlen < sizeof(resp)) + return -ENOSPC; - qp_obj = kmalloc(sizeof *qp_obj, GFP_KERNEL); - if (!qp_obj) - return -ENOMEM; + memset(&resp, 0, sizeof(resp)); + memset(&exp_attr, 0, sizeof(exp_attr)); + ret = ib_exp_query_device(file->device->ib_dev, &exp_attr); + if (ret) + return ret; - xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); - if (!xrcd) { - ret = -EINVAL; - goto err_out; + ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file); + + resp.comp_mask = 0; + resp.device_cap_flags2 = 0; + + /* + * Handle regular attr fields + */ + if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) { + resp.timestamp_mask = exp_attr.base.timestamp_mask; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } - ret = xrcd->device->reg_xrc_rcv_qp(xrcd, file, cmd.qp_num); - if (ret) - goto err_put; + if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { + resp.hca_core_clock = exp_attr.base.hca_core_clock; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; + } - xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); - mutex_lock(&file->device->ib_dev->xrcd_table_mutex); - list_for_each_entry(tmp, &xrcd_uobj->xrc_reg_qp_list, list) - if (cmd.qp_num == tmp->qp_num) { - kfree(qp_obj); - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); - put_xrcd_read(uobj); - return in_len; + /* + * Handle experimental attr fields + */ + if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) { + resp.device_cap_flags2 = exp_attr.device_cap_flags2; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; + } + + if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) { + resp.dc_rd_req = exp_attr.dc_rd_req; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; + } + + if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) { + resp.dc_rd_res = exp_attr.dc_rd_res; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; + } + + if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) { + resp.inline_recv_sz = exp_attr.inline_recv_sz; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; + } + + if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) { + resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz; + resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; + } + + if (copy_to_user(ucore->outbuf, &resp, sizeof(resp))) + return -EFAULT; + + return ucore->inlen + uhw->inlen; +} + +ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) +{ + struct ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + struct ib_exp_qp_init_attr attr; + int ret; + struct ib_uverbs_exp_create_qp cmd_exp; + struct ib_uverbs_exp_create_qp_resp resp_exp; + struct ib_qp *parentqp = NULL; + + memset(&cmd_exp, 0, 
sizeof(cmd_exp)); + + ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); + if (ret) + return ret; + + if (!disable_raw_qp_enforcement && + cmd_exp.qp_type == IB_QPT_RAW_PACKET && !priv_check(curthread, + PRIV_NET_RAW)) + return -EPERM; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd_exp.user_handle, file->ucontext, + &qp_lock_class); + down_write(&obj->uevent.uobject.mutex); + + if (cmd_exp.qp_type == IB_QPT_XRC_TGT) { + xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; } - qp_obj->qp_num = cmd.qp_num; - qp_obj->domain_handle = cmd.xrc_domain_handle; - list_add_tail(&qp_obj->list, &xrcd_uobj->xrc_reg_qp_list); - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); - atomic_inc(&xrcd->usecnt); - put_xrcd_read(uobj); - return in_len; + device = xrcd->device; + } else { + if (cmd_exp.qp_type == IB_QPT_XRC_INI) { + cmd_exp.max_recv_wr = 0; + cmd_exp.max_recv_sge = 0; + } else { + if (cmd_exp.is_srq) { + srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext); + if (!srq || srq->srq_type != IB_SRQT_BASIC) { + ret = -EINVAL; + goto err_put; + } + } + + if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) { + rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } + } + + scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq); + rcq = rcq ?: scq; + pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext); + if (!pd || !scq) { + ret = -EINVAL; + goto err_put; + } + + device = pd->device; + } + + memset(&attr, 0, sizeof(attr)); + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.send_cq = scq; + attr.recv_cq = rcq; + attr.srq = srq; + attr.xrcd = xrcd; + attr.sq_sig_type = cmd_exp.sq_sig_all ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr.qp_type = cmd_exp.qp_type; + attr.create_flags = 0; + + attr.cap.max_send_wr = cmd_exp.max_send_wr; + attr.cap.max_recv_wr = cmd_exp.max_recv_wr; + attr.cap.max_send_sge = cmd_exp.max_send_sge; + attr.cap.max_recv_sge = cmd_exp.max_recv_sge; + attr.cap.max_inline_data = cmd_exp.max_inline_data; + + if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS) + attr.create_flags |= cmd_exp.qp_cap_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV); + + if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) { + struct ib_uverbs_qpg *qpg; + if (cmd_exp.qp_type != IB_QPT_RAW_PACKET && + cmd_exp.qp_type != IB_QPT_UD) { + ret = -EINVAL; + goto err_put; + } + qpg = &cmd_exp.qpg; + switch (qpg->qpg_type) { + case IB_QPG_PARENT: + attr.parent_attrib.rss_child_count = + qpg->parent_attrib.rss_child_count; + attr.parent_attrib.tss_child_count = + qpg->parent_attrib.tss_child_count; + break; + case IB_QPG_CHILD_RX: + case IB_QPG_CHILD_TX: + parentqp = idr_read_qp(qpg->parent_handle, + file->ucontext); + if (!parentqp) { + ret = -EINVAL; + goto err_put; + } + attr.qpg_parent = parentqp; + break; + default: + ret = -EINVAL; + goto err_put; + } + attr.qpg_type = qpg->qpg_type; + } + + if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) + attr.max_inl_recv = cmd_exp.max_inl_recv; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + if (cmd_exp.qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr); + else + qp = device->exp_create_qp(pd, &attr, uhw); + + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + if (cmd_exp.qp_type != IB_QPT_XRC_TGT) { + qp->real_qp = qp; + qp->device = device; + qp->pd = pd; + qp->send_cq = attr.send_cq; + qp->recv_cq = attr.recv_cq; + qp->srq = attr.srq; + qp->event_handler = attr.event_handler; + qp->qp_context = attr.qp_context; + qp->qp_type = attr.qp_type; + atomic_set(&qp->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + } + qp->uobject = &obj->uevent.uobject; + + obj->uevent.uobject.object = qp; + ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp_exp, 0, sizeof(resp_exp)); + resp_exp.qpn = qp->qp_num; + resp_exp.qp_handle = obj->uevent.uobject.id; + resp_exp.max_recv_sge = attr.cap.max_recv_sge; + resp_exp.max_send_sge = attr.cap.max_send_sge; + resp_exp.max_recv_wr = attr.cap.max_recv_wr; + resp_exp.max_send_wr = attr.cap.max_send_wr; + resp_exp.max_inline_data = attr.cap.max_inline_data; + + if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) { + resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV; + resp_exp.max_inl_recv = attr.max_inl_recv; + } + + ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); + if (ret) + goto err_copy; + + if (xrcd) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + put_xrcd_read(xrcd_uobj); + } + + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); + if (rcq && rcq != scq) + put_cq_read(rcq); + if (srq) + put_srq_read(srq); + if (parentqp) + put_qp_read(parentqp); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + 
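The experimental create-QP handler above only reads optional request fields whose bit is set in cmd_exp.comp_mask, and advertises in resp_exp.comp_mask which optional response fields it filled in. A small sketch of that comp_mask convention with made-up flag and field names (the real bits are the IB_UVERBS_EXP_CREATE_QP_* values used above):

/* comp_mask-gated optional fields (hypothetical request layout). */
#include <stdint.h>

enum {
        DEMO_REQ_CAP_FLAGS = 1u << 0,
        DEMO_REQ_INL_RECV  = 1u << 1,
};

struct demo_create_req {
        uint32_t comp_mask;     /* which optional fields below are valid */
        uint32_t cap_flags;
        uint32_t max_inl_recv;
};

static void apply_optional(const struct demo_create_req *req,
                           uint32_t *create_flags, uint32_t *inl_recv)
{
        if (req->comp_mask & DEMO_REQ_CAP_FLAGS)
                *create_flags = req->cap_flags;         /* only read when advertised */
        if (req->comp_mask & DEMO_REQ_INL_RECV)
                *inl_recv = req->max_inl_recv;
}

Gating every optional field on a bit is what lets older binaries, which never set the bit, keep working against the extended structure.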
up_write(&obj->uevent.uobject.mutex); + + return ucore->inlen + uhw->inlen; + +err_copy: + idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_qp(qp); err_put: - put_xrcd_read(uobj); -err_out: + if (xrcd) + put_xrcd_read(xrcd_uobj); + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); + if (rcq && rcq != scq) + put_cq_read(rcq); + if (srq) + put_srq_read(srq); + if (parentqp) + put_qp_read(parentqp); - kfree(qp_obj); + put_uobj_write(&obj->uevent.uobject); return ret; } -int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file, - struct ib_xrcd *xrcd, u32 qp_num) +int ib_exp_query_device(struct ib_device *device, + struct ib_exp_device_attr *device_attr) { - int err; - err = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num); - if (!err) - atomic_dec(&xrcd->usecnt); - return err; -} - -ssize_t ib_uverbs_unreg_xrc_rcv_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - struct ib_uverbs_unreg_xrc_rcv_qp cmd; - struct ib_uxrc_rcv_object *qp_obj, *tmp; - struct ib_xrcd *xrcd; - struct ib_uobject *uobj; - struct ib_uxrcd_object *xrcd_uobj; - int ret; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); - if (!xrcd) - return -EINVAL; - - ret = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, cmd.qp_num); - if (ret) { - put_xrcd_read(uobj); - return -EINVAL; - } - atomic_dec(&xrcd->usecnt); - - xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); - mutex_lock(&file->device->ib_dev->xrcd_table_mutex); - list_for_each_entry_safe(qp_obj, tmp, &xrcd_uobj->xrc_reg_qp_list, list) - if (cmd.qp_num == qp_obj->qp_num) { - list_del(&qp_obj->list); - kfree(qp_obj); - break; - } - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); - put_xrcd_read(uobj); - return in_len; + return device->exp_query_device(device, device_attr); } +EXPORT_SYMBOL(ib_exp_query_device); diff --git a/sys/ofed/drivers/infiniband/core/uverbs_main.c b/sys/ofed/drivers/infiniband/core/uverbs_main.c index 30b925915ea9..12bc0d32277f 100644 --- a/sys/ofed/drivers/infiniband/core/uverbs_main.c +++ b/sys/ofed/drivers/infiniband/core/uverbs_main.c @@ -39,8 +39,13 @@ #include #include #include +#include #include #include +#include +#include +#include +#include #include @@ -50,8 +55,6 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); MODULE_LICENSE("Dual BSD/GPL"); -#define INFINIBANDEVENTFS_MAGIC 0x49426576 /* "IBev" */ - enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, @@ -60,6 +63,31 @@ enum { #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) +static int uverbs_copy_from_udata_ex(void *dest, struct ib_udata *udata, size_t len) +{ + return copy_from_user(dest, udata->inbuf, min(udata->inlen, len)) ? -EFAULT : 0; +} + +static int uverbs_copy_to_udata_ex(struct ib_udata *udata, void *src, size_t len) +{ + return copy_to_user(udata->outbuf, src, min(udata->outlen, len)) ? 
-EFAULT : 0; +} + +static struct ib_udata_ops uverbs_copy_ex = { + .copy_from = uverbs_copy_from_udata_ex, + .copy_to = uverbs_copy_to_udata_ex +}; + +#define INIT_UDATA_EX(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->ops = &uverbs_copy_ex; \ + (udata)->inbuf = (void __user *)(ibuf); \ + (udata)->outbuf = (void __user *)(obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + + static struct class *uverbs_class; DEFINE_SPINLOCK(ib_uverbs_idr_lock); @@ -70,10 +98,11 @@ DEFINE_IDR(ib_uverbs_ah_idr); DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); -DEFINE_IDR(ib_uverbs_xrc_domain_idr); +DEFINE_IDR(ib_uverbs_xrcd_idr); +DEFINE_IDR(ib_uverbs_rule_idr); +DEFINE_IDR(ib_uverbs_dct_idr); -static spinlock_t map_lock; -static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES]; +static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, @@ -86,6 +115,8 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, + [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, @@ -107,20 +138,31 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, - [IB_USER_VERBS_CMD_CREATE_XRC_SRQ] = ib_uverbs_create_xrc_srq, - [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrc_domain, - [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrc_domain, - [IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP] = ib_uverbs_create_xrc_rcv_qp, - [IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP] = ib_uverbs_modify_xrc_rcv_qp, - [IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP] = ib_uverbs_query_xrc_rcv_qp, - [IB_USER_VERBS_CMD_REG_XRC_RCV_QP] = ib_uverbs_reg_xrc_rcv_qp, - [IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP] = ib_uverbs_unreg_xrc_rcv_qp, + [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, + [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, + [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, + [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, }; -#ifdef __linux__ -/* BSD Does not require a fake mountpoint for all files. 
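INIT_UDATA_EX above attaches copy callbacks that clamp every transfer to min(requested length, length the caller actually supplied), so shorter or longer userspace structures are tolerated rather than faulted. A rough userspace analogue of those clamped copies, with plain memcpy standing in for copy_from_user/copy_to_user and a demo_udata struct standing in for struct ib_udata:

/* Clamped request/response copies, a userspace stand-in for the udata ops above. */
#include <stddef.h>
#include <string.h>

struct demo_udata {
        const void *inbuf;
        void *outbuf;
        size_t inlen, outlen;
};

static int demo_copy_from(void *dst, const struct demo_udata *u, size_t len)
{
        memcpy(dst, u->inbuf, len < u->inlen ? len : u->inlen);
        return 0;       /* copy at most what the caller actually passed in */
}

static int demo_copy_to(const struct demo_udata *u, const void *src, size_t len)
{
        memcpy(u->outbuf, src, len < u->outlen ? len : u->outlen);
        return 0;       /* never write past the caller's response buffer */
}

A handler that zeroes its command structure before demo_copy_from() then sees absent trailing fields as zero, which is how the extended verbs stay compatible with older callers.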
*/ -static struct vfsmount *uverbs_event_mnt; -#endif +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, +}; + +static ssize_t (*uverbs_exp_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EXP_CMD_CREATE_QP] = ib_uverbs_exp_create_qp, + [IB_USER_VERBS_EXP_CMD_MODIFY_CQ] = ib_uverbs_exp_modify_cq, + [IB_USER_VERBS_EXP_CMD_MODIFY_QP] = ib_uverbs_exp_modify_qp, + [IB_USER_VERBS_EXP_CMD_CREATE_CQ] = ib_uverbs_exp_create_cq, + [IB_USER_VERBS_EXP_CMD_QUERY_DEVICE] = ib_uverbs_exp_query_device, + [IB_USER_VERBS_EXP_CMD_CREATE_DCT] = ib_uverbs_exp_create_dct, + [IB_USER_VERBS_EXP_CMD_DESTROY_DCT] = ib_uverbs_exp_destroy_dct, + [IB_USER_VERBS_EXP_CMD_QUERY_DCT] = ib_uverbs_exp_query_dct, +}; static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); @@ -195,6 +237,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, struct ib_ucontext *context) { struct ib_uobject *uobj, *tmp; + int err; if (!context) return 0; @@ -209,18 +252,55 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uobj); } + /* Remove MWs before QPs, in order to support type 2A MWs. */ + list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) { + struct ib_mw *mw = uobj->object; + + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + err = ib_dealloc_mw(mw); + if (err) { + pr_info("user_verbs: couldn't deallocate MW during cleanup.\n"); + pr_info("user_verbs: the system may have become unstable.\n"); + } + kfree(uobj); + } + list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { + struct ib_flow *flow_id = uobj->object; + + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); + ib_destroy_flow(flow_id); + kfree(uobj); + } + list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) { struct ib_qp *qp = uobj->object; struct ib_uqp_object *uqp = container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); + ib_uverbs_detach_umcast(qp, uqp); - ib_destroy_qp(qp); + err = ib_destroy_qp(qp); + if (err) + pr_info("destroying uverbs qp failed: err %d\n", err); + ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } + list_for_each_entry_safe(uobj, tmp, &context->dct_list, list) { + struct ib_dct *dct = uobj->object; + struct ib_udct_object *udct = + container_of(uobj, struct ib_udct_object, uobject); + + idr_remove_uobj(&ib_uverbs_dct_idr, uobj); + + err = ib_destroy_dct(dct); + if (err) + pr_info("destroying uverbs dct failed: err %d\n", err); + + kfree(udct); + } list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { struct ib_srq *srq = uobj->object; @@ -228,7 +308,9 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, container_of(uobj, struct ib_uevent_object, uobject); idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - ib_destroy_srq(srq); + err = ib_destroy_srq(srq); + if (err) + pr_info("destroying uverbs srq failed: err %d\n", err); ib_uverbs_release_uevent(file, uevent); kfree(uevent); } @@ -240,41 +322,37 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, container_of(uobj, struct ib_ucq_object, uobject); idr_remove_uobj(&ib_uverbs_cq_idr, uobj); - ib_destroy_cq(cq); + err = ib_destroy_cq(cq); + if (err) + pr_info("destroying uverbs cq failed: err %d\n", err); + 
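The cleanup path above walks each object list with list_for_each_entry_safe so an entry can be unlinked and freed during the iteration, and it now logs rather than ignores destroy failures. A generic sketch of that destroy-while-iterating shape on a plain singly linked list, not the kernel list API:

/* Destroy-while-iterating, the shape the cleanup loops above rely on. */
#include <stdlib.h>

struct node {
        struct node *next;
        /* ...object state... */
};

static void teardown(struct node **head)
{
        struct node *n = *head;

        while (n != NULL) {
                struct node *next = n->next;    /* grab the successor first */

                /* a destroy_object(n) call would go here; log, don't abort, on error */
                free(n);
                n = next;
        }
        *head = NULL;
}

Grabbing the successor pointer before freeing the current node is the whole trick; the _safe list macros do the same with a second cursor.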
ib_uverbs_release_ucq(file, ev_file, ucq); kfree(ucq); } - /* XXX Free MWs */ - list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { struct ib_mr *mr = uobj->object; idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - ib_dereg_mr(mr); + err = ib_dereg_mr(mr); + if (err) { + pr_info("user_verbs: couldn't deregister an MR during cleanup.\n"); + pr_info("user_verbs: the system may have become unstable.\n"); + } kfree(uobj); } - mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + mutex_lock(&file->device->xrcd_tree_mutex); list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) { struct ib_xrcd *xrcd = uobj->object; - struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1; - struct ib_uxrcd_object *xrcd_uobj = + struct ib_uxrcd_object *uxrcd = container_of(uobj, struct ib_uxrcd_object, uobject); - list_for_each_entry_safe(xrc_qp_obj, tmp1, - &xrcd_uobj->xrc_reg_qp_list, list) { - list_del(&xrc_qp_obj->list); - ib_uverbs_cleanup_xrc_rcv_qp(file, xrcd, - xrc_qp_obj->qp_num); - kfree(xrc_qp_obj); - } - - idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj); - ib_uverbs_dealloc_xrcd(file->device->ib_dev, xrcd); - kfree(uobj); + idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); + ib_uverbs_dealloc_xrcd(file->device, xrcd); + kfree(uxrcd); } - mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + mutex_unlock(&file->device->xrcd_tree_mutex); list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { struct ib_pd *pd = uobj->object; @@ -405,7 +483,8 @@ static const struct file_operations uverbs_event_fops = { .read = ib_uverbs_event_read, .poll = ib_uverbs_event_poll, .release = ib_uverbs_event_close, - .fasync = ib_uverbs_event_fasync + .fasync = ib_uverbs_event_fasync, + .llseek = no_llseek, }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) @@ -524,21 +603,13 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, NULL, NULL); } -void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event, - void *context_ptr) -{ - ib_uverbs_async_handler(context_ptr, event->element.xrc_qp_num, - event->event, NULL, NULL); -} - struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, - int is_async, int *fd) + int is_async) { struct ib_uverbs_event_file *ev_file; struct file *filp; - int ret; - ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL); + ev_file = kzalloc(sizeof *ev_file, GFP_KERNEL); if (!ev_file) return ERR_PTR(-ENOMEM); @@ -547,43 +618,22 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, INIT_LIST_HEAD(&ev_file->event_list); init_waitqueue_head(&ev_file->poll_wait); ev_file->uverbs_file = uverbs_file; - ev_file->async_queue = NULL; ev_file->is_async = is_async; - ev_file->is_closed = 0; - ev_file->filp = NULL; - - *fd = get_unused_fd(); - if (*fd < 0) { - ret = *fd; - goto err; - } /* * fops_get() can't fail here, because we're coming from a * system call on a uverbs file, which will already have a * module reference. 
*/ -#ifdef __linux__ - filp = alloc_file(uverbs_event_mnt, dget(uverbs_event_mnt->mnt_root), - FMODE_READ, fops_get(&uverbs_event_fops)); -#else filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops)); -#endif - if (!filp) { - ret = -ENFILE; - goto err_fd; + + if (IS_ERR(filp)) { + kfree(ev_file); + } else { + filp->private_data = ev_file; } - filp->private_data = ev_file; - return filp; - -err_fd: - put_unused_fd(*fd); - -err: - kfree(ev_file); - return ERR_PTR(ret); } /* @@ -594,16 +644,15 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd) { struct ib_uverbs_event_file *ev_file = NULL; - struct file *filp; + struct fd f = fdget(fd); - filp = fget(fd); - if (!filp) + if (!f.file) return NULL; - if (filp->f_op != &uverbs_event_fops) + if (f.file->f_op != &uverbs_event_fops) goto out; - ev_file = filp->private_data; + ev_file = f.file->private_data; if (ev_file->is_async) { ev_file = NULL; goto out; @@ -612,15 +661,225 @@ struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd) kref_get(&ev_file->ref); out: - fput(filp); + fdput(f); return ev_file; } +static const char *verbs_cmd_str(__u32 cmd) +{ + switch (cmd) { + case IB_USER_VERBS_CMD_GET_CONTEXT: + return "GET_CONTEXT"; + case IB_USER_VERBS_CMD_QUERY_DEVICE: + return "QUERY_DEVICE"; + case IB_USER_VERBS_CMD_QUERY_PORT: + return "QUERY_PORT"; + case IB_USER_VERBS_CMD_ALLOC_PD: + return "ALLOC_PD"; + case IB_USER_VERBS_CMD_DEALLOC_PD: + return "DEALLOC_PD"; + case IB_USER_VERBS_CMD_REG_MR: + return "REG_MR"; + case IB_USER_VERBS_CMD_DEREG_MR: + return "DEREG_MR"; + case IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL: + return "CREATE_COMP_CHANNEL"; + case IB_USER_VERBS_CMD_CREATE_CQ: + return "CREATE_CQ"; + case IB_USER_VERBS_CMD_RESIZE_CQ: + return "RESIZE_CQ"; + case IB_USER_VERBS_CMD_POLL_CQ: + return "POLL_CQ"; + case IB_USER_VERBS_CMD_REQ_NOTIFY_CQ: + return "REQ_NOTIFY_CQ"; + case IB_USER_VERBS_CMD_DESTROY_CQ: + return "DESTROY_CQ"; + case IB_USER_VERBS_CMD_CREATE_QP: + return "CREATE_QP"; + case IB_USER_VERBS_CMD_QUERY_QP: + return "QUERY_QP"; + case IB_USER_VERBS_CMD_MODIFY_QP: + return "MODIFY_QP"; + case IB_USER_VERBS_CMD_DESTROY_QP: + return "DESTROY_QP"; + case IB_USER_VERBS_CMD_POST_SEND: + return "POST_SEND"; + case IB_USER_VERBS_CMD_POST_RECV: + return "POST_RECV"; + case IB_USER_VERBS_CMD_POST_SRQ_RECV: + return "POST_SRQ_RECV"; + case IB_USER_VERBS_CMD_CREATE_AH: + return "CREATE_AH"; + case IB_USER_VERBS_CMD_DESTROY_AH: + return "DESTROY_AH"; + case IB_USER_VERBS_CMD_ATTACH_MCAST: + return "ATTACH_MCAST"; + case IB_USER_VERBS_CMD_DETACH_MCAST: + return "DETACH_MCAST"; + case IB_USER_VERBS_CMD_CREATE_SRQ: + return "CREATE_SRQ"; + case IB_USER_VERBS_CMD_MODIFY_SRQ: + return "MODIFY_SRQ"; + case IB_USER_VERBS_CMD_QUERY_SRQ: + return "QUERY_SRQ"; + case IB_USER_VERBS_CMD_DESTROY_SRQ: + return "DESTROY_SRQ"; + case IB_USER_VERBS_CMD_OPEN_XRCD: + return "OPEN_XRCD"; + case IB_USER_VERBS_CMD_CLOSE_XRCD: + return "CLOSE_XRCD"; + case IB_USER_VERBS_CMD_CREATE_XSRQ: + return "CREATE_XSRQ"; + case IB_USER_VERBS_CMD_OPEN_QP: + return "OPEN_QP"; + } + + return "Unknown command"; +} + +enum { + COMMAND_INFO_MASK = 0x1000, +}; + +static ssize_t ib_uverbs_exp_handle_cmd(struct ib_uverbs_file *file, + const char __user *buf, + struct ib_device *dev, + struct ib_uverbs_cmd_hdr *hdr, + size_t count, + int legacy_ex_cmd) +{ + struct ib_udata ucore; + struct ib_udata uhw; + struct ib_uverbs_ex_cmd_hdr ex_hdr; + __u32 command = hdr->command - 
IB_USER_VERBS_EXP_CMD_FIRST; + + if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + if (command >= ARRAY_SIZE(uverbs_exp_cmd_table) || + !uverbs_exp_cmd_table[command]) + return -EINVAL; + + if (!file->ucontext) + return -EINVAL; + + if (!(dev->uverbs_exp_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (legacy_ex_cmd) { + struct ib_uverbs_ex_cmd_hdr_legacy hxl; + struct ib_uverbs_ex_cmd_resp1_legacy resp1; + __u64 response; + ssize_t ret; + + if (count < sizeof(hxl)) + return -EINVAL; + + if (copy_from_user(&hxl, buf, sizeof(hxl))) + return -EFAULT; + + if (((hxl.in_words + hxl.provider_in_words) * 4) != count) + return -EINVAL; + + count -= sizeof(hxl); + buf += sizeof(hxl); + if (hxl.out_words || hxl.provider_out_words) { + if (count < sizeof(resp1)) + return -EINVAL; + if (copy_from_user(&resp1, buf, sizeof(resp1))) + return -EFAULT; + response = resp1.response; + if (!response) + return -EINVAL; + + /* + * Change user buffer to comply with new extension format. + */ + if (sizeof(resp1.comp_mask) != sizeof(resp1.response)) + return -EFAULT; + buf += sizeof(resp1.comp_mask); + if (copy_to_user(__DECONST(void __user *, buf), &resp1.comp_mask, + sizeof(resp1.response))) + return -EFAULT; + + } else { + response = 0; + } + + INIT_UDATA_EX(&ucore, + (hxl.in_words) ? buf : 0, + response, + hxl.in_words * 4, + hxl.out_words * 4); + + INIT_UDATA_EX(&uhw, + (hxl.provider_in_words) ? buf + ucore.inlen : 0, + (hxl.provider_out_words) ? response + ucore.outlen : 0, + hxl.provider_in_words * 4, + hxl.provider_out_words * 4); + + ret = uverbs_exp_cmd_table[command](file, &ucore, &uhw); + /* + * UnChange user buffer + */ + if (response && copy_to_user(__DECONST(void __user *, buf), &resp1.response, sizeof(resp1.response))) + return -EFAULT; + + return ret; + } else { + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr->in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr->out_words && !ex_hdr.provider_out_words) + return -EINVAL; + } else { + if (hdr->out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA_EX(&ucore, + (hdr->in_words) ? buf : 0, + (unsigned long)ex_hdr.response, + hdr->in_words * 8, + hdr->out_words * 8); + + INIT_UDATA_EX(&uhw, + (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, + (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + return uverbs_exp_cmd_table[command](file, &ucore, &uhw); + } +} + static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; + struct ib_device *dev = file->device->ib_dev; struct ib_uverbs_cmd_hdr hdr; + struct timespec ts1; + struct timespec ts2; + ktime_t t1, t2, delta; + s64 ds; + ssize_t ret; + u64 dividend; + u32 divisor; + __u32 flags; + __u32 command; + int legacy_ex_cmd = 0; + size_t written_count = count; if (count < sizeof hdr) return -EINVAL; @@ -628,20 +887,126 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; + /* + * For BWD compatibility change old style extension verbs commands + * to their equivalent experimental command. 
+ */ + if ((hdr.command >= IB_USER_VERBS_LEGACY_CMD_FIRST) && + (hdr.command <= IB_USER_VERBS_LEGACY_EX_CMD_LAST)) { + hdr.command += IB_USER_VERBS_EXP_CMD_FIRST - + IB_USER_VERBS_LEGACY_CMD_FIRST; + legacy_ex_cmd = 1; + } + + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + ktime_get_ts(&ts1); + if (!flags && (command >= IB_USER_VERBS_EXP_CMD_FIRST)) { + ret = ib_uverbs_exp_handle_cmd(file, buf, dev, &hdr, count, legacy_ex_cmd); + } else if (!flags) { + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; + + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) + return -EINVAL; + + if (!(dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + if (hdr.in_words * 4 != count) return -EINVAL; - if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command] || - !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) + ret = uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + struct ib_udata ucore; + struct ib_udata uhw; + struct ib_uverbs_ex_cmd_hdr ex_hdr; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) return -EINVAL; - if (!file->ucontext && - hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -EINVAL; - return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr, - hdr.in_words * 4, hdr.out_words * 4); + if (!file->ucontext) + return -EINVAL; + + if (!(dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA_EX(&ucore, + (hdr.in_words) ? buf : 0, + (unsigned long)ex_hdr.response, + hdr.in_words * 8, + hdr.out_words * 8); + + INIT_UDATA_EX(&uhw, + (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, + (ex_hdr.provider_out_words) ? 
ex_hdr.response + ucore.outlen : 0, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw); + + if (ret) + return ret; + + return written_count; + + } else { + return -EFAULT; + } + + if ((dev->cmd_perf & (COMMAND_INFO_MASK - 1)) == hdr.command) { + ktime_get_ts(&ts2); + t1 = timespec_to_ktime(ts1); + t2 = timespec_to_ktime(ts2); + delta = ktime_sub(t2, t1); + ds = ktime_to_ns(delta); + spin_lock(&dev->cmd_perf_lock); + dividend = dev->cmd_avg * dev->cmd_n + ds; + ++dev->cmd_n; + divisor = dev->cmd_n; + do_div(dividend, divisor); + dev->cmd_avg = dividend; + spin_unlock(&dev->cmd_perf_lock); + if (dev->cmd_perf & COMMAND_INFO_MASK) { + pr_info("%s: %s execution time = %lld nsec\n", + file->device->ib_dev->name, + verbs_cmd_str(hdr.command), + (long long)ds); + } + } + return ret; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) @@ -653,18 +1018,51 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) else return file->device->ib_dev->mmap(file->ucontext, vma); } +/* XXX Not supported in FreeBSD */ +#if 0 +static unsigned long ib_uverbs_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct ib_uverbs_file *file = filp->private_data; + + if (!file->ucontext) + return -ENODEV; + else { + if (!file->device->ib_dev->get_unmapped_area) + return current->mm->get_unmapped_area(filp, addr, len, + pgoff, flags); + + return file->device->ib_dev->get_unmapped_area(filp, addr, len, + pgoff, flags); + } +} +#endif + +static long ib_uverbs_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + + if (!file->device->ib_dev->ioctl) + return -ENOTSUPP; + + if (!file->ucontext) + return -ENODEV; + else + /* provider should provide it's own locking mechanism */ + return file->device->ib_dev->ioctl(file->ucontext, cmd, arg); +} /* * ib_uverbs_open() does not need the BKL: * - * - dev_table[] accesses are protected by map_lock, the - * ib_uverbs_device structures are properly reference counted, and + * - the ib_uverbs_device structures are properly reference counted and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - there is no ioctl method to race against; - * - the device is added to dev_table[] as the last part of module - * initialization, the open method will either immediately run - * -ENXIO, or all required initialization will be done. + * - the open method will either immediately run -ENXIO, or all + * required initialization will be done. 
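The cmd_perf block above keeps a running average of per-command execution time without storing samples, folding each new measurement in as avg = (avg * n + sample) / (n + 1); do_div handles the wide division in the kernel. A tiny standalone version of that update:

/* Incremental mean, as used by the per-command timing above. */
#include <stdint.h>

struct cmd_stats {
        uint64_t avg_ns;        /* running average so far */
        uint64_t n;             /* samples folded in so far */
};

static void record_sample(struct cmd_stats *s, uint64_t ns)
{
        s->avg_ns = (s->avg_ns * s->n + ns) / (s->n + 1);
        s->n++;
}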
*/ static int ib_uverbs_open(struct inode *inode, struct file *filp) { @@ -672,13 +1070,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) struct ib_uverbs_file *file; int ret; - spin_lock(&map_lock); - dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR]; + dev = container_of(inode->i_cdev->si_drv1, struct ib_uverbs_device, cdev); if (dev) kref_get(&dev->ref); - spin_unlock(&map_lock); - - if (!dev) + else return -ENXIO; if (!try_module_get(dev->ib_dev->owner)) { @@ -700,7 +1095,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) filp->private_data = file; - return 0; + return nonseekable_open(inode, filp); err_module: module_put(dev->ib_dev->owner); @@ -728,7 +1123,9 @@ static const struct file_operations uverbs_fops = { .owner = THIS_MODULE, .write = ib_uverbs_write, .open = ib_uverbs_open, - .release = ib_uverbs_close + .release = ib_uverbs_close, + .llseek = no_llseek, + .unlocked_ioctl = ib_uverbs_ioctl, }; static const struct file_operations uverbs_mmap_fops = { @@ -736,7 +1133,13 @@ static const struct file_operations uverbs_mmap_fops = { .write = ib_uverbs_write, .mmap = ib_uverbs_mmap, .open = ib_uverbs_open, - .release = ib_uverbs_close + .release = ib_uverbs_close, + .llseek = no_llseek, +/* XXX Not supported in FreeBSD */ +#if 0 + .get_unmapped_area = ib_uverbs_get_unmapped_area, +#endif + .unlocked_ioctl = ib_uverbs_ioctl, }; static struct ib_client uverbs_client = { @@ -757,6 +1160,18 @@ static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static ssize_t show_dev_ref_cnt(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_uverbs_device *dev = dev_get_drvdata(device); + + if (!dev) + return -ENODEV; + + return sprintf(buf, "%d\n", dev->ref.count); +} +static DEVICE_ATTR(ref_cnt, S_IRUGO, show_dev_ref_cnt, NULL); + static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { @@ -773,8 +1188,36 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *att { return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION); } + static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static dev_t overflow_maj; +static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); + +/* + * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by + * requesting a new major number and doubling the number of max devices we + * support. It's stupid, but simple. 
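find_overflow_devnum above registers a second character-device major lazily, the first time the fixed minor range is exhausted, and tracks the extra minors in a separate bitmap. A compact sketch of that two-pool idea, with word-sized bitmaps and the chrdev registration reduced to a stub flag:

/* Two-pool minor allocation: fixed range first, overflow pool on demand. */
#include <stdint.h>

#define POOL_SIZE 32                    /* stand-in for IB_UVERBS_MAX_DEVICES */

static uint32_t base_map;               /* one bit per minor in the fixed range */
static uint32_t overflow_map;           /* one bit per minor in the overflow range */
static int overflow_registered;         /* lazily "registered" second major */

static int alloc_minor(void)
{
        int i;

        for (i = 0; i < POOL_SIZE; i++) {
                if (!(base_map & (1u << i))) {
                        base_map |= 1u << i;
                        return i;                       /* fixed range */
                }
        }
        if (!overflow_registered)
                overflow_registered = 1;                /* alloc_chrdev_region() stub */
        for (i = 0; i < POOL_SIZE; i++) {
                if (!(overflow_map & (1u << i))) {
                        overflow_map |= 1u << i;
                        return POOL_SIZE + i;           /* overflow range */
                }
        }
        return -1;                                      /* both pools exhausted */
}

The add and remove paths then only need to know which pool a device came from, which is what the devnum < IB_UVERBS_MAX_DEVICES checks above decide.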
+ */ +static int find_overflow_devnum(void) +{ + int ret; + + if (!overflow_maj) { + ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, + "infiniband_verbs"); + if (ret) { + printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); + return ret; + } + } + + ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); + if (ret >= IB_UVERBS_MAX_DEVICES) + return -1; + + return ret; +} #include static ssize_t @@ -801,6 +1244,7 @@ show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf) return sprintf(buf, "0x%04x\n", ((struct pci_dev *)dev->ib_dev->dma_device)->vendor); } + static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL); struct attribute *device_attrs[] = @@ -817,6 +1261,8 @@ static struct attribute_group device_group = { static void ib_uverbs_add_one(struct ib_device *device) { + int devnum; + dev_t base; struct ib_uverbs_device *uverbs_dev; if (!device->alloc_ucontext) @@ -828,55 +1274,66 @@ static void ib_uverbs_add_one(struct ib_device *device) kref_init(&uverbs_dev->ref); init_completion(&uverbs_dev->comp); + uverbs_dev->xrcd_tree = RB_ROOT; + mutex_init(&uverbs_dev->xrcd_tree_mutex); spin_lock(&map_lock); - uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) { + devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); + if (devnum >= IB_UVERBS_MAX_DEVICES) { spin_unlock(&map_lock); + devnum = find_overflow_devnum(); + if (devnum < 0) goto err; + + spin_lock(&map_lock); + uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; + base = devnum + overflow_maj; + set_bit(devnum, overflow_map); + } else { + uverbs_dev->devnum = devnum; + base = devnum + IB_UVERBS_BASE_DEV; + set_bit(devnum, dev_map); } - set_bit(uverbs_dev->devnum, dev_map); spin_unlock(&map_lock); uverbs_dev->ib_dev = device; uverbs_dev->num_comp_vectors = device->num_comp_vectors; - uverbs_dev->cdev = cdev_alloc(); - if (!uverbs_dev->cdev) - goto err; - uverbs_dev->cdev->owner = THIS_MODULE; - uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; - kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum); - if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1)) + cdev_init(&uverbs_dev->cdev, NULL); + uverbs_dev->cdev.owner = THIS_MODULE; + uverbs_dev->cdev.ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; + kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); + if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; uverbs_dev->dev = device_create(uverbs_class, device->dma_device, - uverbs_dev->cdev->dev, uverbs_dev, + uverbs_dev->cdev.dev, uverbs_dev, "uverbs%d", uverbs_dev->devnum); if (IS_ERR(uverbs_dev->dev)) goto err_cdev; if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) goto err_class; + if (device_create_file(uverbs_dev->dev, &dev_attr_ref_cnt)) + goto err_class; if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group)) goto err_class; - spin_lock(&map_lock); - dev_table[uverbs_dev->devnum] = uverbs_dev; - spin_unlock(&map_lock); - ib_set_client_data(device, &uverbs_client, uverbs_dev); return; err_class: - device_destroy(uverbs_class, uverbs_dev->cdev->dev); + device_destroy(uverbs_class, uverbs_dev->cdev.dev); err_cdev: - cdev_del(uverbs_dev->cdev); - clear_bit(uverbs_dev->devnum, dev_map); + cdev_del(&uverbs_dev->cdev); + if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) + clear_bit(devnum, dev_map); + else + clear_bit(devnum, overflow_map); err: kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); @@ -894,42 +1351,30 @@ static void ib_uverbs_remove_one(struct ib_device *device) sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group); dev_set_drvdata(uverbs_dev->dev, NULL); - device_destroy(uverbs_class, uverbs_dev->cdev->dev); - cdev_del(uverbs_dev->cdev); - - spin_lock(&map_lock); - dev_table[uverbs_dev->devnum] = NULL; - spin_unlock(&map_lock); + device_destroy(uverbs_class, uverbs_dev->cdev.dev); + cdev_del(&uverbs_dev->cdev); + if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) clear_bit(uverbs_dev->devnum, dev_map); + else + clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); wait_for_completion(&uverbs_dev->comp); kfree(uverbs_dev); } -#ifdef __linux__ -static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "infinibandevent:", NULL, - INFINIBANDEVENTFS_MAGIC, mnt); -} -static struct file_system_type uverbs_event_fs = { - /* No owner field so module can be unloaded */ - .name = "infinibandeventfs", - .get_sb = uverbs_event_get_sb, - .kill_sb = kill_litter_super -}; -#endif +static char *uverbs_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} static int __init ib_uverbs_init(void) { int ret; - spin_lock_init(&map_lock); - ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { @@ -944,43 +1389,22 @@ static int __init ib_uverbs_init(void) goto out_chrdev; } + uverbs_class->devnode = uverbs_devnode; + ret = class_create_file(uverbs_class, &class_attr_abi_version); if (ret) { printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); goto out_class; } -#ifdef __linux__ - ret = register_filesystem(&uverbs_event_fs); - if (ret) { - printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n"); - goto out_class; - } - - uverbs_event_mnt = kern_mount(&uverbs_event_fs); - if (IS_ERR(uverbs_event_mnt)) { - ret = PTR_ERR(uverbs_event_mnt); - printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n"); - goto out_fs; - } -#endif - ret = ib_register_client(&uverbs_client); if (ret) { printk(KERN_ERR 
"user_verbs: couldn't register client\n"); - goto out_mnt; + goto out_class; } return 0; -out_mnt: -#ifdef __linux__ - mntput(uverbs_event_mnt); - -out_fs: - unregister_filesystem(&uverbs_event_fs); -#endif - out_class: class_destroy(uverbs_class); @@ -994,12 +1418,10 @@ static int __init ib_uverbs_init(void) static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); -#ifdef __linux__ - mntput(uverbs_event_mnt); - unregister_filesystem(&uverbs_event_fs); -#endif class_destroy(uverbs_class); unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); + if (overflow_maj) + unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); idr_destroy(&ib_uverbs_pd_idr); idr_destroy(&ib_uverbs_mr_idr); idr_destroy(&ib_uverbs_mw_idr); diff --git a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c index 5440da0e59b4..a541882f63fc 100644 --- a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c +++ b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include #include void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, @@ -40,18 +41,21 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, dst->grh.sgid_index = src->grh.sgid_index; dst->grh.hop_limit = src->grh.hop_limit; dst->grh.traffic_class = src->grh.traffic_class; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); dst->dlid = src->dlid; dst->sl = src->sl; dst->src_path_bits = src->src_path_bits; dst->static_rate = src->static_rate; dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0; dst->port_num = src->port_num; + dst->reserved = 0; } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { + dst->qp_state = src->qp_state; dst->cur_qp_state = src->cur_qp_state; dst->path_mtu = src->path_mtu; dst->path_mig_state = src->path_mig_state; @@ -83,6 +87,7 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->rnr_retry = src->rnr_retry; dst->alt_port_num = src->alt_port_num; dst->alt_timeout = src->alt_timeout; + memset(dst->reserved, 0, sizeof(dst->reserved)); } EXPORT_SYMBOL(ib_copy_qp_attr_to_user); diff --git a/sys/ofed/drivers/infiniband/core/verbs.c b/sys/ofed/drivers/infiniband/core/verbs.c index 023564f814d6..51a0ed50b7b4 100644 --- a/sys/ofed/drivers/infiniband/core/verbs.c +++ b/sys/ofed/drivers/infiniband/core/verbs.c @@ -38,10 +38,13 @@ #include #include +#include #include +#include #include #include +#include int ib_rate_to_mult(enum ib_rate rate) { @@ -77,6 +80,31 @@ enum ib_rate mult_to_ib_rate(int mult) } EXPORT_SYMBOL(mult_to_ib_rate); +int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { @@ -87,6 +115,8 @@ 
rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_MIC: + return RDMA_TRANSPORT_SCIF; default: BUG(); return 0; @@ -104,6 +134,8 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: return IB_LINK_LAYER_ETHERNET; + case RDMA_TRANSPORT_SCIF: + return IB_LINK_LAYER_SCIF; default: return IB_LINK_LAYER_UNSPECIFIED; } @@ -162,8 +194,29 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, u32 flow_class; u16 gid_index; int ret; + int is_eth = (rdma_port_get_link_layer(device, port_num) == + IB_LINK_LAYER_ETHERNET); memset(ah_attr, 0, sizeof *ah_attr); + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + if (wc->wc_flags & IB_WC_WITH_SMAC && + wc->wc_flags & IB_WC_WITH_VLAN) { + memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + ah_attr->vlan_id = wc->vlan_id; + } else { + ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, + ah_attr->dmac, &ah_attr->vlan_id); + if (ret) + return ret; + } + } else { + ah_attr->vlan_id = 0xffff; + } + + ah_attr->dlid = wc->slid; ah_attr->sl = wc->sl; ah_attr->src_path_bits = wc->dlid_path_bits; @@ -250,8 +303,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->uobject = NULL; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; - srq->ext.xrc.cq = NULL; - srq->ext.xrc.xrcd = NULL; + srq->srq_type = srq_init_attr->srq_type; + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + atomic_inc(&srq->ext.xrc.cq->usecnt); + } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } @@ -260,36 +318,6 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, } EXPORT_SYMBOL(ib_create_srq); -struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, - struct ib_cq *xrc_cq, - struct ib_xrcd *xrcd, - struct ib_srq_init_attr *srq_init_attr) -{ - struct ib_srq *srq; - - if (!pd->device->create_xrc_srq) - return ERR_PTR(-ENOSYS); - - srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, srq_init_attr, NULL); - - if (!IS_ERR(srq)) { - srq->device = pd->device; - srq->pd = pd; - srq->uobject = NULL; - srq->event_handler = srq_init_attr->event_handler; - srq->srq_context = srq_init_attr->srq_context; - srq->ext.xrc.cq = xrc_cq; - srq->ext.xrc.xrcd = xrcd; - atomic_inc(&pd->usecnt); - atomic_inc(&xrcd->usecnt); - atomic_inc(&xrc_cq->usecnt); - atomic_set(&srq->usecnt, 0); - } - - return srq; -} -EXPORT_SYMBOL(ib_create_xrc_srq); - int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask) @@ -308,27 +336,39 @@ int ib_query_srq(struct ib_srq *srq, } EXPORT_SYMBOL(ib_query_srq); +int ib_query_values(struct ib_device *device, + int q_values, struct ib_device_values *values) +{ + return device->query_values ? 
+		device->query_values(device, q_values, values) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_values);
+
 int ib_destroy_srq(struct ib_srq *srq)
 {
 	struct ib_pd *pd;
-	struct ib_cq *xrc_cq;
-	struct ib_xrcd *xrcd;
+	enum ib_srq_type srq_type;
+	struct ib_xrcd *uninitialized_var(xrcd);
+	struct ib_cq *uninitialized_var(cq);
 	int ret;
 
 	if (atomic_read(&srq->usecnt))
 		return -EBUSY;
 
 	pd = srq->pd;
-	xrc_cq = srq->ext.xrc.cq;
+	srq_type = srq->srq_type;
+	if (srq_type == IB_SRQT_XRC) {
 		xrcd = srq->ext.xrc.xrcd;
+		cq = srq->ext.xrc.cq;
+	}
 
 	ret = srq->device->destroy_srq(srq);
 	if (!ret) {
 		atomic_dec(&pd->usecnt);
-		if (xrc_cq)
-			atomic_dec(&xrc_cq->usecnt);
-		if (xrcd)
+		if (srq_type == IB_SRQT_XRC) {
 			atomic_dec(&xrcd->usecnt);
+			atomic_dec(&cq->usecnt);
+		}
 	}
 
 	return ret;
@@ -337,32 +377,130 @@ EXPORT_SYMBOL(ib_destroy_srq);
 
 /* Queue pairs */
 
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+	struct ib_qp *qp = context;
+	unsigned long flags;
+
+	/* The code below must be synced with deletions of existing qps (ib_close_qp) --
+	 * because a qp from the list may be closed during the scan, resulting in a kernel Oops.
+	 */
+	spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+	list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+		if (event->element.qp->event_handler)
+			event->element.qp->event_handler(event, event->element.qp->qp_context);
+	spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+				  void (*event_handler)(struct ib_event *, void *),
+				  void *qp_context)
+{
+	struct ib_qp *qp;
+	unsigned long flags;
+
+	qp = kzalloc(sizeof *qp, GFP_KERNEL);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
+
+	qp->real_qp = real_qp;
+	atomic_inc(&real_qp->usecnt);
+	qp->device = real_qp->device;
+	qp->event_handler = event_handler;
+	qp->qp_context = qp_context;
+	qp->qp_num = real_qp->qp_num;
+	qp->qp_type = real_qp->qp_type;
+
+	spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+	list_add(&qp->open_list, &real_qp->open_list);
+	spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+	return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+			 struct ib_qp_open_attr *qp_open_attr)
+{
+	struct ib_qp *qp, *real_qp;
+
+	if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+		return ERR_PTR(-EINVAL);
+
+	qp = ERR_PTR(-EINVAL);
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+		if (real_qp->qp_num == qp_open_attr->qp_num) {
+			qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+					  qp_open_attr->qp_context);
+			break;
+		}
+	}
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+	return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
 struct ib_qp *ib_create_qp(struct ib_pd *pd,
 			   struct ib_qp_init_attr *qp_init_attr)
 {
-	struct ib_qp *qp;
+	struct ib_qp *qp, *real_qp;
+	struct ib_device *device;
 
-	qp = pd->device->create_qp(pd, qp_init_attr, NULL);
+	device = pd ?
pd->device : qp_init_attr->xrcd->device; + qp = device->create_qp(pd, qp_init_attr, NULL); if (!IS_ERR(qp)) { - qp->device = pd->device; - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->recv_cq = qp_init_attr->recv_cq; - qp->srq = qp_init_attr->srq; + qp->device = device; + qp->real_qp = qp; qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + + atomic_set(&qp->usecnt, 0); + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + real_qp = qp; + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + } else { qp->event_handler = qp_init_attr->event_handler; qp->qp_context = qp_init_attr->qp_context; - qp->qp_type = qp_init_attr->qp_type; - qp->xrcd = qp->qp_type == IB_QPT_XRC ? - qp_init_attr->xrcd : NULL; - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; atomic_inc(&qp_init_attr->recv_cq->usecnt); - if (qp_init_attr->srq) + qp->srq = qp_init_attr->srq; + if (qp->srq) atomic_inc(&qp_init_attr->srq->usecnt); - if (qp->qp_type == IB_QPT_XRC) - atomic_inc(&qp->xrcd->usecnt); + } + + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; + + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); + } } return qp; @@ -371,8 +509,10 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; - enum ib_qp_attr_mask req_param[IB_QPT_RAW_PACKET + 1]; - enum ib_qp_attr_mask opt_param[IB_QPT_RAW_PACKET + 1]; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -389,13 +529,24 @@ static const struct { [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC] = (IB_QP_PKEY_INDEX | + [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS | + IB_QP_DC_KEY), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + }, + .opt_param = { + [IB_QPT_UD] = IB_QP_GROUP_RSS, + [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS } }, }, @@ -414,7 +565,13 @@ static const struct { [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC] = (IB_QP_PKEY_INDEX | + [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | @@ -436,13 +593,26 @@ static const struct { IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), - [IB_QPT_XRC] = (IB_QP_AV | + [IB_QPT_DC_INI] = (IB_QP_PATH_MTU | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + 
IB_QP_RQ_PSN), + [IB_QPT_XRC_TGT] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, + .req_param_add_eth = { + [IB_QPT_RC] = (IB_QP_SMAC), + [IB_QPT_UC] = (IB_QP_SMAC), + [IB_QPT_XRC_INI] = (IB_QP_SMAC), + [IB_QPT_XRC_TGT] = (IB_QP_SMAC) + }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), @@ -452,13 +622,34 @@ static const struct { [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), - [IB_QPT_XRC] = (IB_QP_ALT_PATH | + [IB_QPT_DC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_AV, + }, + .opt_param_add_eth = { + [IB_QPT_RC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_UC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID) } } }, @@ -475,11 +666,17 @@ static const struct { IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_XRC] = (IB_QP_TIMEOUT | + [IB_QPT_DC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), [IB_QPT_SMI] = IB_QP_SQ_PSN, [IB_QPT_GSI] = IB_QP_SQ_PSN, }, @@ -495,7 +692,16 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC] = (IB_QP_CUR_STATE | + [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | @@ -524,7 +730,16 @@ static const struct { IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), - [IB_QPT_XRC] = (IB_QP_CUR_STATE | + [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | @@ -541,7 +756,8 @@ static const struct { [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_XRC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? 
*/ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY } @@ -564,7 +780,11 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC] = (IB_QP_CUR_STATE | + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | @@ -597,12 +817,19 @@ static const struct { IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC] = (IB_QP_PORT | + [IB_QPT_XRC_INI] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | @@ -640,7 +867,8 @@ static const struct { }; int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask) + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) { enum ib_qp_attr_mask req_param, opt_param; @@ -659,6 +887,13 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + if (ll == IB_LINK_LAYER_ETHERNET) { + req_param |= qp_state_table[cur_state][next_state]. + req_param_add_eth[type]; + opt_param |= qp_state_table[cur_state][next_state]. + opt_param_add_eth[type]; + } + if ((mask & req_param) != req_param) return 0; @@ -673,7 +908,13 @@ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { - return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL); + int ret; + + ret = qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); + if (!ret && (qp_attr_mask & IB_QP_PORT)) + qp->port_num = qp_attr->port_num; + + return ret; } EXPORT_SYMBOL(ib_modify_qp); @@ -683,35 +924,87 @@ int ib_query_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr) { return qp->device->query_qp ? 
- qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) : + qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : -ENOSYS; } EXPORT_SYMBOL(ib_query_qp); +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + atomic_dec(&real_qp->usecnt); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + + mutex_lock(&xrcd->tgt_qp_mutex); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + list_del(&real_qp->xrcd_list); + else + real_qp = NULL; + mutex_unlock(&xrcd->tgt_qp_mutex); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + else + __ib_insert_xrcd_qp(xrcd, real_qp); + } + + return 0; +} + int ib_destroy_qp(struct ib_qp *qp) { struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; - struct ib_xrcd *xrcd; - enum ib_qp_type qp_type = qp->qp_type; int ret; + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + pd = qp->pd; scq = qp->send_cq; rcq = qp->recv_cq; srq = qp->srq; - xrcd = qp->xrcd; ret = qp->device->destroy_qp(qp); if (!ret) { + if (pd) atomic_dec(&pd->usecnt); + if (scq) atomic_dec(&scq->usecnt); + if (rcq) atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); - if (qp_type == IB_QPT_XRC) - atomic_dec(&xrcd->usecnt); } return ret; @@ -726,8 +1019,13 @@ struct ib_cq *ib_create_cq(struct ib_device *device, void *cq_context, int cqe, int comp_vector) { struct ib_cq *cq; + struct ib_cq_init_attr attr = { + .cqe = cqe, + .comp_vector = comp_vector, + .flags = 0, + }; - cq = device->create_cq(device, cqe, comp_vector, NULL, NULL); + cq = device->create_cq(device, &attr, NULL, NULL); if (!IS_ERR(cq)) { cq->device = device; @@ -742,10 +1040,12 @@ struct ib_cq *ib_create_cq(struct ib_device *device, } EXPORT_SYMBOL(ib_create_cq); -int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +int ib_modify_cq(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask) { return cq->device->modify_cq ? 
- cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS; + cq->device->modify_cq(cq, cq_attr, cq_attr_mask) : -ENOSYS; } EXPORT_SYMBOL(ib_modify_cq); @@ -770,6 +1070,11 @@ EXPORT_SYMBOL(ib_resize_cq); struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); mr = pd->device->get_dma_mr(pd, mr_access_flags); @@ -792,6 +1097,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, u64 *iova_start) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); if (!pd->device->reg_phys_mr) return ERR_PTR(-ENOSYS); @@ -822,6 +1132,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr, struct ib_pd *old_pd; int ret; + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + if (!mr->device->rereg_phys_mr) return -ENOSYS; @@ -867,6 +1181,45 @@ int ib_dereg_mr(struct ib_mr *mr) } EXPORT_SYMBOL(ib_dereg_mr); +struct ib_mr *ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct ib_mr *mr; + + if (!pd->device->create_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->create_mr(pd, mr_init_attr); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_create_mr); + +int ib_destroy_mr(struct ib_mr *mr) +{ + struct ib_pd *pd; + int ret; + + if (atomic_read(&mr->usecnt)) + return -EBUSY; + + pd = mr->pd; + ret = mr->device->destroy_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_mr); + struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { struct ib_mr *mr; @@ -915,18 +1268,19 @@ EXPORT_SYMBOL(ib_free_fast_reg_page_list); /* Memory windows */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd) +struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct ib_mw *mw; if (!pd->device->alloc_mw) return ERR_PTR(-ENOSYS); - mw = pd->device->alloc_mw(pd); + mw = pd->device->alloc_mw(pd, type); if (!IS_ERR(mw)) { mw->device = pd->device; mw->pd = pd; mw->uobject = NULL; + mw->type = type; atomic_inc(&pd->usecnt); } @@ -1000,59 +1354,59 @@ EXPORT_SYMBOL(ib_dealloc_fmr); int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { + int ret; + if (!qp->device->attach_mcast) return -ENOSYS; switch (rdma_node_get_transport(qp->device->node_type)) { case RDMA_TRANSPORT_IB: - if (qp->qp_type == IB_QPT_RAW_PACKET) { - /* In raw Etherent mgids the 63 msb's should be 0 */ - if (gid->global.subnet_prefix & cpu_to_be64(~1ULL)) - return -EINVAL; - } else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && + qp->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: if (qp->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; break; } - return qp->device->attach_mcast(qp, gid, lid); + + ret = qp->device->attach_mcast(qp, gid, lid); + if (!ret) + atomic_inc(&qp->usecnt); + return ret; } EXPORT_SYMBOL(ib_attach_mcast); int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { + int ret; + if (!qp->device->detach_mcast) return -ENOSYS; switch (rdma_node_get_transport(qp->device->node_type)) { case RDMA_TRANSPORT_IB: - if (qp->qp_type == IB_QPT_RAW_PACKET) { - /* In raw Etherent mgids the 63 msb's should be 0 */ - if (gid->global.subnet_prefix & cpu_to_be64(~1ULL)) - return -EINVAL; - } else if (gid->raw[0] != 
0xff || qp->qp_type != IB_QPT_UD) + if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && + qp->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; break; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_SCIF: + if (qp->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; break; } - return qp->device->detach_mcast(qp, gid, lid); + + ret = qp->device->detach_mcast(qp, gid, lid); + if (!ret) + atomic_dec(&qp->usecnt); + return ret; } EXPORT_SYMBOL(ib_detach_mcast); -int ib_dealloc_xrcd(struct ib_xrcd *xrcd) -{ - if (atomic_read(&xrcd->usecnt)) - return -EBUSY; - - return xrcd->device->dealloc_xrcd(xrcd); -} -EXPORT_SYMBOL(ib_dealloc_xrcd); - struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) { struct ib_xrcd *xrcd; @@ -1064,10 +1418,119 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) if (!IS_ERR(xrcd)) { xrcd->device = device; xrcd->inode = NULL; - xrcd->uobject = NULL; atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); } + return xrcd; } EXPORT_SYMBOL(ib_alloc_xrcd); +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct ib_qp *qp; + int ret; + + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + while (!list_empty(&xrcd->tgt_qp_list)) { + qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); + ret = ib_destroy_qp(qp); + if (ret) + return ret; + } + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct ib_flow *flow_id; + if (!qp->device->create_flow) + return ERR_PTR(-ENOSYS); + + flow_id = qp->device->create_flow(qp, flow_attr, domain); + if (!IS_ERR(flow_id)) + atomic_inc(&qp->usecnt); + return flow_id; +} +EXPORT_SYMBOL(ib_create_flow); + +int ib_destroy_flow(struct ib_flow *flow_id) +{ + int err; + struct ib_qp *qp; + + if (!flow_id) + return -EINVAL; + qp = flow_id->qp; + if (!qp->device->destroy_flow) + return -ENOSYS; + err = qp->device->destroy_flow(flow_id); + if (!err) + atomic_dec(&qp->usecnt); + return err; +} +EXPORT_SYMBOL(ib_destroy_flow); + +struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr, + struct ib_udata *udata) +{ + struct ib_dct *dct; + + if (!pd->device->exp_create_dct) + return ERR_PTR(-ENOSYS); + + dct = pd->device->exp_create_dct(pd, attr, udata); + if (!IS_ERR(dct)) { + dct->pd = pd; + dct->srq = attr->srq; + dct->cq = attr->cq; + atomic_inc(&dct->srq->usecnt); + atomic_inc(&dct->cq->usecnt); + atomic_inc(&dct->pd->usecnt); + } + + return dct; +} +EXPORT_SYMBOL(ib_create_dct); + +int ib_destroy_dct(struct ib_dct *dct) +{ + int err; + + if (!dct->device->exp_destroy_dct) + return -ENOSYS; + + err = dct->device->exp_destroy_dct(dct); + if (!err) { + atomic_dec(&dct->srq->usecnt); + atomic_dec(&dct->cq->usecnt); + atomic_dec(&dct->pd->usecnt); + } + + return err; +} +EXPORT_SYMBOL(ib_destroy_dct); + +int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr) +{ + if (!dct->device->exp_query_dct) + return -ENOSYS; + + return dct->device->exp_query_dct(dct, attr); +} +EXPORT_SYMBOL(ib_query_dct); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? 
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; +} +EXPORT_SYMBOL(ib_check_mr_status); diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.c b/sys/ofed/drivers/infiniband/debug/memtrack.c index 199b33b7709d..7082856680b9 100644 --- a/sys/ofed/drivers/infiniband/debug/memtrack.c +++ b/sys/ofed/drivers/infiniband/debug/memtrack.c @@ -24,12 +24,21 @@ #ifdef kmalloc #undef kmalloc #endif +#ifdef kmemdup + #undef kmemdup +#endif #ifdef kfree #undef kfree #endif #ifdef vmalloc #undef vmalloc #endif +#ifdef vzalloc + #undef vzalloc +#endif +#ifdef vzalloc_node + #undef vzalloc_node +#endif #ifdef vfree #undef vfree #endif @@ -39,16 +48,59 @@ #ifdef kmem_cache_free #undef kmem_cache_free #endif +#ifdef ioremap + #undef ioremap +#endif +#ifdef io_mapping_create_wc + #undef io_mapping_create_wc +#endif +#ifdef io_mapping_free + #undef io_mapping_free +#endif +#ifdef ioremap_nocache + #undef ioremap_nocache +#endif +#ifdef iounmap + #undef iounmap +#endif +#ifdef alloc_pages + #undef alloc_pages +#endif +#ifdef free_pages + #undef free_pages +#endif +#ifdef get_page + #undef get_page +#endif +#ifdef put_page + #undef put_page +#endif +#ifdef create_workqueue + #undef create_workqueue +#endif +#ifdef create_rt_workqueue + #undef create_rt_workqueue +#endif +#ifdef create_freezeable_workqueue + #undef create_freezeable_workqueue +#endif +#ifdef create_singlethread_workqueue + #undef create_singlethread_workqueue +#endif +#ifdef destroy_workqueue + #undef destroy_workqueue +#endif #include #include #include #include #include -#include +#include #include #include -#include +#include +#include "memtrack.h" #include @@ -67,7 +119,7 @@ MODULE_LICENSE("GPL"); bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */ static unsigned long track_mask = -1; /* effectively everything */ module_param(track_mask, ulong, 0444); -MODULE_PARM_DESC(track_mask, "bitmask definenig what is tracked"); +MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked"); /* if a bit is set then the corresponding allocation is strictly tracked. That is, before inserting the whole range is checked to not overlap any @@ -76,59 +128,95 @@ static unsigned long strict_track_mask = 0; /* no strict tracking */ module_param(strict_track_mask, ulong, 0444); MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking"); -typedef struct memtrack_meminfo_st { +/* Sets the frequency of allocations failures injections + if set to 0 all allocation should succeed */ +static unsigned int inject_freq = 0; +module_param(inject_freq, uint, 0644); +MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)"); + +static int random_mem = 1; +module_param(random_mem, uint, 0644); +MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)"); + +struct memtrack_meminfo_t { unsigned long addr; unsigned long size; unsigned long line_num; - struct memtrack_meminfo_st *next; + unsigned long dev; + unsigned long addr2; + int direction; + struct memtrack_meminfo_t *next; struct list_head list; /* used to link all items from a certain type together */ char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. 
packing */ -} memtrack_meminfo_t; + char ext_info[32]; +}; static struct kmem_cache *meminfo_cache; -typedef struct { - memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ]; +struct tracked_obj_desc_t { + struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ]; spinlock_t hash_lock; unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */ struct list_head tracked_objs_head; /* head of list of all objects */ int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */ -} tracked_obj_desc_t; +}; -static tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES]; +static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES]; static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = { "kmalloc", "vmalloc", - "kmem_cache_alloc" + "kmem_cache_alloc", + "io_remap", + "create_workqueue", + "alloc_pages", + "ib_dma_map_single", + "ib_dma_map_page", + "ib_dma_map_sg" }; - static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = { "kfree", "vfree", - "kmem_cache_free" + "kmem_cache_free", + "io_unmap", + "destory_workqueue", + "free_pages", + "ib_dma_unmap_single", + "ib_dma_unmap_page", + "ib_dma_unmap_sg" }; - -static inline const char *memtype_alloc_str(memtrack_memtype_t memtype) +static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype) { switch (memtype) { - case MEMTRACK_KMALLOC: - case MEMTRACK_VMALLOC: - case MEMTRACK_KMEM_OBJ: + case MEMTRACK_KMALLOC: + case MEMTRACK_VMALLOC: + case MEMTRACK_KMEM_OBJ: + case MEMTRACK_IOREMAP: + case MEMTRACK_WORK_QUEUE: + case MEMTRACK_PAGE_ALLOC: + case MEMTRACK_DMA_MAP_SINGLE: + case MEMTRACK_DMA_MAP_PAGE: + case MEMTRACK_DMA_MAP_SG: return rsc_names[memtype]; default: return "(Unknown allocation type)"; } } -static inline const char *memtype_free_str(memtrack_memtype_t memtype) +static inline const char *memtype_free_str(enum memtrack_memtype_t memtype) { switch (memtype) { - case MEMTRACK_KMALLOC: - case MEMTRACK_VMALLOC: - case MEMTRACK_KMEM_OBJ: + case MEMTRACK_KMALLOC: + case MEMTRACK_VMALLOC: + case MEMTRACK_KMEM_OBJ: + case MEMTRACK_IOREMAP: + case MEMTRACK_WORK_QUEUE: + case MEMTRACK_PAGE_ALLOC: + case MEMTRACK_DMA_MAP_SINGLE: + case MEMTRACK_DMA_MAP_PAGE: + case MEMTRACK_DMA_MAP_SG: return rsc_free_names[memtype]; default: return "(Unknown allocation type)"; @@ -138,56 +226,56 @@ static inline const char *memtype_free_str(memtrack_memtype_t memtype) /* * overlap_a_b */ -static int overlap_a_b(unsigned long a_start, unsigned long a_end, +static inline int overlap_a_b(unsigned long a_start, unsigned long a_end, unsigned long b_start, unsigned long b_end) { - if ((b_start > a_end) || (a_start > b_end)) { + if ((b_start > a_end) || (a_start > b_end)) return 0; - } + return 1; } /* * check_overlap */ -static void check_overlap(memtrack_memtype_t memtype, - memtrack_meminfo_t * mem_info_p, - tracked_obj_desc_t * obj_desc_p) +static void check_overlap(enum memtrack_memtype_t memtype, + struct memtrack_meminfo_t *mem_info_p, + struct tracked_obj_desc_t *obj_desc_p) { struct list_head *pos, *next; - memtrack_meminfo_t *cur; + struct memtrack_meminfo_t *cur; unsigned long start_a, end_a, start_b, end_b; - list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) { - cur = list_entry(pos, memtrack_meminfo_t, list); - start_a = mem_info_p->addr; end_a = mem_info_p->addr + mem_info_p->size - 1; + + list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) { + cur = list_entry(pos, struct memtrack_meminfo_t, list); + start_b = 
cur->addr; end_b = cur->addr + cur->size - 1; - if (overlap_a_b(start_a, end_a, start_b, end_b)) { - printk - ("%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n", + if (overlap_a_b(start_a, end_a, start_b, end_b)) + printk(KERN_ERR "%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n", memtype_alloc_str(memtype), mem_info_p->addr, mem_info_p->addr + mem_info_p->size - 1, cur->addr, cur->addr + cur->size - 1); } - } } /* Invoke on memory allocation */ -void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr, - unsigned long size, const char *filename, +void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, unsigned long addr2, + int direction, const char *filename, const unsigned long line_num, int alloc_flags) { unsigned long hash_val; - memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p; - tracked_obj_desc_t *obj_desc_p; + struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk("%s: Invalid memory type (%d)\n", __func__, memtype); + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); return; } @@ -199,11 +287,9 @@ void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr, hash_val = addr % MEMTRACK_HASH_SZ; - new_mem_info_p = (memtrack_meminfo_t *) - kmem_cache_alloc(meminfo_cache, alloc_flags); + new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags); if (new_mem_info_p == NULL) { - printk - ("%s: Failed allocating kmem_cache item for new mem_info. " + printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. " "Lost tracking on allocation at %s:%lu...\n", __func__, filename, line_num); return; @@ -211,26 +297,34 @@ void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr, /* save allocation properties */ new_mem_info_p->addr = addr; new_mem_info_p->size = size; + new_mem_info_p->dev = dev; + new_mem_info_p->addr2 = addr2; + new_mem_info_p->direction = direction; + new_mem_info_p->line_num = line_num; + *new_mem_info_p->ext_info = '\0'; /* Make sure that we will print out the path tail if the given filename is longer * than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file * in the printout -- only the path head! 
*/ - if (strlen(filename) > MAX_FILENAME_LEN) { + if (strlen(filename) > MAX_FILENAME_LEN) strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN); - } else { + else strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN); - } + new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */ memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* make sure given memory location is not already allocated */ + if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) && + (memtype != MEMTRACK_DMA_MAP_SG)) { + + /* make sure given memory location is not already allocated */ cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { + if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { /* Found given address in the database */ - printk - ("mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n", + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n", __func__, filename, line_num, memtype_alloc_str(memtype), addr, cur_mem_info_p->filename, @@ -241,31 +335,33 @@ void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr, } cur_mem_info_p = cur_mem_info_p->next; } + } /* not found - we can put in the hash bucket */ /* link as first */ new_mem_info_p->next = obj_desc_p->mem_hash[hash_val]; obj_desc_p->mem_hash[hash_val] = new_mem_info_p; - if (obj_desc_p->strict_track) { + if (obj_desc_p->strict_track) check_overlap(memtype, new_mem_info_p, obj_desc_p); - } obj_desc_p->count += size; list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return; } +EXPORT_SYMBOL(memtrack_alloc); /* Invoke on memory free */ -void memtrack_free(memtrack_memtype_t memtype, unsigned long addr, +void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, int direction, const char *filename, const unsigned long line_num) { unsigned long hash_val; - memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p; - tracked_obj_desc_t *obj_desc_p; + struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk("%s: Invalid memory type (%d)\n", __func__, memtype); + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); return; } @@ -282,13 +378,27 @@ void memtrack_free(memtrack_memtype_t memtype, unsigned long addr, prev_mem_info_p = NULL; cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - remove from the bucket/list */ - if (prev_mem_info_p == NULL) { - obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */ - } else { - prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */ + if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { + /* Found given address in the database */ + if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) || + (memtype == MEMTRACK_DMA_MAP_SG)) { + if (direction != cur_mem_info_p->direction) + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n", + __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction, + 
cur_mem_info_p->filename, cur_mem_info_p->line_num); + + if (size != cur_mem_info_p->size) + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n", + __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size, + cur_mem_info_p->filename, cur_mem_info_p->line_num); } + + /* Remove from the bucket/list */ + if (prev_mem_info_p == NULL) + obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */ + else + prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */ + list_del(&cur_mem_info_p->list); obj_desc_p->count -= cur_mem_info_p->size; @@ -301,64 +411,317 @@ void memtrack_free(memtrack_memtype_t memtype, unsigned long addr, } /* not found */ - printk - ("mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n", - __func__, filename, line_num, memtype_free_str(memtype), addr); + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n", + __func__, filename, line_num, memtype_free_str(memtype), addr, dev); memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); return; } +EXPORT_SYMBOL(memtrack_free); + +/* + * This function recognizes allocations which + * may be released by kernel (e.g. skb) and + * therefore not trackable by memtrack. + * The allocations are recognized by the name + * of their calling function. + */ +int is_non_trackable_alloc_func(const char *func_name) +{ + static const char * const str_str_arr[] = { + /* functions containing these strings consider non trackable */ + "skb", + }; + static const char * const str_str_excep_arr[] = { + /* functions which are exception to the str_str_arr table */ + "ipoib_cm_skb_too_long" + }; + static const char * const str_cmp_arr[] = { + /* functions that allocate SKBs */ + "mlx4_en_alloc_frags", + "mlx4_en_alloc_frag", + "mlx4_en_init_allocator", + "mlx4_en_free_frag", + "mlx4_en_free_rx_desc", + "mlx4_en_destroy_allocator", + "mlx4_en_complete_rx_desc", + /* vnic skb functions */ + "free_single_frag", + "vnic_alloc_rx_skb", + "vnic_rx_skb", + "vnic_alloc_frag", + "vnic_empty_rx_entry", + "vnic_init_allocator", + "vnic_destroy_allocator", + "sdp_post_recv", + "sdp_rx_ring_purge", + "sdp_post_srcavail", + "sk_stream_alloc_page", + "update_send_head", + "sdp_bcopy_get", + "sdp_destroy_resources", + + /* function that allocate memory for RDMA device context */ + "ib_alloc_device" + }; + size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *); + size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *); + size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); + + int i, j; + + for (i = 0; i < str_str_arr_size; ++i) + if (strstr(func_name, str_str_arr[i])) { + for (j = 0; j < str_str_excep_size; ++j) + if (!strcmp(func_name, str_str_excep_arr[j])) + return 0; + return 1; + } + for (i = 0; i < str_cmp_arr_size; ++i) + if (!strcmp(func_name, str_cmp_arr[i])) + return 1; + return 0; +} +EXPORT_SYMBOL(is_non_trackable_alloc_func); + +/* + * In some cases we need to free a memory + * we defined as "non trackable" (see + * is_non_trackable_alloc_func). + * This function recognizes such releases + * by the name of their calling function. 
+ */ +int is_non_trackable_free_func(const char *func_name) +{ + + static const char * const str_cmp_arr[] = { + /* function that deallocate memory for RDMA device context */ + "ib_dealloc_device" + }; + size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); + + int i; + + for (i = 0; i < str_cmp_arr_size; ++i) + if (!strcmp(func_name, str_cmp_arr[i])) + return 1; + return 0; +} +EXPORT_SYMBOL(is_non_trackable_free_func); + + +/* WA - In this function handles confirm + the the function name is + '__ib_umem_release' or 'ib_umem_get' + In this case we won't track the + memory there because the kernel + was the one who allocated it. + Return value: + 1 - if the function name is match, else 0 */ +int is_umem_put_page(const char *func_name) +{ + const char func_str[18] = "__ib_umem_release"; + /* In case of error flow put_page is called as part of ib_umem_get */ + const char func_str1[12] = "ib_umem_get"; + + return ((strstr(func_name, func_str) != NULL) || + (strstr(func_name, func_str1) != NULL)) ? 1 : 0; +} +EXPORT_SYMBOL(is_umem_put_page); + +/* Check page order size + When Freeing a page allocation it checks whether + we are trying to free the same size + we asked to allocate */ +int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, + unsigned long size, const char *filename, + const unsigned long line_num) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + int ret = 0; + + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); + return 1; + } + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return 1; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if (cur_mem_info_p->addr == addr) { + /* Found given address in the database - check size */ + if (cur_mem_info_p->size != size) { + printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while was created with size %lu\n", + __func__, filename, line_num, memtype_free_str(memtype), + addr, size, cur_mem_info_p->size); + snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info), + "invalid free size %lu\n", size); + ret = 1; + } + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return ret; + } + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found - This function will not give any indication + but will only check the correct size\order + For inconsistency the 'free' function will check that */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return 1; +} +EXPORT_SYMBOL(memtrack_check_size); + +/* Search for a specific addr whether it exist in the + current data-base. 
+ It will print an error msg if we get an unexpected result, + Return value: 0 - if addr exist, else 1 */ +int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, + const char *filename, const unsigned long line_num) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); + return 1; + } + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return 0; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if (cur_mem_info_p->addr == addr) { + /* Found given address in the database - exiting */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return 0; + } + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found */ + if (expect_exist) + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n", + __func__, filename, line_num, memtype_free_str(memtype), addr); + + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return 1; +} +EXPORT_SYMBOL(memtrack_is_new_addr); + +/* Return current page reference counter */ +int memtrack_get_page_ref_count(unsigned long addr) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + /* This function is called only for page allocation */ + enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC; + int ref_conut = 0; + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return ref_conut; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if (cur_mem_info_p->addr == addr) { + /* Found given address in the database - check ref-count */ + struct page *page = (struct page *)(cur_mem_info_p->addr); + ref_conut = atomic_read(&page->_count); + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return ref_conut; + } + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return ref_conut; +} +EXPORT_SYMBOL(memtrack_get_page_ref_count); /* Report current allocations status (for all memory types) */ static void memtrack_report(void) { - memtrack_memtype_t memtype; + enum memtrack_memtype_t memtype; unsigned long cur_bucket; - memtrack_meminfo_t *cur_mem_info_p; + struct memtrack_meminfo_t *cur_mem_info_p; int serial = 1; - tracked_obj_desc_t *obj_desc_p; + struct tracked_obj_desc_t *obj_desc_p; unsigned long flags; + unsigned long detected_leaks = 0; - printk("%s: Currently known allocations:\n", __func__); + printk(KERN_INFO "%s: Currently known allocations:\n", __func__); for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { if (tracked_objs_arr[memtype]) { - printk("%d) %s:\n", serial, memtype_alloc_str(memtype)); + printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype)); obj_desc_p = tracked_objs_arr[memtype]; /* Scan all buckets to find existing allocations */ /* TBD: this may be optimized by holding a linked list of all hash items */ - for 
(cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; - cur_bucket++) { + for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ - cur_mem_info_p = - obj_desc_p->mem_hash[cur_bucket]; + cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; while (cur_mem_info_p != NULL) { /* scan bucket */ - printk("%s::%lu: %s(%lu)==%lX\n", + printk(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n", cur_mem_info_p->filename, cur_mem_info_p->line_num, memtype_alloc_str(memtype), cur_mem_info_p->size, - cur_mem_info_p->addr); + cur_mem_info_p->addr, + cur_mem_info_p->dev, + cur_mem_info_p->ext_info); cur_mem_info_p = cur_mem_info_p->next; + ++ detected_leaks; } /* while cur_mem_info_p */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); } /* for cur_bucket */ serial++; } } /* for memtype */ + printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks); } static struct proc_dir_entry *memtrack_tree; -static memtrack_memtype_t get_rsc_by_name(const char *name) +static enum memtrack_memtype_t get_rsc_by_name(const char *name) { - memtrack_memtype_t i; + enum memtrack_memtype_t i; - for (i=0; if_dentry->d_name.name; + fname = filp->f_dentry->d_name.name; - memtype= get_rsc_by_name(fname); + memtype = get_rsc_by_name(fname); if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk("invalid file name\n"); + printk(KERN_ERR "invalid file name\n"); return -EINVAL; } - if ( pos == 0 ) { + if (pos == 0) { memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags); - cur= tracked_objs_arr[memtype]->count; + cur = tracked_objs_arr[memtype]->count; memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags); _read = sprintf(kbuf, "%lu\n", cur); - if ( _read < 0 ) { + if (_read < 0) return _read; - } - else { + else file_len = _read; } - } left = file_len - pos; to_ret = (left < size) ? 
left : size; - if ( copy_to_user(buf, kbuf+pos, to_ret) ) { + if (copy_to_user(buf, kbuf+pos, to_ret)) return -EFAULT; - } else { *offset = pos + to_ret; return to_ret; } } -static struct file_operations memtrack_proc_fops = { +static const struct file_operations memtrack_proc_fops = { .read = memtrack_read, }; @@ -426,30 +786,28 @@ static int create_procfs_tree(void) unsigned long bit_mask; dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL); - if ( !dir_ent ) { + if (!dir_ent) return -1; - } memtrack_tree = dir_ent; - for (i=0, bit_mask=1; iproc_fops = &memtrack_proc_fops; + proc_ent->proc_fops = &memtrack_proc_fops; } } goto exit_ok; undo_create_root: - for (j=0, bit_mask=1; jhash_lock); INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head); - if (bit_mask & strict_track_mask) { + if (bit_mask & strict_track_mask) tracked_objs_arr[i]->strict_track = 1; - } else { + else tracked_objs_arr[i]->strict_track = 0; } } - } - if ( create_procfs_tree() ) { - printk("%s: create_procfs_tree() failed\n", __FILE__); + if (create_procfs_tree()) { + printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__); goto undo_cache_create; } - - printk("memtrack::%s done.\n", __func__); + printk(KERN_INFO "memtrack::%s done.\n", __func__); return 0; undo_cache_create: - for (j=0; jhash_lock, flags); /* protect per bucket/list */ - cur_mem_info_p = - obj_desc_p->mem_hash[cur_bucket]; + cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; while (cur_mem_info_p != NULL) { /* scan bucket */ next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */ - kmem_cache_free(meminfo_cache, - cur_mem_info_p); + kmem_cache_free(meminfo_cache, cur_mem_info_p); cur_mem_info_p = next_mem_info_p; } /* while cur_mem_info_p */ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); @@ -581,20 +948,11 @@ void cleanup_module(void) } } /* for memtype */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) - if (kmem_cache_destroy(meminfo_cache) != 0) { - printk - ("memtrack::cleanup_module: Failed on kmem_cache_destroy !\n"); - } +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) + if (kmem_cache_destroy(meminfo_cache) != 0) + printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n"); #else kmem_cache_destroy(meminfo_cache); #endif - printk("memtrack::cleanup_module done.\n"); + printk(KERN_INFO "memtrack::cleanup_module done.\n"); } - -EXPORT_SYMBOL(memtrack_alloc); -EXPORT_SYMBOL(memtrack_free); - -//module_init(memtrack_init) -//module_exit(memtrack_exit) - diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.h b/sys/ofed/drivers/infiniband/debug/memtrack.h index e443a314b9f5..76265ae06a18 100644 --- a/sys/ofed/drivers/infiniband/debug/memtrack.h +++ b/sys/ofed/drivers/infiniband/debug/memtrack.h @@ -22,24 +22,85 @@ #ifndef H_MEMTRACK_H #define H_MEMTRACK_H -typedef enum { +enum memtrack_memtype_t { MEMTRACK_KMALLOC, MEMTRACK_VMALLOC, MEMTRACK_KMEM_OBJ, + MEMTRACK_IOREMAP, /* IO-RE/UN-MAP */ + MEMTRACK_WORK_QUEUE, /* Handle work-queue create & destroy */ + MEMTRACK_PAGE_ALLOC, /* Handle page allocation and free */ + MEMTRACK_DMA_MAP_SINGLE,/* Handle ib_dma_single map and unmap */ + MEMTRACK_DMA_MAP_PAGE, /* Handle ib_dma_page map and unmap */ + MEMTRACK_DMA_MAP_SG, /* Handle ib_dma_sg map and unmap with and without attributes */ MEMTRACK_NUM_OF_MEMTYPES -} memtrack_memtype_t; +}; /* Invoke on memory allocation */ -void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr, - unsigned long size, const char *filename, +void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned 
long dev, + unsigned long addr, unsigned long size, unsigned long addr2, + int direction, const char *filename, const unsigned long line_num, int alloc_flags); /* Invoke on memory free */ -void memtrack_free(memtrack_memtype_t memtype, unsigned long addr, +void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, int direction, const char *filename, const unsigned long line_num); +/* + * This function recognizes allocations which + * may be released by kernel (e.g. skb & vnic) and + * therefore not trackable by memtrack. + * The allocations are recognized by the name + * of their calling function. + */ +int is_non_trackable_alloc_func(const char *func_name); +/* + * In some cases we need to free a memory + * we defined as "non trackable" (see + * is_non_trackable_alloc_func). + * This function recognizes such releases + * by the name of their calling function. + */ +int is_non_trackable_free_func(const char *func_name); + +/* WA - In this function handles confirm + the the function name is + '__ib_umem_release' or 'ib_umem_get' + In this case we won't track the + memory there because the kernel + was the one who allocated it. + Return value: + 1 - if the function name is match, else 0 */ +int is_umem_put_page(const char *func_name); + +/* Check page order size + When Freeing a page allocation it checks whether + we are trying to free the same amount of pages + we ask to allocate (In log2(order)). + In case an error if found it will print + an error msg */ +int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, + unsigned long size, const char *filename, + const unsigned long line_num); + +/* Search for a specific addr whether it exist in the + current data-base. + If not it will print an error msg, + Return value: 0 - if addr exist, else 1 */ +int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, + const char *filename, const unsigned long line_num); + +/* Return current page reference counter */ +int memtrack_get_page_ref_count(unsigned long addr); + /* Report current allocations status (for all memory types) */ /* we do not export this function since it is used by cleanup_module only */ /* void memtrack_report(void); */ +/* Allow support of error injections */ +int memtrack_inject_error(void); + +/* randomize allocated memory */ +int memtrack_randomize_mem(void); + #endif diff --git a/sys/ofed/drivers/infiniband/debug/mtrack.h b/sys/ofed/drivers/infiniband/debug/mtrack.h index 337d9c3986a0..5c0cd20110be 100644 --- a/sys/ofed/drivers/infiniband/debug/mtrack.h +++ b/sys/ofed/drivers/infiniband/debug/mtrack.h @@ -1,46 +1,84 @@ #ifndef __mtrack_h_ #define __mtrack_h_ -#include +#include "memtrack.h" #include #include -#include +#include +#include /* For ioremap_nocache, ioremap, iounmap */ +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 27) +# include /* For ioremap_nocache, ioremap, iounmap */ +#endif +#include /* For all page handling */ +#include /* For all work-queue handling */ +#include /* For using scatterlists */ +#include /* For skbufs handling */ +#include /* For copy from/to user */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +#define MEMTRACK_ERROR_INJECTION_MESSAGE(file, line, func) ({ \ + printk(KERN_ERR "%s failure injected at %s:%d\n", func, file, line); \ + dump_stack(); \ +}) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 14) #define RDMA_KZALLOC_H #define kzalloc(size, flags) ({ \ - void *__memtrack_kz_addr; \ + void *__memtrack_kz_addr = 
NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\ + else \ __memtrack_kz_addr = kmalloc(size, flags); \ - if ( __memtrack_kz_addr ) { \ - memset( __memtrack_kz_addr, 0, size) ; \ + if (__memtrack_kz_addr && !is_non_trackable_alloc_func(__func__)) { \ + memset(__memtrack_kz_addr, 0, size); \ } \ __memtrack_kz_addr; \ }) #else #define kzalloc(size, flags) ({ \ - void *__memtrack_addr; \ + void *__memtrack_addr = NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\ + else \ __memtrack_addr = kzalloc(size, flags); \ - if ( __memtrack_addr && (size)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, flags); \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ } \ __memtrack_addr; \ }) #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +#define kzalloc_node(size, flags, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc_node"); \ + else \ + __memtrack_addr = kzalloc_node(size, flags, node); \ + if (__memtrack_addr && (size) && \ + !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) #define kcalloc(n, size, flags) kzalloc((n)*(size), flags) #else #define kcalloc(n, size, flags) ({ \ - void *__memtrack_addr; \ + void *__memtrack_addr = NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kcalloc");\ + else \ __memtrack_addr = kcalloc(n, size, flags); \ - if ( __memtrack_addr && (size)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), (n)*(size), __FILE__, __LINE__, flags); \ + if (__memtrack_addr && (size)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \ } \ __memtrack_addr; \ }) @@ -50,76 +88,208 @@ #ifdef ZERO_OR_NULL_PTR #define kmalloc(sz, flgs) ({ \ - void *__memtrack_addr; \ + void *__memtrack_addr = NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\ + else \ __memtrack_addr = kmalloc(sz, flgs); \ - if ( !ZERO_OR_NULL_PTR(__memtrack_addr)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \ + if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem()) \ + get_random_bytes(__memtrack_addr, sz); \ } \ __memtrack_addr; \ }) #else #define kmalloc(sz, flgs) ({ \ - void *__memtrack_addr; \ + void *__memtrack_addr = NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\ + else \ __memtrack_addr = kmalloc(sz, flgs); \ - if ( __memtrack_addr ) { \ - memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem()) \ + get_random_bytes(__memtrack_addr, sz); \ } \ 
__memtrack_addr; \ }) #endif +#define kmalloc_node(sz, flgs, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc_node"); \ + else \ + __memtrack_addr = kmalloc_node(sz, flgs, node); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \ + get_random_bytes(__memtrack_addr, sz); \ + } \ + __memtrack_addr; \ +}) + +#ifdef ZERO_OR_NULL_PTR +#define kmemdup(src, sz, flgs) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\ + else \ + __memtrack_addr = kmemdup(src, sz, flgs); \ + if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + } \ + __memtrack_addr; \ +}) +#else +#define kmemdup(src, sz, flgs) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\ + else \ + __memtrack_addr = kmemdup(src, sz, flgs); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + } \ + __memtrack_addr; \ +}) +#endif + #ifdef ZERO_OR_NULL_PTR #define kfree(addr) ({ \ void *__memtrack_addr = (void *)addr; \ - if ( !ZERO_OR_NULL_PTR(__memtrack_addr) ) { \ - memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \ + \ + if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \ + !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ } \ kfree(__memtrack_addr); \ }) #else #define kfree(addr) ({ \ void *__memtrack_addr = (void *)addr; \ - if ( __memtrack_addr ) { \ - memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \ + \ + if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ } \ kfree(__memtrack_addr); \ }) #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) || defined (CONFIG_COMPAT_RCU) +#ifdef kfree_rcu + #undef kfree_rcu +#endif - - - +#ifdef ZERO_OR_NULL_PTR +#define kfree_rcu(addr, rcu_head) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \ + !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ +}) +#else +#define kfree_rcu(addr, rcu_head) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ +}) +#endif +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) */ #define vmalloc(size) ({ \ - void *__memtrack_addr; \ + void *__memtrack_addr = NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc");\ + else \ __memtrack_addr = vmalloc(size); \ - if ( __memtrack_addr ) { \ - memtrack_alloc(MEMTRACK_VMALLOC, (unsigned 
long)(__memtrack_addr), size, __FILE__, __LINE__, GFP_ATOMIC); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + get_random_bytes(__memtrack_addr, size); \ + } \ + __memtrack_addr; \ +}) + +#ifndef vzalloc +#define vzalloc(size) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc");\ + else \ + __memtrack_addr = vzalloc(size); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ } \ __memtrack_addr; \ }) +#endif +#ifndef vzalloc_node +#define vzalloc_node(size, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc_node"); \ + else \ + __memtrack_addr = vzalloc_node(size, node); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + get_random_bytes(__memtrack_addr, size); \ + } \ + __memtrack_addr; \ +}) +#endif + +#define vmalloc_node(size, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc_node"); \ + else \ + __memtrack_addr = vmalloc_node(size, node); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + get_random_bytes(__memtrack_addr, size); \ + } \ + __memtrack_addr; \ +}) #define vfree(addr) ({ \ void *__memtrack_addr = (void *)addr; \ - if ( __memtrack_addr ) { \ - memtrack_free(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \ + if (__memtrack_addr) { \ + memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ } \ vfree(__memtrack_addr); \ }) #define kmem_cache_alloc(cache, flags) ({ \ - void *__memtrack_addr; \ + void *__memtrack_addr = NULL; \ \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmem_cache_alloc"); \ + else \ __memtrack_addr = kmem_cache_alloc(cache, flags); \ - if ( __memtrack_addr ) { \ - memtrack_alloc(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__, flags); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 1, 0UL, 0, __FILE__, __LINE__, flags); \ } \ __memtrack_addr; \ }) @@ -127,12 +297,548 @@ #define kmem_cache_free(cache, addr) ({ \ void *__memtrack_addr = (void *)addr; \ - if ( __memtrack_addr ) { \ - memtrack_free(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \ + \ + if (__memtrack_addr) { \ + memtrack_free(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ } \ kmem_cache_free(cache, __memtrack_addr); \ }) +/* All IO-MAP handling */ +#define ioremap(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap");\ + else \ + __memtrack_addr = ioremap(phys_addr, size); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + 
__memtrack_addr; \ +}) + +#define io_mapping_create_wc(base, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "io_mapping_create_wc"); \ + else \ + __memtrack_addr = io_mapping_create_wc(base, size); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) + +#define io_mapping_free(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr) { \ + memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + io_mapping_free(__memtrack_addr); \ +}) + +#ifdef CONFIG_PPC +#ifdef ioremap_nocache + #undef ioremap_nocache +#endif +#define ioremap_nocache(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ + else \ + __memtrack_addr = ioremap(phys_addr, size); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#else +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18) /* 2.6.16 - 2.6.17 */ +#ifdef ioremap_nocache + #undef ioremap_nocache +#endif +#define ioremap_nocache(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ + else \ + __memtrack_addr = ioremap(phys_addr, size); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#else +#define ioremap_nocache(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ + else \ + __memtrack_addr = ioremap_nocache(phys_addr, size); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#endif /* Kernel version is under 2.6.18 */ +#endif /* PPC */ + +#define iounmap(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr) { \ + memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + iounmap(__memtrack_addr); \ +}) + + +/* All Page handlers */ +/* TODO: Catch netif_rx for page dereference */ +#define alloc_pages_node(nid, gfp_mask, order) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages_node"); \ + else \ + page_addr = (struct page *)alloc_pages_node(nid, gfp_mask, order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + page_addr; \ +}) + +#ifdef CONFIG_NUMA +#define alloc_pages(gfp_mask, order) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages"); \ + else \ + page_addr = (struct page *)alloc_pages(gfp_mask, order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + 
memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + page_addr; \ +}) +#else +#ifdef alloc_pages + #undef alloc_pages +#endif +#define alloc_pages(gfp_mask, order) ({ \ + struct page *page_addr; \ + \ + page_addr = (struct page *)alloc_pages_node(numa_node_id(), gfp_mask, order); \ + page_addr; \ +}) +#endif + +#define __get_free_pages(gfp_mask, order) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "__get_free_pages"); \ + else \ + page_addr = (struct page *)__get_free_pages(gfp_mask, order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + page_addr; \ +}) + +#define get_zeroed_page(gfp_mask) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_zeroed_page"); \ + else \ + page_addr = (struct page *)get_zeroed_page(gfp_mask); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + (unsigned long)page_addr; \ +}) + +#define __free_pages(addr, order) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ + memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __free_pages(addr, order); \ +}) + + +#define free_pages(addr, order) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ + memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + free_pages(addr, order); \ +}) + + +#define get_page(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + if (memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 0, __FILE__, __LINE__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + } \ + get_page(addr); \ +}) + +#define get_user_pages_fast(start, nr_pages, write, pages) ({ \ + int __memtrack_rc = -1; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_user_pages_fast"); \ + else \ + __memtrack_rc = get_user_pages_fast(start, nr_pages, write, pages); \ + if (__memtrack_rc > 0 && !is_non_trackable_alloc_func(__func__)) { \ + int __memtrack_i; \ + \ + for (__memtrack_i = 0; __memtrack_i < __memtrack_rc; __memtrack_i++) \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(pages[__memtrack_i]), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_rc; \ +}) + +#define put_page(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + /* Check whether this is not part of umem put page & not */\ + /* a new addr and the ref-count is 1 then we'll free this 
addr */\ + /* Don't change the order these conditions */ \ + if (!is_umem_put_page(__func__) && \ + !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__) && \ + (memtrack_get_page_ref_count((unsigned long)(__memtrack_addr)) == 1)) { \ + memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + } \ + put_page(addr); \ +}) + + +/* Work-Queue handlers */ +#ifdef create_workqueue + #undef create_workqueue +#endif +#ifdef create_rt_workqueue + #undef create_rt_workqueue +#endif +#ifdef create_freezeable_workqueue + #undef create_freezeable_workqueue +#endif +#ifdef create_singlethread_workqueue + #undef create_singlethread_workqueue +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) /* 2.6.18 - 2.6.19 */ +#define create_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#define create_singlethread_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 1); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) /* 2.6.20 - 2.6.27 */ +#define create_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 0, 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 22) /* 2.6.20 - 2.6.21 */ +#define create_freezeable_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 0, 1); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#else /* 2.6.22 - 2.6.27 */ +#define create_freezeable_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 1, 1); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#endif /* 2.6.20 - 2.6.27 */ + +#define create_singlethread_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 1, 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, 
GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) /* 2.6.28 - 2.6.35 */ + +#ifdef alloc_workqueue + #undef alloc_workqueue +#endif + +#define alloc_workqueue(name, flags, max_active) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), (flags), (max_active), 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#define create_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 0, 0, 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#define create_rt_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_rt_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 0, 0, 1); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#define create_freezeable_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 1, 1, 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) + +#define create_singlethread_workqueue(name) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ + else \ + wq_addr = __create_workqueue((name), 1, 0, 0); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#else /* 2.6.36 */ +#ifdef alloc_workqueue + #undef alloc_workqueue +#endif +#ifdef CONFIG_LOCKDEP +#define alloc_workqueue(name, flags, max_active) \ +({ \ + static struct lock_class_key __key; \ + const char *__lock_name; \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (__builtin_constant_p(name)) \ + __lock_name = (name); \ + else \ + __lock_name = #name; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ + else \ + wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \ + &__key, __lock_name); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#else +#define alloc_workqueue(name, flags, max_active) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ + else \ + wq_addr = __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) 
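/*
 * Editor's note -- illustrative sketch, not part of the patch: every memtrack
 * wrapper in this header follows the same GNU statement-expression pattern:
 * optionally inject a failure, otherwise call the real allocator, record the
 * result, and yield the pointer as the value of the expression.  The
 * self-shadowing works because the preprocessor never re-expands a macro's
 * own name inside its replacement list.  The user-space sketch below uses
 * hypothetical my_inject_error()/my_track_alloc()/xalloc() helpers in place
 * of memtrack_inject_error()/memtrack_alloc()/kmalloc(); it builds with GCC
 * or Clang (statement expressions are a GNU extension, as in the code above).
 */
#include <stdio.h>
#include <stdlib.h>

/* stand-ins for the kernel-side helpers used by the macros above */
static int my_inject_error(void) { return 0; }  /* memtrack_inject_error() */
static void my_track_alloc(void *p, size_t sz, const char *file, int line)
{
	printf("alloc %p (%zu bytes) at %s:%d\n", p, sz, file, line);
}

/* the "real" allocator being wrapped (kmalloc in the header above) */
static void *xalloc(size_t sz)
{
	return malloc(sz);
}

/*
 * Wrapper macro in the same shape as the kmalloc()/vmalloc() wrappers above;
 * the inner xalloc(sz) call is left alone by the preprocessor and so reaches
 * the real function.
 */
#define xalloc(sz) ({                                                    \
	void *__trk_addr = NULL;                                         \
	                                                                 \
	if (my_inject_error())                                           \
		fprintf(stderr, "injected alloc failure at %s:%d\n",     \
			__FILE__, __LINE__);                             \
	else                                                             \
		__trk_addr = xalloc(sz);  /* calls the real function */  \
	if (__trk_addr)                                                  \
		my_track_alloc(__trk_addr, (sz), __FILE__, __LINE__);    \
	__trk_addr;                                                      \
})

int main(void)
{
	char *buf = xalloc(64);   /* expands to the tracked wrapper */

	free(buf);
	return 0;
}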
+#endif + +#define create_workqueue(name) \ + alloc_workqueue((name), WQ_RESCUER, 1); + +#define create_freezeable_workqueue(name) \ + alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1); + +#define create_singlethread_workqueue(name) \ + alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1); + +#endif /* Work-Queue Kernel Versions */ + +#define destroy_workqueue(wq_addr) ({ \ + void *__memtrack_addr = (void *)wq_addr; \ + \ + if (__memtrack_addr) { \ + memtrack_free(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + destroy_workqueue(wq_addr); \ +}) + +/* ONLY error injection to functions that we don't monitor */ +#define alloc_skb(size, prio) ({ \ + struct sk_buff *__memtrack_skb = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb"); \ + else \ + __memtrack_skb = alloc_skb(size, prio); \ + __memtrack_skb; \ +}) + +#define dev_alloc_skb(size) ({ \ + struct sk_buff *__memtrack_skb = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "dev_alloc_skb"); \ + else \ + __memtrack_skb = dev_alloc_skb(size); \ + __memtrack_skb; \ +}) + +#define alloc_skb_fclone(size, prio) ({ \ + struct sk_buff *__memtrack_skb = NULL; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb_fclone"); \ + else \ + __memtrack_skb = alloc_skb_fclone(size, prio); \ + __memtrack_skb; \ +}) + +#define copy_from_user(to, from, n) ({ \ + int ret = n; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_from_user"); \ + else \ + ret = copy_from_user(to, from, n); \ + ret; \ +}) + +#define copy_to_user(to, from, n) ({ \ + int ret = n; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_to_user"); \ + else \ + ret = copy_to_user(to, from, n); \ + ret; \ +}) + +#define sysfs_create_file(kobj, attr) ({ \ + int ret = -ENOSYS; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_file"); \ + else \ + ret = sysfs_create_file(kobj, attr); \ + ret; \ +}) + +#define sysfs_create_link(kobj, target, name) ({ \ + int ret = -ENOSYS; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_link"); \ + else \ + ret = sysfs_create_link(kobj, target, name); \ + ret; \ +}) + +#define sysfs_create_group(kobj, grp) ({ \ + int ret = -ENOSYS; \ + \ + if (memtrack_inject_error()) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_group"); \ + else \ + ret = sysfs_create_group(kobj, grp); \ + ret; \ +}) + #endif /* __mtrack_h_ */ diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile deleted file mode 100644 index 7b81da0c060c..000000000000 --- a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# $FreeBSD$ -#.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4 -#.PATH: ${.CURDIR}/../../../../include/linux - -.include - -KMOD = mlx4ib -SRCS = device_if.h bus_if.h pci_if.h vnode_if.h -#SRCS+= linux_compat.c linux_radix.c -SRCS+= ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c -SRCS+= opt_inet.h opt_inet6.h - -#CFLAGS+= -I${.CURDIR}/../../ofed/include/ -CFLAGS+= -I${.CURDIR}/../../../../include -CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM - -.if !defined(KERNBUILDDIR) -.if ${MK_INET_SUPPORT} != "no" -opt_inet.h: - @echo "#define INET 1" > 
${.TARGET} -.endif - -.if ${MK_INET6_SUPPORT} != "no" -opt_inet6.h: - @echo "#define INET6 1" > ${.TARGET} -.endif -.endif - -.include - -CFLAGS+= -Wno-cast-qual -Wno-pointer-arith ${GCC_MS_EXTENSIONS} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c index fe35e62d4ae5..1c30fa996796 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c @@ -30,7 +30,6 @@ * SOFTWARE. */ - #include #include #include @@ -95,21 +94,18 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; - union ib_gid sgid; - u8 mac[6]; - int err; - int is_mcast; + int is_mcast = 0; + struct in6_addr in6; u16 vlan_tag; - err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); - if (err) - return ERR_PTR(err); - - memcpy(ah->av.eth.mac, mac, 6); - err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); - if (err) - return ERR_PTR(err); - vlan_tag = rdma_get_vlan_id(&sgid); + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) { + is_mcast = 1; + resolve_mcast_mac(&in6, ah->av.eth.mac); + } else { + memcpy(ah->av.eth.mac, ah_attr->dmac, 6); + } + vlan_tag = ah_attr->vlan_id; if (vlan_tag < 0x1000) vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c b/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c index 0738adc5cd03..17e646a02805 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -57,6 +57,7 @@ struct mlx4_alias_guid_work_context { int query_id; struct list_head list; int block_num; + u8 method; }; struct mlx4_next_alias_guid_work { @@ -80,7 +81,8 @@ void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. ports_guid[port_num - 1]. all_rec_per_port[block_num].guid_indexes); - pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, (long long)guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, + (unsigned long long)guid_indexes); for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { /* The location of the specific index starts from bit number 4 @@ -144,7 +146,8 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. ports_guid[port_num - 1]. 
all_rec_per_port[block_num].guid_indexes); - pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, (long long)guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, + (unsigned long long)guid_indexes); /*calculate the slaves and notify them*/ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { @@ -201,7 +204,7 @@ static void aliasguid_query_handler(int status, { struct mlx4_ib_dev *dev; struct mlx4_alias_guid_work_context *cb_ctx = context; - u8 port_index ; + u8 port_index; int i; struct mlx4_sriov_alias_guid_info_rec_det *rec; unsigned long flags, flags1; @@ -240,6 +243,18 @@ static void aliasguid_query_handler(int status, for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { __be64 tmp_cur_ag; tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE]; + if ((cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) + && (MLX4_NOT_SET_GUID == tmp_cur_ag)) { + pr_debug("%s:Record num %d in block_num:%d " + "was deleted by SM,ownership by %d " + "(0 = driver, 1=sysAdmin, 2=None)\n", + __func__, i, guid_rec->block_num, + rec->ownership); + rec->guid_indexes = rec->guid_indexes & + ~mlx4_ib_get_aguid_comp_mask_from_ix(i); + continue; + } + /* check if the SM didn't assign one of the records. * if it didn't, if it was not sysadmin request: * ask the SM to give a new GUID, (instead of the driver request). @@ -379,7 +394,7 @@ static int set_guid_rec(struct ib_device *ibdev, callback_context->port = port; callback_context->dev = dev; callback_context->block_num = index; - + callback_context->method = rec_det->method; memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); guid_info_rec.lid = cpu_to_be16(attr.lid); diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cm.c b/sys/ofed/drivers/infiniband/hw/mlx4/cm.c index 1bfbeee57107..3ff7600dd775 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/cm.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/cm.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -60,6 +61,11 @@ struct cm_generic_msg { __be32 remote_comm_id; }; +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + struct cm_req_msg { unsigned char unused[0x60]; union ib_gid primary_path_sgid; @@ -68,28 +74,62 @@ struct cm_req_msg { static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) { + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; msg->local_comm_id = cpu_to_be32(cm_id); + } } static u32 get_local_comm_id(struct ib_mad *mad) { + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - return be32_to_cpu(msg->local_comm_id); + } } static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) { + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { struct cm_generic_msg *msg = (struct 
cm_generic_msg *)mad; msg->remote_comm_id = cpu_to_be32(cm_id); + } } static u32 get_remote_comm_id(struct ib_mad *mad) { + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - return be32_to_cpu(msg->remote_comm_id); + } } static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) @@ -285,19 +325,22 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id u32 sl_cm_id; int pv_cm_id = -1; - sl_cm_id = get_local_comm_id(mad); - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || - mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); id = id_map_alloc(ibdev, slave_id, sl_cm_id); if (IS_ERR(id)) { mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", __func__, slave_id, sl_cm_id); return PTR_ERR(id); } - } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { return 0; } else { + sl_cm_id = get_local_comm_id(mad); id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); } @@ -323,7 +366,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, u32 pv_cm_id; struct id_map_entry *id; - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { union ib_gid gid; if (is_eth) @@ -333,7 +377,7 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); if (*slave < 0) { mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", - (long long)gid.global.interface_id); + (unsigned long long)gid.global.interface_id); return -ENOENT; } return 0; diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c index 293917a2c682..52788c291638 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c @@ -33,6 +33,7 @@ #include #include +#include #include #include "mlx4_ib.h" @@ -92,12 +93,33 @@ static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq) return get_sw_cqe(cq, cq->mcq.cons_index); } -int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +int mlx4_ib_modify_cq(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask) { + int err = 0; struct mlx4_ib_cq *mcq = to_mcq(cq); struct mlx4_ib_dev *dev = to_mdev(cq->device); - return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period); + if (cq_attr_mask & IB_CQ_CAP_FLAGS) { + if (cq_attr->cq_cap_flags & IB_CQ_TIMESTAMP) + return -ENOTSUPP; + + if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) { + if (dev->dev->caps.cq_flags & MLX4_DEV_CAP_CQ_FLAG_IO) + err = mlx4_cq_ignore_overrun(dev->dev, &mcq->mcq); + else + err = -ENOSYS; + } + } + + if (!err) + if (cq_attr_mask & IB_CQ_MODERATION) + err = mlx4_cq_modify(dev->dev, &mcq->mcq, + cq_attr->moderation.cq_count, + cq_attr->moderation.cq_period); + + return err; } static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent) @@ 
-173,7 +195,11 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont return err; } -struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, +/* we don't support system timestamping */ +#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_TIMESTAMP + +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, + struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata) { @@ -181,11 +207,16 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector struct mlx4_ib_cq *cq; struct mlx4_uar *uar; int err; + int entries = attr->cqe; + int vector = attr->comp_vector; if (entries < 1 || entries > dev->dev->caps.max_cqes) return ERR_PTR(-EINVAL); - cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); if (!cq) return ERR_PTR(-ENOMEM); @@ -195,6 +226,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + cq->create_flags = attr->flags; if (context) { struct mlx4_ib_create_cq ucmd; @@ -236,7 +268,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector vector = dev->eq_table[vector % ibdev->num_comp_vectors]; err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq, vector, 0, 0); + cq->db.dma, &cq->mcq, vector, 0, + !!(cq->create_flags & IB_CQ_TIMESTAMP)); if (err) goto err_dbmap; @@ -331,21 +364,23 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) u32 i; i = cq->mcq.cons_index; - while (get_sw_cqe(cq, i & cq->ibcq.cqe)) + while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; } -static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) +static int mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { struct mlx4_cqe *cqe, *new_cqe; int i; int cqe_size = cq->buf.entry_size; int cqe_inc = cqe_size == 64 ? 1 : 0; + struct mlx4_cqe *start_cqe; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); + start_cqe = cqe; cqe += cqe_inc; while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { @@ -357,9 +392,15 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | (((i + 1) & (cq->resize_buf->cqe + 1)) ? 
MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + if (cqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn); + return -ENOMEM; + } cqe += cqe_inc; + } ++cq->mcq.cons_index; + return 0; } int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) @@ -374,7 +415,6 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) return -ENOSYS; mutex_lock(&cq->resize_mutex); - if (entries < 1 || entries > dev->dev->caps.max_cqes) { err = -EINVAL; goto out; @@ -386,6 +426,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; } + if (entries > dev->dev->caps.max_cqes + 1) { + err = -EINVAL; + goto out; + } + if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) @@ -425,7 +470,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) spin_lock_irq(&cq->lock); if (cq->resize_buf) { - mlx4_ib_cq_resize_copy_cqes(cq); + err = mlx4_ib_cq_resize_copy_cqes(cq); tmp_buf = cq->buf; tmp_cqe = cq->ibcq.cqe; cq->buf = cq->resize_buf->buf; @@ -580,7 +625,7 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) } static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, - unsigned tail, struct mlx4_cqe *cqe) + unsigned tail, struct mlx4_cqe *cqe, int is_eth) { struct mlx4_ib_proxy_sqp_hdr *hdr; @@ -590,12 +635,19 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct DMA_FROM_DEVICE); hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); - wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); - wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? 
(IB_WC_GRH) : 0; wc->dlid_path_bits = 0; + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + return 0; } @@ -607,11 +659,14 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_qp *mqp; struct mlx4_ib_wq *wq; struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; int is_send; int is_error; u32 g_mlpath_rqpn; u16 wqe_ctr; unsigned tail = 0; + int timestamp_en = !!(cq->create_flags & IB_CQ_TIMESTAMP); + repoll: cqe = next_cqe_sw(cq); @@ -675,6 +730,20 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, wc->qp = &(*cur_qp)->ibqp; + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + if (unlikely(!msrq)) { + pr_warn("CQ %06x with entry for unknown SRQN %06x\n", + cq->mcq.cqn, srq_num); + return -EINVAL; + } + } + if (is_send) { wq = &(*cur_qp)->sq; if (!(*cur_qp)->sq_signal_bits) { @@ -688,6 +757,11 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, wqe_ctr = be16_to_cpu(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_ctr]; mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; tail = wq->tail & (wq->wqe_cnt - 1); @@ -707,6 +781,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { case MLX4_OPCODE_RDMA_WRITE_IMM: wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ case MLX4_OPCODE_RDMA_WRITE: wc->opcode = IB_WC_RDMA_WRITE; break; @@ -778,10 +853,31 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, if ((*cur_qp)->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) - return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); + return use_tunnel_data + (*cur_qp, cq, wc, tail, cqe, + rdma_port_get_link_layer + (wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); } + if (timestamp_en) { + /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is + * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't + * supported */ + if (cq->create_flags & IB_CQ_TIMESTAMP_TO_SYS_TIME) { + wc->ts.timestamp = 0; + } else { + wc->ts.timestamp = + ((u64)(be32_to_cpu(cqe->timestamp_16_47) + + !cqe->timestamp_0_15) << 16) + | be16_to_cpu(cqe->timestamp_0_15); + wc->wc_flags |= IB_WC_WITH_TIMESTAMP; + } + } else { + wc->wc_flags |= IB_WC_WITH_SLID; wc->slid = be16_to_cpu(cqe->rlid); + } g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; @@ -789,11 +885,27 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum) ? 
IB_WC_IP_CSUM_OK : 0; + if (!timestamp_en) { if (rdma_port_get_link_layer(wc->qp->device, - (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET) wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; else wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->wc_flags |= IB_WC_WITH_SL; + } + if ((be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) && !timestamp_en) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + wc->wc_flags |= IB_WC_WITH_VLAN; + } else { + wc->vlan_id = 0xffff; + } + if (!timestamp_en) { + memcpy(wc->smac, cqe->smac, 6); + wc->wc_flags |= IB_WC_WITH_SMAC; + } } return 0; diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c index 8aee4233b388..c51740986367 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c @@ -45,7 +45,6 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db) { struct mlx4_ib_user_db_page *page; - struct ib_umem_chunk *chunk; int err = 0; mutex_lock(&context->db_page_mutex); @@ -73,8 +72,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, list_add(&page->list, &context->db_page_list); found: - chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list); - db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK); + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); db->u.user_page = page; ++page->refcnt; diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c index 74bbf5c5f352..bd3693147aec 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c @@ -545,11 +545,32 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, /* adjust tunnel data */ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); - tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 
0x80 : 0; + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + if (vlan != wc->vlan_id) + /* VST and default vlan is not the packet vlan drop the + * packet*/ + goto out; + else + /* VST , remove hide the vlan from the VF */ + vlan = 0; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + ib_dma_sync_single_for_device(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), @@ -696,12 +717,11 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, be16_to_cpu(in_mad->mad_hdr.attr_id)); if (in_wc->wc_flags & IB_WC_GRH) { pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", - (long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix), - (long long) - be64_to_cpu(in_grh->sgid.global.interface_id)); + (unsigned long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix), + (unsigned long long)be64_to_cpu(in_grh->sgid.global.interface_id)); pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", - (long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix), - (long long)be64_to_cpu(in_grh->dgid.global.interface_id)); + (unsigned long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix), + (unsigned long long)be64_to_cpu(in_grh->dgid.global.interface_id)); } } @@ -946,7 +966,7 @@ int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, - MLX4_CMD_WRAPPED); + MLX4_CMD_NATIVE); if (!err) memcpy(counter, mailbox->buf, MLX4_IF_STAT_SZ(1)); @@ -961,7 +981,7 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, { struct mlx4_ib_dev *dev = to_mdev(ibdev); int err; - u32 counter_index = dev->counters[port_num - 1] & 0xffff; + u32 counter_index = dev->counters[port_num - 1].counter_index & 0xffff; u8 mode; char counter_buf[MLX4_IF_STAT_SZ(1)]; union mlx4_counter *counter = (union mlx4_counter *) @@ -970,10 +990,16 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) return -EINVAL; - if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) { - err = IB_MAD_RESULT_FAILURE; - } else { + /* in case of default counter IB shares the counter with ETH */ + /* the state could be -EEXIST or -ENOSPC */ + if (dev->counters[port_num - 1].status) { memset(out_mad->data, 0, sizeof out_mad->data); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } else { + if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) + return IB_MAD_RESULT_FAILURE; + + memset(out_mad->data, 0, sizeof(out_mad->data)); mode = counter->control.cnt_mode & 0xFF; err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; switch (mode & 0xf) { @@ -992,7 +1018,6 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - return err; } @@ -1179,6 +1204,11 @@ void handle_port_mgmt_change_event(struct work_struct *work) u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; update_sm_ah(dev, port, lid, sl); + mlx4_ib_dispatch_event(dev, port, IB_EVENT_SM_CHANGE); + if (mlx4_is_master(dev->dev)) + 
mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + changed_attr & MSTR_SM_CHANGE_MASK, + lid, sl); } /* Check if it is a lid change event */ @@ -1295,8 +1325,9 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, - enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; @@ -1385,6 +1416,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, wr.num_sge = 1; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + ret = ib_post_send(send_qp, &wr, &bad_wr); out: @@ -1512,6 +1546,11 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc if (ah_attr.ah_flags & IB_AH_GRH) if (get_real_sgid_index(dev, slave, ctx->port, &ah_attr)) return; + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = tunnel->hdr.vlan; + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? @@ -1519,7 +1558,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc be16_to_cpu(tunnel->hdr.pkey_index), be32_to_cpu(tunnel->hdr.remote_qpn), be32_to_cpu(tunnel->hdr.qkey), - &ah_attr, &tunnel->mad); + &ah_attr, wc->smac, &tunnel->mad); } static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, @@ -1564,6 +1603,12 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->ring[i].addr, rx_buf_size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(ctx->ib_dev, + tun_qp->ring[i].map))) { + mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n"); + kfree(tun_qp->ring[i].addr); + goto err; + } } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { @@ -1576,6 +1621,12 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->tx_ring[i].buf.addr, tx_buf_size, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ctx->ib_dev, + tun_qp->tx_ring[i].buf.map))) { + mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n"); + kfree(tun_qp->tx_ring[i].buf.addr); + goto tx_err; + } tun_qp->tx_ring[i].ah = NULL; } spin_lock_init(&tun_qp->tx_lock); @@ -1664,12 +1715,12 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) (MLX4_NUM_TUNNEL_BUFS - 1)); if (ret) pr_err("Failed reposting tunnel " - "buf:%lld\n", (long long)wc.wr_id); + "buf:%lld\n", (unsigned long long)wc.wr_id); break; case IB_WC_SEND: pr_debug("received tunnel send completion:" "wrid=0x%llx, status=0x%x\n", - (long long)wc.wr_id, wc.status); + (unsigned long long)wc.wr_id, wc.status); ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah @@ -1685,7 +1736,7 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) } else { pr_debug("mlx4_ib: completion error in tunnel: %d." 
" status = %d, wrid = 0x%llx\n", - ctx->slave, wc.status, (long long)wc.wr_id); + ctx->slave, wc.status, (unsigned long long)wc.wr_id); if (!MLX4_TUN_IS_RECV(wc.wr_id)) { ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); @@ -1757,6 +1808,11 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, memset(&attr, 0, sizeof attr); attr.qp_state = IB_QPS_INIT; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, 0xFFFF, &attr.pkey_index); + if (ret || !create_tun) attr.pkey_index = to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; attr.qkey = IB_QP1_QKEY; @@ -1837,7 +1893,7 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work) if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1))) pr_err("Failed reposting SQP " - "buf:%lld\n", (long long)wc.wr_id); + "buf:%lld\n", (unsigned long long)wc.wr_id); break; default: BUG_ON(1); @@ -1846,7 +1902,7 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work) } else { pr_debug("mlx4_ib: completion error in tunnel: %d." " status = %d, wrid = 0x%llx\n", - ctx->slave, wc.status, (long long)wc.wr_id); + ctx->slave, wc.status, (unsigned long long)wc.wr_id); if (!MLX4_TUN_IS_RECV(wc.wr_id)) { ib_destroy_ah(sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah); diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/main.c b/sys/ofed/drivers/infiniband/hw/mlx4/main.c index fd0b7235873a..bdcffbe84f92 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/main.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/main.c @@ -32,37 +32,37 @@ */ #include - -#ifdef __linux__ -#include -#endif - #include #include #include #include #include -#include -#include #include +#include #include #include +#include #include #include #include #include +#include +#include #include "mlx4_ib.h" +#include "mlx4_exp.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" -#define DRV_RELDATE "April 4, 2008" +#define DRV_RELDATE __DATE__ #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); @@ -73,20 +73,30 @@ MODULE_VERSION(DRV_VERSION); int mlx4_ib_sm_guid_assign = 1; -#ifdef __linux__ -struct proc_dir_entry *mlx4_mrs_dir_entry; -static struct proc_dir_entry *mlx4_ib_driver_dir_entry; -#endif - module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); -static char dev_assign_str[512]; -//module_param_string(dev_assign_str, dev_assign_str, sizeof(dev_assign_str), 0644); -MODULE_PARM_DESC(dev_assign_str, "Map all device function numbers to " - "IB device numbers following the pattern: " - "bb:dd.f-0,bb:dd.f-1,... (all numbers are hexadecimals)." - " Max supported devices - 32"); +enum { + MAX_NUM_STR_BITMAP = 1 << 15, + DEFAULT_TBL_VAL = -1 +}; + +static struct mlx4_dbdf2val_lst dev_assign_str = { + .name = "dev_assign_str param", + .num_vals = 1, + .def_val = {DEFAULT_TBL_VAL}, + .range = {0, MAX_NUM_STR_BITMAP - 1} +}; +module_param_string(dev_assign_str, dev_assign_str.str, + sizeof(dev_assign_str.str), 0444); +MODULE_PARM_DESC(dev_assign_str, + "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n" + "\t\tHexadecimal digits for the device function (e.g. 
002b:1c:0b.a) and decimal for IB device numbers (e.g. 1).\n" + "\t\tMax supported devices - 32"); + + +static unsigned long *dev_num_str_bitmap; +static spinlock_t dev_num_str_lock; static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" @@ -106,11 +116,16 @@ struct dev_rec { int nr; }; -#define MAX_DR 32 -static struct dev_rec dr[MAX_DR]; +static int dr_active; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*, + unsigned long); + +static u8 mlx4_ib_get_dev_port(struct net_device *dev, + struct mlx4_ib_dev *ibdev); + static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) @@ -123,7 +138,30 @@ static void init_query_mad(struct ib_smp *mad) static union ib_gid zgid; -static int mlx4_ib_query_device(struct ib_device *ibdev, +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + dmfs = 0; + } + } + return dmfs; +} + +int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); @@ -174,12 +212,26 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + + if (check_flow_steering_support(dev->dev)) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; @@ -213,6 +265,13 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; + props->hca_core_clock = dev->dev->caps.hca_core_clock; + if (dev->dev->caps.hca_core_clock > 0) + props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; + if (dev->dev->caps.cq_timestamp) { + props->timestamp_mask = 0xFFFFFFFFFFFF; + props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK; + } out: kfree(in_mad); @@ -334,6 +393,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; + unsigned long flags; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); @@ -362,7 +422,7 @@ 
static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; - spin_lock(&iboe->lock); + spin_lock_irqsave(&iboe->lock, flags); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; @@ -374,7 +434,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: - spin_unlock(&iboe->lock); + spin_unlock_irqrestore(&iboe->lock, flags); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; @@ -674,7 +734,9 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } -#ifdef __linux__ + +/* XXX FBSD has no support for get_unmapped_area function */ +#if 0 static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -732,7 +794,6 @@ static unsigned long mlx4_ib_get_unmapped_area(struct file *file, static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); - int err; /* Last 8 bits hold the command others are data per that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; @@ -758,31 +819,81 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; - } else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { - /* Getting contiguous physical pages */ - unsigned long total_size = vma->vm_end - vma->vm_start; - unsigned long page_size_order = (vma->vm_pgoff) >> - MLX4_IB_MMAP_CMD_BITS; - struct ib_cmem *ib_cmem; - ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size, - page_size_order); - if (IS_ERR(ib_cmem)) { - err = PTR_ERR(ib_cmem); - return err; - } + } else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) { + struct mlx4_clock_params params; + int ret; - err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma); - if (err) { - ib_cmem_release_contiguous_pages(ib_cmem); - return err; - } - return 0; + ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); + if (ret) + return ret; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + (pci_resource_start(dev->dev->pdev, + params.bar) + params.offset) + >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; } else return -EINVAL; return 0; } +static int mlx4_ib_ioctl(struct ib_ucontext *context, unsigned int cmd, + unsigned long arg) +{ + struct mlx4_ib_dev *dev = to_mdev(context->device); + int ret; + int offset; + + switch (cmd) { + case MLX4_IOCHWCLOCKOFFSET: { + struct mlx4_clock_params params; + int ret; + ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); + if (!ret) { + offset = params.offset % PAGE_SIZE; + ret = put_user(offset, + (int *)arg); + return sizeof(int); + } else { + return ret; + } + } + default: { + pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n", + cmd, arg); + return -ENOTTY; + } + } + + return ret; +} + +static int mlx4_ib_query_values(struct ib_device *device, int q_values, + struct ib_device_values *values) +{ + struct mlx4_ib_dev *dev = to_mdev(device); + cycle_t cycles; + + values->values_mask = 0; + if (q_values & IBV_VALUES_HW_CLOCK) { + cycles = mlx4_read_clock(dev->dev); + if (cycles < 0) { + values->hwclock = cycles & CORE_CLOCK_MASK; + values->values_mask |= IBV_VALUES_HW_CLOCK; + } + q_values &= ~IBV_VALUES_HW_CLOCK; + } + + if 
(q_values) + return -ENOTTY; + + return 0; +} + static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) @@ -926,256 +1037,218 @@ struct mlx4_ib_steering { union ib_gid gid; }; -static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +static int parse_flow_attr(struct mlx4_dev *dev, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) { - int err; - struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); - struct mlx4_ib_qp *mqp = to_mqp(ibqp); - u64 reg_id; - struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_net_trans_rule_id type; - if (mdev->dev->caps.steering_mode == - MLX4_STEERING_MODE_DEVICE_MANAGED) { - ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); - if (!ib_steering) - return -ENOMEM; - } - - err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, - !!(mqp->flags & - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - MLX4_PROT_IB_IPV6, ®_id); - if (err) - goto err_malloc; - - err = add_gid_entry(ibqp, gid); - if (err) - goto err_add; - - if (ib_steering) { - memcpy(ib_steering->gid.raw, gid->raw, 16); - ib_steering->reg_id = reg_id; - mutex_lock(&mqp->mutex); - list_add(&ib_steering->list, &mqp->steering_rules); - mutex_unlock(&mqp->mutex); - } - return 0; - -err_add: - mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - MLX4_PROT_IB_IPV6, reg_id); -err_malloc: - kfree(ib_steering); - - return err; -} - -enum { - IBV_FLOW_L4_NONE = 0, - IBV_FLOW_L4_OTHER = 3, - IBV_FLOW_L4_UDP = 5, - IBV_FLOW_L4_TCP = 6 -}; - -struct mlx4_cm_steering { - struct list_head list; - u64 reg_id; - struct ib_flow_spec spec; -}; - -static int flow_spec_to_net_rule(struct ib_device *dev, struct ib_flow_spec *flow_spec, - struct list_head *rule_list_h) -{ - struct mlx4_spec_list *spec_l2, *spec_l3, *spec_l4; - u64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); - - spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL); - if (!spec_l2) - return -ENOMEM; - - switch (flow_spec->type) { - case IB_FLOW_ETH: - spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH; - memcpy(spec_l2->eth.dst_mac, flow_spec->l2_id.eth.mac, ETH_ALEN); - memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN); - spec_l2->eth.ether_type = flow_spec->l2_id.eth.ethertype; - if (flow_spec->l2_id.eth.vlan_present) { - spec_l2->eth.vlan_id = flow_spec->l2_id.eth.vlan; - spec_l2->eth.vlan_id_msk = cpu_to_be16(0x0fff); - } + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; - case IB_FLOW_IB_UC: - spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; - if(flow_spec->l2_id.ib_uc.qpn) { - spec_l2->ib.l3_qpn = cpu_to_be32(flow_spec->l2_id.ib_uc.qpn); - spec_l2->ib.qpn_msk = cpu_to_be32(0xffffff); - } + + case IB_FLOW_SPEC_IB: + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn; + mlx4_spec->ib.qpn_mask = ib_spec->ib.mask.l3_type_qpn; + memcpy(&mlx4_spec->ib.dst_gid, ib_spec->ib.val.dst_gid, 16); + memcpy(&mlx4_spec->ib.dst_gid_msk, + ib_spec->ib.mask.dst_gid, 16); break; - case IB_FLOW_IB_MC_IPV4: - case IB_FLOW_IB_MC_IPV6: - spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; - memcpy(spec_l2->ib.dst_gid, flow_spec->l2_id.ib_mc.mgid, 16); - memset(spec_l2->ib.dst_gid_msk, 0xff, 16); + + case IB_FLOW_SPEC_IPV4: + type = MLX4_NET_TRANS_RULE_ID_IPV4; 
+ mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; break; - } + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + type = ib_spec->type == IB_FLOW_SPEC_TCP ? + MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = + ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = + ib_spec->tcp_udp.mask.src_port; + break; - list_add_tail(&spec_l2->list, rule_list_h); - - if (flow_spec->l2_id.eth.ethertype == cpu_to_be16(ETH_P_IP) || - flow_spec->type != IB_FLOW_ETH) { - spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL); - if (!spec_l3) - return -ENOMEM; - - spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; - spec_l3->ipv4.src_ip = flow_spec->src_ip; - if (flow_spec->type != IB_FLOW_IB_MC_IPV4 && - flow_spec->type != IB_FLOW_IB_MC_IPV6) - spec_l3->ipv4.dst_ip = flow_spec->dst_ip; - - if (spec_l3->ipv4.src_ip) - spec_l3->ipv4.src_ip_msk = MLX4_BE_WORD_MASK; - if (spec_l3->ipv4.dst_ip) - spec_l3->ipv4.dst_ip_msk = MLX4_BE_WORD_MASK; - - list_add_tail(&spec_l3->list, rule_list_h); - } - - if (flow_spec->l4_protocol) { - spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL); - if (!spec_l4) - return -ENOMEM; - - spec_l4->tcp_udp.src_port = flow_spec->src_port; - spec_l4->tcp_udp.dst_port = flow_spec->dst_port; - if (spec_l4->tcp_udp.src_port) - spec_l4->tcp_udp.src_port_msk = - MLX4_BE_SHORT_MASK; - if (spec_l4->tcp_udp.dst_port) - spec_l4->tcp_udp.dst_port_msk = - MLX4_BE_SHORT_MASK; - - switch (flow_spec->l4_protocol) { - case IBV_FLOW_L4_UDP: - spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP; - break; - case IBV_FLOW_L4_TCP: - spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP; - break; - default: - dev_err(dev->dma_device, - "Unsupported l4 protocol.\n"); - kfree(spec_l4); - return -EPROTONOSUPPORT; - } - list_add_tail(&spec_l4->list, rule_list_h); - } - return 0; -} - -static int __mlx4_ib_flow_attach(struct mlx4_ib_dev *mdev, - struct mlx4_ib_qp *mqp, - struct ib_flow_spec *flow_spec, - int priority, int lock_qp) -{ - u64 reg_id = 0; - int err = 0; - struct mlx4_cm_steering *cm_flow; - struct mlx4_spec_list *spec, *tmp_spec; - - struct mlx4_net_trans_rule rule = - { .queue_mode = MLX4_NET_TRANS_Q_FIFO, - .exclusive = 0, - }; - - rule.promisc_mode = flow_spec->rule_type; - rule.port = mqp->port; - rule.qpn = mqp->mqp.qpn; - INIT_LIST_HEAD(&rule.list); - - cm_flow = kmalloc(sizeof(*cm_flow), GFP_KERNEL); - if (!cm_flow) - return -ENOMEM; - - if (rule.promisc_mode == MLX4_FS_REGULAR) { - rule.allow_loopback = !flow_spec->block_mc_loopback; - rule.priority = MLX4_DOMAIN_UVERBS | priority; - err = flow_spec_to_net_rule(&mdev->ib_dev, flow_spec, - &rule.list); - if (err) - goto free_list; - } - - err = mlx4_flow_attach(mdev->dev, &rule, ®_id); - if (err) - goto free_list; - - memcpy(&cm_flow->spec, flow_spec, sizeof(*flow_spec)); - cm_flow->reg_id = reg_id; - - if (lock_qp) - mutex_lock(&mqp->mutex); - list_add(&cm_flow->list, &mqp->rules_list); - if (lock_qp) - mutex_unlock(&mqp->mutex); - -free_list: - list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) { - list_del(&spec->list); - kfree(spec); - } - if (err) { - kfree(cm_flow); - dev_err(mdev->ib_dev.dma_device, - "Fail to attach flow steering rule\n"); - } - return err; -} - -static int __mlx4_ib_flow_detach(struct mlx4_ib_dev 
*mdev, - struct mlx4_ib_qp *mqp, - struct ib_flow_spec *spec, int priority, - int lock_qp) -{ - struct mlx4_cm_steering *cm_flow; - int ret; - - if (lock_qp) - mutex_lock(&mqp->mutex); - list_for_each_entry(cm_flow, &mqp->rules_list, list) { - if (!memcmp(&cm_flow->spec, spec, sizeof(*spec))) { - list_del(&cm_flow->list); - break; - } - } - if (lock_qp) - mutex_unlock(&mqp->mutex); - - if (&cm_flow->list == &mqp->rules_list) { - dev_err(mdev->ib_dev.dma_device, "Couldn't find reg_id for flow spec. " - "Steering rule is left attached\n"); + default: return -EINVAL; } + if (map_sw_to_hw_steering_id(dev, type) < 0 || + hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = hw_rule_sz(dev, type) >> 2; + return hw_rule_sz(dev, type); +} - ret = mlx4_flow_detach(mdev->dev, cm_flow->reg_id); +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + + (sizeof(struct _rule_hw) * flow_attr->num_of_specs); - kfree(cm_flow); + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value.\n"); + return -EINVAL; + } + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value.\n"); + return -EINVAL; + } + if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + memset(mailbox->buf, 0, rule_size); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK) + ctrl->flags = (1 << 3); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + size += ret; + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argumant. 
Fail to register network rule.\n"); + mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; } -static int mlx4_ib_flow_attach(struct ib_qp *qp, struct ib_flow_spec *flow_spec, - int priority) +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) { - return __mlx4_ib_flow_attach(to_mdev(qp->device), to_mqp(qp), - flow_spec, priority, 1); + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. registration id = 0x%llx\n", + (unsigned long long)reg_id); + return err; } -static int mlx4_ib_flow_detach(struct ib_qp *qp, struct ib_flow_spec *spec, - int priority) +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) { - return __mlx4_ib_flow_detach(to_mdev(qp->device), to_mqp(qp), - spec, priority, 1); + int err = 0, i = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + type[0] = MLX4_FS_REGULAR; + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_UC_SNIFFER; + type[1] = MLX4_FS_MC_SNIFFER; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i]); + if (err) + goto err_free; + i++; + } + + return &mflow->ibflow; + +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); + if (err) + ret = err; + i++; + } + + kfree(mflow); + return ret; } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) @@ -1194,40 +1267,14 @@ static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) return ret; } -static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) + +static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { - int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); - u8 mac[6]; - struct net_device *ndev; struct mlx4_ib_gid_entry *ge; - u64 reg_id = 0; - - if (mdev->dev->caps.steering_mode == - MLX4_STEERING_MODE_DEVICE_MANAGED) { - struct mlx4_ib_steering *ib_steering; - - mutex_lock(&mqp->mutex); - list_for_each_entry(ib_steering, &mqp->steering_rules, list) { - if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { - list_del(&ib_steering->list); - break; - } - } - mutex_unlock(&mqp->mutex); - if (&ib_steering->list == &mqp->steering_rules) { - pr_err("Couldn't find reg_id for mgid. 
Steering rule is left attached\n"); - return -EINVAL; - } - reg_id = ib_steering->reg_id; - kfree(ib_steering); - } - - err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - MLX4_PROT_IB_IPV6, reg_id); - if (err) - return err; + struct net_device *ndev; + u8 mac[6]; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); @@ -1250,8 +1297,174 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); + return ge != 0 ? 0 : -EINVAL; +} + +static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid, + int count) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + u64 reg_id = 0; + int record_err = 0; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + struct mlx4_ib_steering *tmp; + LIST_HEAD(temp); + + mutex_lock(&mqp->mutex); + list_for_each_entry_safe(ib_steering, tmp, &mqp->steering_rules, + list) { + if (memcmp(ib_steering->gid.raw, gid->raw, 16)) + continue; + + if (--count < 0) + break; + + list_del(&ib_steering->list); + list_add(&ib_steering->list, &temp); + } + mutex_unlock(&mqp->mutex); + list_for_each_entry_safe(ib_steering, tmp, &temp, + list) { + reg_id = ib_steering->reg_id; + + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, + gid->raw, + (ibqp->qp_type == IB_QPT_RAW_PACKET) ? + MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, + reg_id); + if (err) { + record_err = record_err ?: err; + continue; + } + + err = del_gid_entry(ibqp, gid); + if (err) { + record_err = record_err ?: err; + continue; + } + + list_del(&ib_steering->list); + kfree(ib_steering); + } + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &temp, list) { + list_add(&ib_steering->list, &mqp->steering_rules); + } + mutex_unlock(&mqp->mutex); + if (count) { + pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n"); + return -EINVAL; + } + + } else { + if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && + ibqp->qp_type == IB_QPT_RAW_PACKET) + gid->raw[5] = mqp->port; + + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + (ibqp->qp_type == IB_QPT_RAW_PACKET) ? + MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, + reg_id); + if (err) + return err; + + err = del_gid_entry(ibqp, gid); + + if (err) + return err; + } + + return record_err; +} + +static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + int count = (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) ? 
+ mdev->dev->caps.num_ports : 1; + + return _mlx4_ib_mcg_detach(ibqp, gid, lid, count); +} + +static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err = -ENODEV; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + DECLARE_BITMAP(ports, MLX4_MAX_PORTS); + int i = 0; + + if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && + ibqp->qp_type == IB_QPT_RAW_PACKET) + gid->raw[5] = mqp->port; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + bitmap_fill(ports, mdev->dev->caps.num_ports); + } else { + if (mqp->port <= mdev->dev->caps.num_ports) { + bitmap_zero(ports, mdev->dev->caps.num_ports); + set_bit(0, ports); + } else { + return -EINVAL; + } + } + + for (; i < mdev->dev->caps.num_ports; i++) { + u64 reg_id; + struct mlx4_ib_steering *ib_steering = NULL; + if (!test_bit(i, ports)) + continue; + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + goto err_add; + } + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, + gid->raw, i + 1, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + (ibqp->qp_type == IB_QPT_RAW_PACKET) ? + MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, + ®_id); + if (err) { + kfree(ib_steering); + goto err_add; + } + + err = add_gid_entry(ibqp, gid); + if (err) { + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + MLX4_PROT_IB_IPV6, reg_id); + kfree(ib_steering); + goto err_add; + } + + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + ib_steering->reg_id = reg_id; + } + } + return 0; + +err_add: + if (i > 0) + _mlx4_ib_mcg_detach(ibqp, gid, lid, i); + + return err; } static int init_node_data(struct mlx4_ib_dev *dev) @@ -1327,27 +1540,39 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr, dev->dev->board_id); } +static ssize_t show_vsd(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + ssize_t len = MLX4_VSD_LEN; + + if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX) + len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd); + else + memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN); + + return len; +} + static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, - &dev_attr_board_id + &dev_attr_board_id, + &dev_attr_vsd }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port) { -#ifdef __linux__ - memcpy(eui, dev->dev_addr, 3); - memcpy(eui + 5, dev->dev_addr + 3, 3); -#else memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); -#endif if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; @@ -1366,147 +1591,307 @@ static void update_gids_task(struct work_struct *work) int err; struct mlx4_dev *dev = gw->dev->dev; + mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", 
PTR_ERR(mailbox)); - return; + goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); - err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + if (err) pr_warn("set port command failed\n"); - else { - memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); - mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + else + mlx4_ib_dispatch_event(gw->dev, gw->port, + IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); +free: kfree(gw); } -static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) +static void reset_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = + container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("reset gid table failed\n"); + goto free; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof(gw->gids)); + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET && + dev->caps.num_ports > 0) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | 1, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn("set port 1 command failed\n"); + } + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) == + IB_LINK_LAYER_ETHERNET && + dev->caps.num_ports > 1) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn("set port 2 command failed\n"); + } + + mlx4_free_cmd_mailbox(dev, mailbox); +free: + kfree(gw); +} + +static int update_gid_table(struct mlx4_ib_dev *dev, int port, + union ib_gid *gid, int clear, int default_gid) { - struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; - struct net_device *tmp; int i; - u8 *hits; - union ib_gid gid; - int index_free; - int found; int need_update = 0; + int free = -1; + int found = -1; int max_gids; - u16 vid; + int start_index = !default_gid; + + max_gids = dev->dev->caps.gid_table_len[port]; + for (i = start_index; i < max_gids; ++i) { + if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, + sizeof(*gid))) + found = i; + + if (clear) { + if (found >= 0) { + need_update = 1; + dev->iboe.gid_table[port - 1][found] = zgid; + break; + } + } else { + if (found >= 0) + break; + + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], + &zgid, sizeof(*gid))) + free = i; + } + } + + if (found == -1 && !clear && free < 0) { + pr_err("GID table of port %d is full. 
Can't add "GID_PRINT_FMT"\n", + port, GID_PRINT_ARGS(gid)); + return -ENOMEM; + } + if (found == -1 && clear) { + pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port); + return -EINVAL; + } + if (found == -1 && !clear && free >= 0) { + dev->iboe.gid_table[port - 1][free] = *gid; + need_update = 1; + } + + if (!need_update) + return 0; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; - hits = kzalloc(128, GFP_ATOMIC); - if (!hits) { - kfree(work); - return -ENOMEM; - } - - max_gids = dev->dev->caps.gid_table_len[port]; - -#ifdef __linux__ - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { -#else - IFNET_RLOCK(); - TAILQ_FOREACH(tmp, &V_ifnet, if_link) { -#endif - if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { - gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - vid = rdma_vlan_dev_vlan_id(tmp); - mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); - found = 0; - index_free = -1; - for (i = 0; i < max_gids; ++i) { - if (index_free < 0 && - !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - index_free = i; - if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { - hits[i] = 1; - found = 1; - break; - } - } - - if (!found) { - if (tmp == ndev && - (memcmp(&dev->iboe.gid_table[port - 1][0], - &gid, sizeof gid) || - !memcmp(&dev->iboe.gid_table[port - 1][0], - &zgid, sizeof gid))) { - dev->iboe.gid_table[port - 1][0] = gid; - ++need_update; - hits[0] = 1; - } else if (index_free >= 0) { - dev->iboe.gid_table[port - 1][index_free] = gid; - hits[index_free] = 1; - ++need_update; - } - } - } -#ifdef __linux__ - } - rcu_read_unlock(); -#else - } - IFNET_RUNLOCK(); -#endif - - for (i = 0; i < max_gids; ++i) - if (!hits[i]) { - if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - ++need_update; - dev->iboe.gid_table[port - 1][i] = zgid; - } - - if (need_update) { - memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); INIT_WORK(&work->work, update_gids_task); work->port = port; work->dev = dev; queue_work(wq, &work->work); - } else - kfree(work); - kfree(hits); return 0; } -static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) +static int reset_gid_table(struct mlx4_ib_dev *dev) { - switch (event) { - case NETDEV_UP: -#ifdef __linux__ - case NETDEV_CHANGEADDR: + struct update_gid_work *work; + + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table)); + memset(work->gids, 0, sizeof(work->gids)); + INIT_WORK(&work->work, reset_gids_task); + work->dev = dev; + queue_work(wq, &work->work); + return 0; +} + +/* XXX BOND Related - stub (no support for these flags in FBSD)*/ +static inline int netif_is_bond_master(struct net_device *dev) +{ +#if 0 + return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING); #endif - update_ipv6_gids(dev, port, 0); + return 0; +} + +static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port); +} + +static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev) +{ + u8 port = 0; + struct mlx4_ib_iboe *iboe; + struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? 
+ rdma_vlan_dev_real_dev(dev) : dev; + + iboe = &ibdev->iboe; + + for (port = 1; port <= MLX4_MAX_PORTS; ++port) + if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1]))) break; - case NETDEV_DOWN: - update_ipv6_gids(dev, port, 1); - dev->iboe.netdevs[port - 1] = NULL; + return port > MLX4_MAX_PORTS ? 0 : port; +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port) +{ + struct ifaddr *ifa; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; +#endif + union ib_gid gid; + + + if ((port == 0) || (port > MLX4_MAX_PORTS)) + return; + + /* IPv4 gids */ + TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { + if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET){ + ipv6_addr_set_v4mapped( + ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr, + (struct in6_addr *)&gid); + update_gid_table(ibdev, port, &gid, 0, 0); + } + + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* IPv6 gids */ + in6_dev = in6_dev_get(dev); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + update_gid_table(ibdev, port, pgid, 0, 0); } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif } -static void netdev_added(struct mlx4_ib_dev *dev, int port) +static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev, + struct net_device *dev, u8 port) { - update_ipv6_gids(dev, port, 0); + union ib_gid gid; + mlx4_make_default_gid(dev, &gid, port); + update_gid_table(ibdev, port, &gid, 0, 1); } -static void netdev_removed(struct mlx4_ib_dev *dev, int port) +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) { - update_ipv6_gids(dev, port, 1); + struct net_device *dev; + + if (reset_gid_table(ibdev)) + return -1; + + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(dev, &V_ifnet, if_link) { + u8 port = mlx4_ib_get_dev_port(dev, ibdev); + if (port) { + if (!rdma_vlan_dev_real_dev(dev) && + !netif_is_bond_master(dev)) + mlx4_set_default_gid(ibdev, dev, port); + mlx4_ib_get_dev_addr(dev, ibdev, port); + } + } + + IFNET_RUNLOCK_NOSLEEP(); + + return 0; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, unsigned long event) +{ + struct mlx4_ib_iboe *iboe; + int port; + int init = 0; + unsigned long flags; + + iboe = &ibdev->iboe; + + spin_lock_irqsave(&iboe->lock, flags); + mlx4_foreach_ib_transport_port(port, ibdev->dev) { + struct net_device *old_netdev = iboe->netdevs[port - 1]; +/* XXX BOND related */ +#if 0 + struct net_device *old_master = iboe->masters[port - 1]; +#endif + iboe->masters[port - 1] = NULL; + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); + + + if (old_netdev != iboe->netdevs[port - 1]) + init = 1; + if (dev == iboe->netdevs[port - 1] && + event == NETDEV_CHANGEADDR) + init = 1; +/* XXX BOND related */ +#if 0 + if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1])) + iboe->masters[port - 1] = iboe->netdevs[port - 1]->master; + + /* if bonding is used it is possible that we add it to masters only after + IP address is assigned to the net bonding interface */ + if (old_master != iboe->masters[port - 1]) + init = 1; +#endif + } + + spin_unlock_irqrestore(&iboe->lock, flags); + + if (init) + if (mlx4_ib_init_gid_table(ibdev)) + pr_warn("Fail to reset gid table\n"); } 
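/*
 * Illustrative userspace sketch only -- not part of this patch, and the
 * helper names below are hypothetical.  It shows the two GID flavors the
 * new per-port table code installs: mlx4_make_default_gid() builds a
 * link-local GID (fe80::/64 plus an interface ID derived from the port
 * MAC), and mlx4_ib_get_dev_addr() adds one IPv4-mapped GID
 * (::ffff:a.b.c.d) per configured IPv4 address.  The interface-ID layout
 * assumes the usual modified-EUI-64 rules; the in-kernel
 * mlx4_addrconf_ifid_eui48() helper is authoritative.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

static void default_gid_from_mac(const uint8_t mac[6], uint8_t gid[16])
{
	memset(gid, 0, 16);
	gid[0] = 0xfe;			/* fe80::/64 link-local prefix */
	gid[1] = 0x80;
	memcpy(gid + 8, mac, 3);	/* OUI half of the MAC */
	gid[11] = 0xff;			/* ff:fe filler in the middle */
	gid[12] = 0xfe;
	memcpy(gid + 13, mac + 3, 3);	/* device half of the MAC */
	gid[8] ^= 0x02;			/* flip the universal/local bit */
}

static void v4mapped_gid(const char *ipv4, uint8_t gid[16])
{
	memset(gid, 0, 16);
	gid[10] = 0xff;			/* ::ffff:a.b.c.d */
	gid[11] = 0xff;
	inet_pton(AF_INET, ipv4, gid + 12);
}

int main(void)
{
	uint8_t mac[6] = { 0x00, 0x02, 0xc9, 0x12, 0x34, 0x56 };
	uint8_t gid[16];
	char buf[INET6_ADDRSTRLEN];

	default_gid_from_mac(mac, gid);
	printf("default GID:   %s\n", inet_ntop(AF_INET6, gid, buf, sizeof(buf)));

	v4mapped_gid("192.0.2.1", gid);
	printf("v4-mapped GID: %s\n", inet_ntop(AF_INET6, gid, buf, sizeof(buf)));
	return 0;
}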
static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, @@ -1514,43 +1899,44 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; - struct net_device *oldnd; - struct mlx4_ib_iboe *iboe; - int port; - -#ifdef __linux__ - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; -#endif ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); - iboe = &ibdev->iboe; - spin_lock(&iboe->lock); - mlx4_foreach_ib_transport_port(port, ibdev->dev) { - oldnd = iboe->netdevs[port - 1]; - iboe->netdevs[port - 1] = - mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); - if (oldnd != iboe->netdevs[port - 1]) { - if (iboe->netdevs[port - 1]) - netdev_added(ibdev, port); - else - netdev_removed(ibdev, port); - } - } - - if (dev == iboe->netdevs[0] || - (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) - handle_en_event(ibdev, 1, event); - else if (dev == iboe->netdevs[1] - || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) - handle_en_event(ibdev, 2, event); - - spin_unlock(&iboe->lock); + mlx4_ib_scan_netdevs(ibdev, dev, event); return NOTIFY_DONE; } +/* This function initializes the gid table only if the event_netdev real device is an iboe + * device, will be invoked by the inet/inet6 events */ +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *event_netdev = ptr; + struct mlx4_ib_dev *ibdev; + struct mlx4_ib_iboe *ibdev_iboe; + int port = 0; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? + rdma_vlan_dev_real_dev(event_netdev) : + event_netdev; + + ibdev_iboe = &ibdev->iboe; + + port = mlx4_ib_get_dev_port(real_dev, ibdev); + + /* Perform init_gid_table if the event real_dev is the net_device which represents this port, + * otherwise this event is not related and would be ignored.*/ + if(port && (real_dev == ibdev_iboe->netdevs[port - 1])) + if (mlx4_ib_init_gid_table(ibdev)) + pr_warn("Fail to reset gid table\n"); + + return NOTIFY_DONE; +} + + static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; @@ -1615,7 +2001,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { - snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, + sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, pci_get_domain(dev->pdev->dev.bsddev), pci_get_bus(dev->pdev->dev.bsddev), PCI_SLOT(dev->pdev->devfn), @@ -1779,89 +2165,61 @@ static struct attribute_group diag_counters_group = { .attrs = diag_rprt_attrs }; -#ifdef __linux__ -static int mlx4_ib_proc_init(void) -{ - /* Creating procfs directories /proc/drivers/mlx4_ib/ && - /proc/drivers/mlx4_ib/mrs for further use by the driver. 
- */ - int err; - - mlx4_ib_driver_dir_entry = proc_mkdir(MLX4_IB_DRIVER_PROC_DIR_NAME, - NULL); - if (!mlx4_ib_driver_dir_entry) { - pr_err("mlx4_ib_proc_init has failed for %s\n", - MLX4_IB_DRIVER_PROC_DIR_NAME); - err = -ENODEV; - goto error; - } - - mlx4_mrs_dir_entry = proc_mkdir(MLX4_IB_MRS_PROC_DIR_NAME, - mlx4_ib_driver_dir_entry); - if (!mlx4_mrs_dir_entry) { - pr_err("mlx4_ib_proc_init has failed for %s\n", - MLX4_IB_MRS_PROC_DIR_NAME); - err = -ENODEV; - goto remove_entry; - } - - return 0; - -remove_entry: - remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, - NULL); -error: - return err; -} -#endif - static void init_dev_assign(void) { - int bus, slot, fn, ib_idx; - char *p = dev_assign_str, *t; - char curr_val[32] = {0}; - int ret; - int j, i = 0; - - memset(dr, 0, sizeof dr); - - if (dev_assign_str[0] == 0) + int i = 1; + + spin_lock_init(&dev_num_str_lock); + if (mlx4_fill_dbdf2val_tbl(&dev_assign_str)) return; - - while (strlen(p)) { - ret = sscanf(p, "%02x:%02x.%x-%x", &bus, &slot, &fn, &ib_idx); - if (ret != 4 || ib_idx < 0) + dev_num_str_bitmap = + kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long), + GFP_KERNEL); + if (!dev_num_str_bitmap) { + pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n"); + return; + } + bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP); + while ((i < MLX4_DEVS_TBL_SIZE) && (dev_assign_str.tbl[i].dbdf != + MLX4_ENDOF_TBL)) { + if (bitmap_allocate_region(dev_num_str_bitmap, + dev_assign_str.tbl[i].val[0], 0)) goto err; + i++; + } + dr_active = 1; + return; - for (j = 0; j < i; j++) - if (dr[j].nr == ib_idx) - goto err; +err: + kfree(dev_num_str_bitmap); + dev_num_str_bitmap = NULL; + pr_warn("mlx4_ib: The value of 'dev_assign_str' parameter " + "is incorrect. The parameter value is discarded!"); +} - dr[i].bus = bus; - dr[i].dev = slot; - dr[i].func = fn; - dr[i].nr = ib_idx; +static int mlx4_ib_dev_idx(struct mlx4_dev *dev) +{ + int i, val; - t = strchr(p, ','); - sprintf(curr_val, "%02x:%02x.%x-%x", bus, slot, fn, ib_idx); - if ((!t) && strlen(p) == strlen(curr_val)) - return; + if (!dr_active) + return -1; + if (!dev) + return -1; + if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val)) + return -1; - if (!t || (t + 1) >= dev_assign_str + sizeof dev_assign_str) - goto err; - - ++i; - if (i >= MAX_DR) - goto err; - - p = t + 1; + if (val != DEFAULT_TBL_VAL) { + dev->flags |= MLX4_FLAG_DEV_NUM_STR; + return val; } - return; -err: - memset(dr, 0, sizeof dr); - printk(KERN_WARNING "mlx4_ib: The value of 'dev_assign_str' parameter " - "is incorrect. 
The parameter value is discarded!"); + spin_lock(&dev_num_str_lock); + i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0); + spin_unlock(&dev_num_str_lock); + if (i >= 0) + return i; + + return -1; } static void *mlx4_ib_add(struct mlx4_dev *dev) @@ -1871,8 +2229,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) int i, j; int err; struct mlx4_ib_iboe *iboe; + int dev_idx; - printk(KERN_INFO "%s", mlx4_ib_version); + pr_info_once("%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; @@ -1905,7 +2264,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; + dev_idx = mlx4_ib_dev_idx(dev); + if (dev_idx >= 0) + sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); + else strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; @@ -1942,10 +2306,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | - (1ull << IB_USER_VERBS_CMD_OPEN_QP) | - (1ull << IB_USER_VERBS_CMD_ATTACH_FLOW) | - (1ull << IB_USER_VERBS_CMD_DETACH_FLOW) | - (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; @@ -1957,7 +2318,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; -#ifdef __linux__ +/* XXX FBSD has no support for get_unmapped_area function */ +#if 0 ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; @@ -1990,9 +2352,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; - ibdev->ib_dev.attach_flow = mlx4_ib_flow_attach; - ibdev->ib_dev.detach_flow = mlx4_ib_flow_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + ibdev->ib_dev.ioctl = mlx4_ib_ioctl; + ibdev->ib_dev.query_values = mlx4_ib_query_values; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; @@ -2001,6 +2363,16 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; @@ -2009,6 +2381,29 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + /* + * Set experimental data + */ + ibdev->ib_dev.uverbs_exp_cmd_mask = + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) | + (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ); + ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp; + ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device; + if (check_flow_steering_support(dev)) { + 
ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + } else { + pr_debug("Device managed flow steering is unavailable for this configuration.\n"); + } + /* + * End of experimental data + */ + mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); @@ -2019,18 +2414,29 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) for (i = 0; i < ibdev->num_ports; ++i) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { - err = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i]); - if (err) - ibdev->counters[i] = -1; - } else - ibdev->counters[i] = -1; + if (mlx4_is_slave(dev)) { + ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev, + i + 1, + &ibdev->counters[i].counter_index); + } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */ + ibdev->counters[i].counter_index = ((i + 1) << 1) - 1; + ibdev->counters[i].status = 0; + } + + dev_info(&dev->pdev->dev, + "%s: allocated counter index %d for port %d\n", + __func__, ibdev->counters[i].counter_index, i+1); + } else { + ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX; + ibdev->counters[i].status = -ENOSPC; + } } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && - !mlx4_is_slave(dev)) { + !mlx4_is_mfunc(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); @@ -2063,20 +2469,32 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_init_sriov(ibdev)) goto err_mad; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); - if (err) - goto err_sriov; + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notify; + } + } + if (!iboe->nb_inet.notifier_call) { + iboe->nb_inet.notifier_call = mlx4_ib_inet_event; + err = register_inetaddr_notifier(&iboe->nb_inet); + if (err) { + iboe->nb_inet.notifier_call = NULL; + goto err_notify; + } + } + mlx4_ib_scan_netdevs(ibdev, NULL, 0); } - for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) - goto err_notif; + goto err_notify; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) - goto err_notif; + goto err_notify; ibdev->ib_active = true; @@ -2094,12 +2512,24 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) } return ibdev; -err_notif: +err_notify: + for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { + device_remove_file(&ibdev->ib_dev.dev, + mlx4_class_attributes[j]); + } + + if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } flush_workqueue(wq); -err_sriov: mlx4_ib_close_sriov(ibdev); err_mad: @@ -2116,9 +2546,14 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, 
ibdev->steer_qpn_count); err_counter: - for (; i; --i) - if (ibdev->counters[i - 1] != -1) - mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1]); + for (; i; --i) { + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i) == + IB_LINK_LAYER_ETHERNET) { + mlx4_counter_free(ibdev->dev, + i, + ibdev->counters[i - 1].counter_index); + } + } err_map: iounmap(ibdev->priv_uar.map); @@ -2167,30 +2602,71 @@ void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { - struct ib_flow_spec spec = { - .type = IB_FLOW_IB_UC, - .l2_id.ib_uc.qpn = mqp->ibqp.qp_num, - }; + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; - return is_attach ? - __mlx4_ib_flow_attach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0) - : __mlx4_ib_flow_detach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0); + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num; + ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK; + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; - int p,j; + int p, j; + int dev_idx, ret; + + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { - device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); + device_remove_file(&ibdev->ib_dev.dev, + mlx4_class_attributes[j]); } + + dev_idx = -1; + if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) { + ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx); + if (ret != 1) + dev_idx = -1; + } ib_unregister_device(&ibdev->ib_dev); + if (dev_idx >= 0) { + spin_lock(&dev_num_str_lock); + bitmap_release_region(dev_num_str_bitmap, dev_idx, 0); + spin_unlock(&dev_num_str_lock); + } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, @@ -2204,9 +2680,16 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); - for (p = 0; p < ibdev->num_ports; ++p) - if (ibdev->counters[p] != -1) - mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p]); + + for (p = 0; p < ibdev->num_ports; ++p) { + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) == + IB_LINK_LAYER_ETHERNET) { + mlx4_counter_free(ibdev->dev, + p + 1, + ibdev->counters[p].counter_index); + } + } + mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); @@ -2355,12 +2838,6 @@ static int __init mlx4_ib_init(void) if (!wq) return -ENOMEM; -#ifdef __linux__ - err = mlx4_ib_proc_init(); - if (err) - goto clean_wq; -#endif - err = mlx4_ib_mcg_init(); if (err) goto 
clean_proc; @@ -2377,13 +2854,6 @@ static int __init mlx4_ib_init(void) mlx4_ib_mcg_destroy(); clean_proc: -#ifdef __linux__ - remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, - mlx4_ib_driver_dir_entry); - remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); - -clean_wq: -#endif destroy_workqueue(wq); return err; } @@ -2394,13 +2864,7 @@ static void __exit mlx4_ib_cleanup(void) mlx4_ib_mcg_destroy(); destroy_workqueue(wq); - /* Remove proc entries */ -#ifdef __linux__ - remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, - mlx4_ib_driver_dir_entry); - remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); -#endif - + kfree(dev_num_str_bitmap); } module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); @@ -2417,7 +2881,7 @@ static moduledata_t mlx4ib_mod = { .evhand = mlx4ib_evhand, }; -DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); +DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxapi, 1, 1, 1); diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c b/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c index e70dfe9f4ab5..207db3c529b7 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c @@ -36,6 +36,7 @@ #include #include +#include #include #include "mlx4_ib.h" @@ -53,6 +54,7 @@ #define mcg_error_group(group, format, arg...) \ pr_err(" %16s: " format, (group)->name, ## arg) + static union ib_gid mgid0; static struct workqueue_struct *clean_wq; @@ -214,7 +216,7 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); spin_unlock(&dev->sm_lock); return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, - IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); + IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, 0, mad); } static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, @@ -567,7 +569,7 @@ static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); group->state = MCAST_IDLE; atomic_inc(&group->refcount); - queue_work(group->demux->mcg_wq, &group->work); + if (!queue_work(group->demux->mcg_wq, &group->work)) safe_atomic_dec(&group->refcount); mutex_unlock(&group->lock); @@ -656,8 +658,9 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work) method = group->response_sa_mad.mad_hdr.method; if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. 
Resp TID=%llx, group TID=%llx\n", - (long long unsigned int)be64_to_cpu(group->response_sa_mad.mad_hdr.tid), - (long long unsigned int)be64_to_cpu(group->last_req_tid)); + (long long)be64_to_cpu( + group->response_sa_mad.mad_hdr.tid), + (long long)be64_to_cpu(group->last_req_tid)); group->state = group->prev_state; goto process_requests; } @@ -665,7 +668,7 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work) if (!list_empty(&group->pending_list)) req = list_first_entry(&group->pending_list, struct mcast_req, group_list); - if (method == IB_MGMT_METHOD_GET_RESP) { + if ((method == IB_MGMT_METHOD_GET_RESP)) { if (req) { send_reply_to_slave(req->func, group, &req->sa_mad, status); --group->func[req->func].num_pend_reqs; @@ -752,8 +755,8 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { group->rec.mgid = *new_mgid; sprintf(group->name, "%016llx%016llx", - (long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix), - (long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id)); + (long long)be64_to_cpu(group->rec.mgid.global.subnet_prefix), + (long long)be64_to_cpu(group->rec.mgid.global.interface_id)); list_del_init(&group->mgid0_list); cur_group = mcast_insert(ctx, group); if (cur_group) { @@ -834,8 +837,10 @@ static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); mutex_init(&group->lock); sprintf(group->name, "%016llx%016llx", - (long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix), - (long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id)); + (long long)be64_to_cpu( + group->rec.mgid.global.subnet_prefix), + (long long)be64_to_cpu( + group->rec.mgid.global.interface_id)); sysfs_attr_init(&group->dentry.attr); group->dentry.show = sysfs_show_group; group->dentry.store = NULL; @@ -871,7 +876,7 @@ static void queue_req(struct mcast_req *req) list_add_tail(&req->group_list, &group->pending_list); list_add_tail(&req->func_list, &group->func[req->func].pending); /* calls mlx4_ib_mcg_work_handler */ - queue_work(group->demux->mcg_wq, &group->work); + if (!queue_work(group->demux->mcg_wq, &group->work)) safe_atomic_dec(&group->refcount); } @@ -907,7 +912,7 @@ int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, group->state = MCAST_RESP_READY; /* calls mlx4_ib_mcg_work_handler */ atomic_inc(&group->refcount); - queue_work(ctx->mcg_wq, &group->work); + if (!queue_work(ctx->mcg_wq, &group->work)) safe_atomic_dec(&group->refcount); mutex_unlock(&group->lock); release_group(group, 0); @@ -998,13 +1003,14 @@ static ssize_t sysfs_show_group(struct device *dev, else sprintf(state_str, "%s(TID=0x%llx)", get_state_string(group->state), - (long long unsigned int)be64_to_cpu(group->last_req_tid)); + (long long)be64_to_cpu(group->last_req_tid)); if (list_empty(&group->pending_list)) { sprintf(pending_str, "No"); } else { req = list_first_entry(&group->pending_list, struct mcast_req, group_list); sprintf(pending_str, "Yes(TID=0x%llx)", - (long long unsigned int)be64_to_cpu(req->sa_mad.mad_hdr.tid)); + (long long)be64_to_cpu( + req->sa_mad.mad_hdr.tid)); } len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", group->rec.scope_join_state & 0xf, diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c new file mode 100644 index 000000000000..b6a6962addf6 --- /dev/null +++ 
b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx4_ib.h" +#include "mlx4_exp.h" +#include + +int mlx4_ib_exp_query_device(struct ib_device *ibdev, + struct ib_exp_device_attr *props) +{ + struct ib_device_attr *base = &props->base; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int ret = mlx4_ib_query_device(ibdev, &props->base); + + props->exp_comp_mask = IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; + props->inline_recv_sz = dev->dev->caps.max_rq_sg * sizeof(struct mlx4_wqe_data_seg); + props->device_cap_flags2 = 0; + + /* move RSS device cap from device_cap to device_cap_flags2 */ + if (base->device_cap_flags & IB_DEVICE_QPG) { + props->device_cap_flags2 |= IB_EXP_DEVICE_QPG; + if (base->device_cap_flags & IB_DEVICE_UD_RSS) + props->device_cap_flags2 |= IB_EXP_DEVICE_UD_RSS; + } + base->device_cap_flags &= ~(IB_DEVICE_QPG | + IB_DEVICE_UD_RSS | + IB_DEVICE_UD_TSS); + + if (base->max_rss_tbl_sz > 0) { + props->max_rss_tbl_sz = base->max_rss_tbl_sz; + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; + } else { + props->max_rss_tbl_sz = 0; + props->exp_comp_mask &= ~IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; + } + + if (props->device_cap_flags2) + props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; + + return ret; +} + +/* + * Experimental functions + */ +struct ib_qp *mlx4_ib_exp_create_qp(struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + int rwqe_size; + struct ib_qp *qp; + struct mlx4_ib_qp *mqp; + int use_inlr; + struct mlx4_ib_dev *dev; + + if (init_attr->max_inl_recv && !udata) + return ERR_PTR(-EINVAL); + + use_inlr = mlx4_ib_qp_has_rq((struct ib_qp_init_attr *)init_attr) && + init_attr->max_inl_recv && pd; + if (use_inlr) { + rwqe_size = roundup_pow_of_two(max(1U, init_attr->cap.max_recv_sge)) * + sizeof(struct mlx4_wqe_data_seg); + if (rwqe_size < init_attr->max_inl_recv) { + dev = to_mdev(pd->device); + init_attr->max_inl_recv = min(init_attr->max_inl_recv, + (u32)(dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg))); + init_attr->cap.max_recv_sge = 
roundup_pow_of_two(init_attr->max_inl_recv) / + sizeof(struct mlx4_wqe_data_seg); + } + } else { + init_attr->max_inl_recv = 0; + } + qp = mlx4_ib_create_qp(pd, (struct ib_qp_init_attr *)init_attr, udata); + if (IS_ERR(qp)) + return qp; + + if (use_inlr) { + mqp = to_mqp(qp); + mqp->max_inlr_data = 1 << mqp->rq.wqe_shift; + init_attr->max_inl_recv = mqp->max_inlr_data; + } + + return qp; +} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h new file mode 100644 index 000000000000..58675a4add73 --- /dev/null +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_EXP_H +#define MLX4_EXP_H + +#include +#include "mlx4_ib.h" + +struct ib_qp *mlx4_ib_exp_create_qp(struct ib_pd *pd, + struct ib_exp_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_exp_query_device(struct ib_device *ibdev, + struct ib_exp_device_attr *props); + +#endif /* MLX4_EXP_H */ diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h index 2435df5cc52f..ddf523648325 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -47,7 +48,6 @@ #include #include -#include #define MLX4_IB_DRV_NAME "mlx4_ib" @@ -72,9 +72,7 @@ enum { /*module param to indicate if SM assigns the alias_GUID*/ extern int mlx4_ib_sm_guid_assign; -#ifdef __linux__ extern struct proc_dir_entry *mlx4_mrs_dir_entry; -#endif #define MLX4_IB_UC_STEER_QPN_ALIGN 1 #define MLX4_IB_UC_MAX_NUM_QPS (256 * 1024) @@ -128,6 +126,7 @@ struct mlx4_ib_cq { struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; + int create_flags; }; struct mlx4_ib_mr { @@ -135,6 +134,13 @@ struct mlx4_ib_mr { struct mlx4_mr mmr; struct ib_umem *umem; struct mlx4_shared_mr_info *smr_info; + atomic_t invalidated; + struct completion invalidation_comp; +}; + +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; }; struct mlx4_ib_fast_reg_page_list { @@ -148,6 +154,12 @@ struct mlx4_ib_fmr { struct mlx4_fmr mfmr; }; +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + u64 reg_id[2]; +}; + struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; @@ -163,6 +175,9 @@ struct mlx4_ib_wq { enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_CAP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX4_IB_QP_CAP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX4_IB_QP_CAP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, @@ -179,6 +194,7 @@ enum mlx4_ib_mmap_cmd { MLX4_IB_MMAP_UAR_PAGE = 0, MLX4_IB_MMAP_BLUE_FLAME_PAGE = 1, MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES = 2, + MLX4_IB_MMAP_GET_HW_CLOCK = 3, }; enum mlx4_ib_qp_type { @@ -319,8 +335,14 @@ struct mlx4_ib_qp { struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; struct list_head rules_list; + u64 reg_id; int max_inline_data; struct mlx4_bf bf; + + /* + * Experimental data + */ + int max_inlr_data; }; struct mlx4_ib_srq { @@ -354,6 +376,12 @@ struct mlx4_ib_ah { #define MLX4_NOT_SET_GUID (0x00LL) #define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) +/****************************************/ +/* ioctl codes */ +/****************************************/ +#define MLX4_IOC_MAGIC 'm' +#define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int) + enum mlx4_guid_alias_rec_status { MLX4_GUID_INFO_STATUS_IDLE, MLX4_GUID_INFO_STATUS_SET, @@ -478,7 +506,9 @@ struct mlx4_ib_sriov { struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; + struct net_device *masters[MLX4_MAX_PORTS]; struct notifier_block nb; + struct notifier_block nb_inet; union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; @@ -518,6 +548,11 @@ struct mlx4_ib_iov_port { struct mlx4_ib_iov_sysfs_attr mcg_dentry; }; +struct mlx4_ib_counter { + int counter_index; + int status; +}; + struct mlx4_ib_dev { struct ib_device 
ib_dev; struct mlx4_dev *dev; @@ -534,7 +569,7 @@ struct mlx4_ib_dev { struct mutex cap_mask_mutex; bool ib_active; struct mlx4_ib_iboe iboe; - int counters[MLX4_MAX_PORTS]; + struct mlx4_ib_counter counters[MLX4_MAX_PORTS]; int *eq_table; int eq_added; struct kobject *iov_parent; @@ -595,6 +630,11 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx4_ib_mr, ibmr); } +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) { return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); @@ -604,6 +644,12 @@ static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); } + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct mlx4_ib_flow, ibflow); +} + static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); @@ -646,16 +692,23 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata, int mr_id); int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len); void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); -int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx4_ib_modify_cq(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq); -struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, + struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int mlx4_ib_destroy_cq(struct ib_cq *cq); @@ -730,6 +783,13 @@ static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) return !!(ah->av.ib.g_slid & 0x80); } +static inline int mlx4_ib_qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); @@ -757,7 +817,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, struct ib_grh *grh, struct ib_mad *mad); int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, struct ib_mad *mad); __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, @@ -799,5 +859,7 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp 
*mqp, int is_attach); +int mlx4_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props); #endif /* MLX4_IB_H */ diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c index 9ea49011d6b1..61c20886ba4d 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c @@ -35,11 +35,6 @@ #include #include -#ifdef __linux__ -#include -#include -#endif - #include "mlx4_ib.h" static u32 convert_access(int acc) @@ -48,9 +43,11 @@ static u32 convert_access(int acc) (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | MLX4_PERM_LOCAL_READ; } -#ifdef __linux__ +/* No suuport for Shared MR feature */ +#if 0 static ssize_t shared_mr_proc_read(struct file *file, char __user *buffer, size_t len, @@ -129,7 +126,7 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) return &mr->ibmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_free: kfree(mr); @@ -159,7 +156,7 @@ static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, if (len & (mtt_size-1ULL)) { WARN(1 , "write_block: len %llx is not aligned to mtt_size %llx\n", - (long long)len, (long long)mtt_size); + (unsigned long long)len, (unsigned long long)mtt_size); return -EINVAL; } @@ -203,8 +200,6 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { u64 *pages; - struct ib_umem_chunk *chunk; - int j; u64 len = 0; int err = 0; u64 mtt_size; @@ -212,6 +207,8 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, u64 mtt_shift; int start_index = 0; int npages = 0; + struct scatterlist *sg; + int i; pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) @@ -220,12 +217,11 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, mtt_shift = mtt->page_shift; mtt_size = 1ULL << mtt_shift; - list_for_each_entry(chunk, &umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { if (cur_start_addr + len == - sg_dma_address(&chunk->page_list[j])) { + sg_dma_address(sg)) { /* still the same block */ - len += sg_dma_len(&chunk->page_list[j]); + len += sg_dma_len(sg); continue; } /* A new block is started ...*/ @@ -242,8 +238,8 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, goto out; cur_start_addr = - sg_dma_address(&chunk->page_list[j]); - len = sg_dma_len(&chunk->page_list[j]); + sg_dma_address(sg); + len = sg_dma_len(sg); } /* Handle the last block */ @@ -319,8 +315,6 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int *num_of_mtts) { - struct ib_umem_chunk *chunk; - int j; u64 block_shift = MLX4_MAX_MTT_SHIFT; u64 current_block_len = 0; u64 current_block_start = 0; @@ -330,14 +324,18 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 total_len = 0; u64 last_block_aligned_end = 0; u64 min_shift = ilog2(umem->page_size); + struct scatterlist *sg; + int i; + u64 next_block_start; + u64 current_block_end; - list_for_each_entry(chunk, &umem->chunk_list, list) { + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { /* Initialization - save the first chunk start as the current_block_start - block means contiguous pages. 
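The rewritten mlx4_ib_umem_write_mtt above iterates the umem scatterlist with for_each_sg instead of the old chunk list and merges DMA segments whose addresses are contiguous before writing MTT entries. A self-contained sketch of that coalescing loop over plain address/length pairs, with the MTT write reduced to a printout:

#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t dma_addr; uint64_t len; };   /* stand-in for a mapped sg entry */

/* stand-in for the block write: just report each merged block */
static void write_block(uint64_t start, uint64_t len)
{
	printf("block at 0x%llx, len 0x%llx\n",
	       (unsigned long long)start, (unsigned long long)len);
}

static void coalesce_and_write(const struct seg *sg, int nmap)
{
	uint64_t cur_start = 0, len = 0;
	int i;

	for (i = 0; i < nmap; i++) {
		if (len && cur_start + len == sg[i].dma_addr) {
			/* still the same physically contiguous block */
			len += sg[i].len;
			continue;
		}
		if (len)                       /* flush the previous block */
			write_block(cur_start, len);
		cur_start = sg[i].dma_addr;    /* a new block is started */
		len = sg[i].len;
	}
	if (len)                               /* handle the last block */
		write_block(cur_start, len);
}

int main(void)
{
	struct seg segs[] = {
		{ 0x100000, 0x1000 }, { 0x101000, 0x1000 },  /* contiguous pair */
		{ 0x200000, 0x2000 },                        /* separate block  */
	};

	coalesce_and_write(segs, 3);
	return 0;
}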
*/ if (current_block_len == 0 && current_block_start == 0) { first_block_start = current_block_start = - sg_dma_address(&chunk->page_list[0]); + sg_dma_address(sg); /* Find the bits that are different between the physical address and the virtual address for the start of the MR. @@ -361,13 +359,12 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, , block_shift); } - /* Go over the scatter entries in the current chunk, check + /* Go over the scatter entries and check if they continue the previous scatter entry. */ - for (j = 0; j < chunk->nmap; ++j) { - u64 next_block_start = - sg_dma_address(&chunk->page_list[j]); - u64 current_block_end = current_block_start + next_block_start = + sg_dma_address(sg); + current_block_end = current_block_start + current_block_len; /* If we have a split (non-contig.) between two block*/ if (current_block_end != next_block_start) { @@ -392,7 +389,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, /* Start a new block */ current_block_start = next_block_start; current_block_len = - sg_dma_len(&chunk->page_list[j]); + sg_dma_len(sg); continue; } /* The scatter entry is another part of @@ -402,8 +399,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, which merge some blocks together. */ current_block_len += - sg_dma_len(&chunk->page_list[j]); - } + sg_dma_len(sg); } /* Account for the last block in the total len */ @@ -416,7 +412,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, WARN((total_len & ((1ULL<> block_shift; end: @@ -426,16 +422,19 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, */ WARN(1, "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n", - (long long)block_shift); + (unsigned long long)block_shift); block_shift = min_shift; } return block_shift; + } -#ifdef __linux__ +/* No suuport for Shared MR */ +#if 0 static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id) { + struct proc_dir_entry *mr_proc_entry; mode_t mode = S_IFREG; char name_buff[16]; @@ -475,8 +474,51 @@ static int is_shared_mr(int access_flags) IB_ACCESS_SHARED_MR_OTHER_WRITE)); } + +static void free_smr_info(struct mlx4_ib_mr *mr) +{ + /* When master/parent shared mr is dereged there is + no ability to share this mr any more - its mr_id will be + returned to the kernel as part of ib_uverbs_dereg_mr + and may be allocated again as part of other reg_mr. + */ + char name_buff[16]; + + sprintf(name_buff, "%X", mr->smr_info->mr_id); + /* Remove proc entry is checking internally that no operation + was strated on that proc fs file and if in the middle + current process will wait till end of operation. + That's why no sync mechanism is needed when we release + below the shared umem. 
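mlx4_ib_umem_calc_optimal_mtt_size above chooses the largest MTT block shift that divides every boundary the hardware will see: the VA/PA skew at the start of the MR, the end of each contiguous block, and the total length, never dropping below the page shift. A compact sketch of that alignment rule using hypothetical shift limits:

#include <stdint.h>
#include <stdio.h>

#define MAX_MTT_SHIFT 31u   /* stand-in for MLX4_MAX_MTT_SHIFT */
#define MIN_SHIFT     12u   /* stand-in for ilog2(page_size)   */

/* largest shift s such that (1 << s) divides x, capped at MAX_MTT_SHIFT */
static unsigned alignment_shift(uint64_t x)
{
	unsigned s = 0;

	if (!x)
		return MAX_MTT_SHIFT;
	while (!(x & 1) && s < MAX_MTT_SHIFT) {
		x >>= 1;
		s++;
	}
	return s;
}

/*
 * The block size must divide the start-of-MR VA/PA skew, every block
 * boundary, and the total length; otherwise shrink the shift.
 */
static unsigned optimal_shift(uint64_t start_va, uint64_t first_block_pa,
			      const uint64_t *block_ends, int nblocks,
			      uint64_t total_len)
{
	unsigned shift = alignment_shift(start_va ^ first_block_pa);
	int i;

	for (i = 0; i < nblocks; i++)
		if (alignment_shift(block_ends[i]) < shift)
			shift = alignment_shift(block_ends[i]);
	if (alignment_shift(total_len) < shift)
		shift = alignment_shift(total_len);
	return shift < MIN_SHIFT ? MIN_SHIFT : shift;
}

int main(void)
{
	uint64_t ends[] = { 0x240000 };   /* hypothetical block boundary */

	printf("shift=%u\n",
	       optimal_shift(0x7f0000200000ULL, 0x100200000ULL, ends, 1, 0x40000));
	return 0;
}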
+ */ + remove_proc_entry(name_buff, mlx4_mrs_dir_entry); + kfree(mr->smr_info); + mr->smr_info = NULL; +} #endif +static void mlx4_invalidate_umem(void *invalidation_cookie, + struct ib_umem *umem, + unsigned long addr, size_t size) +{ + struct mlx4_ib_mr *mr = (struct mlx4_ib_mr *)invalidation_cookie; + + /* This function is called under client peer lock so its resources are race protected */ + if (atomic_inc_return(&mr->invalidated) > 1) { + umem->invalidation_ctx->inflight_invalidation = 1; + goto end; + } + + umem->invalidation_ctx->peer_callback = 1; + mlx4_mr_free(to_mdev(mr->ibmr.device)->dev, &mr->mmr); + ib_umem_release(umem); + complete(&mr->invalidation_comp); + +end: + return; + +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata, @@ -487,18 +529,20 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int shift; int err; int n; + struct ib_peer_memory_client *ib_peer_mem; mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags, 0); + mr->umem = ib_umem_get_ex(pd->uobject->context, start, length, + access_flags, 0, 1); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; } + ib_peer_mem = mr->umem->ib_peer_mem; n = ib_umem_page_count(mr->umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); @@ -516,7 +560,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; -#ifdef __linux__ +/* No suuport for Shared MR */ +#if 0 /* Check whether MR should be shared */ if (is_shared_mr(access_flags)) { /* start address and length must be aligned to page size in order @@ -531,10 +576,32 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_mr; } #endif + if (ib_peer_mem) { + if (access_flags & IB_ACCESS_MW_BIND) { + /* Prevent binding MW on peer clients. + * mlx4_invalidate_umem must be void, + * therefore, mlx4_mr_free should not fail + * when using peer clients. */ + err = -ENOSYS; + pr_err("MW is not supported with peer memory client"); + goto err_smr; + } + init_completion(&mr->invalidation_comp); + ib_umem_activate_invalidation_notifier(mr->umem, + mlx4_invalidate_umem, mr); + } + + atomic_set(&mr->invalidated, 0); return &mr->ibmr; +err_smr: +/* No suuport for Shared MR */ +#if 0 + if (mr->smr_info) + free_smr_info(mr); +#endif err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); err_umem: ib_umem_release(mr->umem); @@ -545,41 +612,106 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(err); } - int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); + struct ib_umem *umem = mr->umem; + int ret; - mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); - if (mr->smr_info) { - /* When master/parent shared mr is dereged there is - no ability to share this mr any more - its mr_id will be - returned to the kernel as part of ib_uverbs_dereg_mr - and may be allocated again as part of other reg_mr. - */ - char name_buff[16]; - - sprintf(name_buff, "%X", mr->smr_info->mr_id); - /* Remove proc entry is checking internally that no operation - was strated on that proc fs file and if in the middle - current process will wait till end of operation. - That's why no sync mechanism is needed when we release - below the shared umem. 
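The new mlx4_invalidate_umem callback and the dereg path that follows coordinate through an atomic counter plus a completion: whichever side increments the counter second either waits for, or is handed, the teardown, so the MR's hardware state is freed exactly once. A stand-alone sketch of that handshake with C11 atomics standing in for the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* minimal stand-ins for the MR state used by the handshake */
struct fake_mr {
	atomic_int  invalidated;       /* stand-in for mr->invalidated        */
	atomic_bool teardown_done;     /* stand-in for mr->invalidation_comp  */
};

/* peer-memory invalidation callback: tears down HW state only if it ran first */
static void invalidate(struct fake_mr *mr)
{
	if (atomic_fetch_add(&mr->invalidated, 1) + 1 > 1)
		return;                             /* dereg got there first */
	/* ... free the HW MR and release the umem here ... */
	atomic_store(&mr->teardown_done, true);     /* complete() */
}

/* dereg path: tears down HW state unless the callback already did */
static void dereg(struct fake_mr *mr)
{
	if (atomic_fetch_add(&mr->invalidated, 1) + 1 > 1) {
		while (!atomic_load(&mr->teardown_done))
			;                           /* wait_for_completion() */
		return;
	}
	/* ... free the HW MR and release the umem here ... */
}

int main(void)
{
	struct fake_mr mr = { 0, false };

	invalidate(&mr);   /* the callback wins the race ... */
	dereg(&mr);        /* ... so dereg only waits        */
	printf("invalidated counter = %d\n", atomic_load(&mr.invalidated));
	return 0;
}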
- */ -#ifdef __linux__ - remove_proc_entry(name_buff, mlx4_mrs_dir_entry); - kfree(mr->smr_info); +/* No suuport for Shared MR */ +#if 0 + if (mr->smr_info) + free_smr_info(mr); #endif + + if (atomic_inc_return(&mr->invalidated) > 1) { + wait_for_completion(&mr->invalidation_comp); + goto end; } - if (mr->umem) + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) { + /* Error is not expected here, except when memory windows + * are bound to MR which is not supported with + * peer memory clients */ + atomic_set(&mr->invalidated, 0); + return ret; + } + + if (!umem) + goto end; + ib_umem_release(mr->umem); +end: kfree(mr); return 0; } +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, (enum mlx4_mw_type)type, &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + int ret; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_BIND_MW; + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + wr.wr.bind_mw.mw = mw; + wr.wr.bind_mw.bind_info = mw_bind->bind_info; + wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey); + + ret = mlx4_ib_post_send(qp, &wr, &bad_wr); + if (!ret) + mw->rkey = wr.wr.bind_mw.rkey; + + return ret; +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) { @@ -606,7 +738,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, return &mr->ibmr; err_mr: - mlx4_mr_free(dev->dev, &mr->mmr); + (void) mlx4_mr_free(dev->dev, &mr->mmr); err_free: kfree(mr); @@ -685,7 +817,7 @@ struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, return &fmr->ibfmr; err_mr: - mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); err_free: kfree(fmr); diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c index c5ebe6bac8c1..b3d969514b7b 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c @@ -45,13 +45,11 @@ #include #include -#ifndef __linux__ -#define asm __asm -#endif - #include "mlx4_ib.h" #include "user.h" +#define asm __asm + enum { MLX4_IB_ACK_REQ_FREQ = 8, }; @@ -111,6 +109,8 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), + [IB_WR_BIND_MW] = cpu_to_be32( + MLX4_OPCODE_BIND_MW), }; #ifndef wc_wmb @@ -263,7 +263,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) /* Pad the remainder of the WQE with an inline data segment. 
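mlx4_ib_bind_mw above performs the window bind by posting an IB_WR_BIND_MW work request and advancing the rkey with ib_inc_rkey, which bumps only the low "tag" byte so the key still refers to the same MW index. A tiny sketch of that key update, assuming the 8-bit-variant behavior described here:

#include <stdint.h>
#include <stdio.h>

/*
 * Increment the variant (low) byte of an rkey while keeping the
 * index bits intact -- the property the bind path relies on.
 */
static uint32_t inc_rkey(uint32_t rkey)
{
	const uint32_t mask = 0x000000ff;

	return (rkey & ~mask) | ((rkey + 1) & mask);
}

int main(void)
{
	uint32_t rkey = 0x001234ff;
	uint32_t new_rkey = inc_rkey(rkey);

	/* the MW index (upper 24 bits) is unchanged; only the tag wraps */
	printf("old 0x%08x -> new 0x%08x\n", rkey, new_rkey);
	return 0;
}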
*/ if (size > s) { inl = wqe + s; - inl->byte_count = cpu_to_be32(1U << 31 | (size - s - sizeof *inl)); + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); } ctrl->srcrb_flags = 0; ctrl->fence_size = size / 16; @@ -274,7 +274,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) wmb(); ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | - (n & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0); + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); } @@ -573,6 +573,12 @@ static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(dev, + qp->sqp_proxy_rcv[i].map))) { + pr_warn("ib_dma_map_single failed\n"); + kfree(qp->sqp_proxy_rcv[i].addr); + goto err; + } } return 0; @@ -602,15 +608,6 @@ static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) kfree(qp->sqp_proxy_rcv); } -static int qp_has_rq(struct ib_qp_init_attr *attr) -{ - if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) - return 0; - - return !attr->srq; -} - -#ifdef __linux__ static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp, struct ib_qp_init_attr *attr, int *qpn) { @@ -644,7 +641,7 @@ static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp, err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base); else err = mlx4_qp_reserve_range(dev->dev, tss_align_num, - tss_align_num, &tss_base, 1); + tss_align_num, &tss_base, MLX4_RESERVE_BF_QP); if (err) goto err1; @@ -791,7 +788,6 @@ static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn) break; } } -#endif static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, struct ib_qp_init_attr *attr, int *qpn) @@ -800,10 +796,12 @@ static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, switch (attr->qpg_type) { case IB_QPG_NONE: - /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE - * BlueFlame setup flow wrongly causes VLAN insertion. */ + /* Raw packet QPNs may not have bits 6,7 set in their qp_num; + * otherwise, the WQE BlueFlame setup flow wrongly causes + * VLAN insertion. 
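The alloc_proxy_bufs hunk above adds the check that was previously missing: each ib_dma_map_single result is validated with ib_dma_mapping_error, and on failure the buffer just allocated is freed before the earlier ones are unwound. A generic sketch of that map/check/unwind pattern with hypothetical mapping helpers:

#include <stdint.h>
#include <stdlib.h>

#define NBUF  8
#define BUFSZ 64

/* hypothetical helpers standing in for the DMA mapping API */
static uint64_t fake_map(void *p)              { return (uint64_t)(uintptr_t)p; }
static int      fake_mapping_error(uint64_t m) { return m == 0; }
static void     fake_unmap(uint64_t m)         { (void)m; }

static int alloc_and_map(void *addr[NBUF], uint64_t map[NBUF])
{
	int i;

	for (i = 0; i < NBUF; i++) {
		addr[i] = malloc(BUFSZ);
		if (!addr[i])
			goto err;
		map[i] = fake_map(addr[i]);
		if (fake_mapping_error(map[i])) {
			free(addr[i]);   /* free the buffer we just allocated */
			goto err;        /* then unwind the earlier ones      */
		}
	}
	return 0;

err:
	while (--i >= 0) {
		fake_unmap(map[i]);
		free(addr[i]);
	}
	return -1;
}

int main(void)
{
	void *addr[NBUF];
	uint64_t map[NBUF];

	return alloc_and_map(addr, map);
}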
*/ if (attr->qp_type == IB_QPT_RAW_PACKET) { - err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 1); + err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, + MLX4_RESERVE_BF_QP); } else { if(qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, qpn); @@ -812,15 +810,11 @@ static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, } break; case IB_QPG_PARENT: -#ifdef __linux__ err = init_qpg_parent(dev, qp, attr, qpn); -#endif break; case IB_QPG_CHILD_TX: case IB_QPG_CHILD_RX: -#ifdef __linux__ err = alloc_qpg_qpn(attr, qp, qpn); -#endif break; default: qp->qpg_type = IB_QPG_NONE; @@ -844,15 +838,11 @@ static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_release_range(dev->dev, qpn, 1); break; case IB_QPG_PARENT: -#ifdef __linux__ free_qpg_parent(dev, qp); -#endif break; case IB_QPG_CHILD_TX: case IB_QPG_CHILD_RX: -#ifdef __linux__ free_qpg_qpn(qp, qpn); -#endif break; default: break; @@ -881,10 +871,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; -#ifndef __linux__ - init_attr->qpg_type = IB_QPG_NONE; -#endif - /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { if (mlx4_is_mfunc(dev->dev) && @@ -941,6 +927,23 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->mlx4_ib_qp_type = qp_type; + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED && + !mlx4_is_mfunc(dev->dev)) + qp->flags |= MLX4_IB_QP_NETIF; + else { + err = -EINVAL; + goto err; + } + } + mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); @@ -952,7 +955,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, mlx4_ib_qp_has_rq(init_attr), qp); if (err) goto err; @@ -961,11 +964,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, int shift; int n; - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + if (!udata || ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { err = -EFAULT; goto err; } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL; + + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX4_IB_QP_CAP_MANAGED_SEND; + + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX4_IB_QP_CAP_MANAGED_RECV; + qp->sq_no_prefetch = ucmd.sq_no_prefetch; err = set_user_sq_size(dev, qp, &ucmd); @@ -990,7 +1002,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - if (qp_has_rq(init_attr)) { + if (mlx4_ib_qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &qp->db); if (err) @@ -999,23 +1011,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } else { qp->sq_no_prefetch = 0; - if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) - qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; - 
- if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) - qp->flags |= MLX4_IB_QP_LSO; - - if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP && - dev->dev->caps.steering_mode == - MLX4_STEERING_MODE_DEVICE_MANAGED && - !mlx4_is_mfunc(dev->dev)) - qp->flags |= MLX4_IB_QP_NETIF; - err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; - if (qp_has_rq(init_attr)) { + if (mlx4_ib_qp_has_rq(init_attr)) { err = mlx4_db_alloc(dev->dev, &qp->db, 0); if (err) goto err; @@ -1097,7 +1097,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, free_proxy_bufs(pd->device, qp); err_wrid: if (pd->uobject) { - if (qp_has_rq(init_attr)) + if (mlx4_ib_qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { kfree(qp->sq.wrid); @@ -1114,7 +1114,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); err_db: - if (!pd->uobject && qp_has_rq(init_attr)) + if (!pd->uobject && mlx4_ib_qp_has_rq(init_attr)) mlx4_db_free(dev->dev, &qp->db); if (qp->max_inline_data) @@ -1145,7 +1145,7 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv { if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); - (void) __acquire(&recv_cq->lock); + __acquire(&recv_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); @@ -1159,7 +1159,7 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re __releases(&send_cq->lock) __releases(&recv_cq->lock) { if (send_cq == recv_cq) { - (void) __release(&recv_cq->lock); + __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); @@ -1300,14 +1300,14 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) return dev->dev->caps.qp1_proxy[attr->port_num - 1]; } -#ifdef __linux__ static int check_qpg_attr(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) { if (attr->qpg_type == IB_QPG_NONE) return 0; - if (attr->qp_type != IB_QPT_UD) + if (attr->qp_type != IB_QPT_UD && + attr->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; if (attr->qpg_type == IB_QPG_PARENT) { @@ -1346,7 +1346,6 @@ static int check_qpg_attr(struct mlx4_ib_dev *dev, } return 0; } -#endif #define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END) \ & ~(IB_QP_CREATE_RESERVED_START - 1)) @@ -1364,6 +1363,15 @@ static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(enum ib_qp_create_flags ib_qp_f if (ib_qp_flags & IB_QP_CREATE_NETIF_QP) mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF; + if (ib_qp_flags & IB_QP_CREATE_CROSS_CHANNEL) + mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL; + + if (ib_qp_flags & IB_QP_CREATE_MANAGED_SEND) + mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_SEND; + + if (ib_qp_flags & IB_QP_CREATE_MANAGED_RECV) + mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_RECV; + /* reserved flags */ mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK); @@ -1387,6 +1395,9 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, * and only for kernel UD QPs. 
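to_mlx4_ib_qp_flags above now carries the cross-channel and managed send/receive bits from the verbs-level create flags into the driver's own flag space. A small sketch of that kind of bit-by-bit translation, using hypothetical flag values on both sides:

#include <stdint.h>
#include <stdio.h>

/* hypothetical verbs-level create flags */
#define QP_CREATE_LSO            (1u << 0)
#define QP_CREATE_BLOCK_MC_LB    (1u << 1)
#define QP_CREATE_CROSS_CHANNEL  (1u << 2)
#define QP_CREATE_MANAGED_SEND   (1u << 3)
#define QP_CREATE_MANAGED_RECV   (1u << 4)

/* hypothetical driver-level flags (deliberately different bit positions) */
#define DRV_QP_LSO               (1u << 8)
#define DRV_QP_BLOCK_MC_LB       (1u << 9)
#define DRV_QP_CROSS_CHANNEL     (1u << 10)
#define DRV_QP_MANAGED_SEND      (1u << 11)
#define DRV_QP_MANAGED_RECV      (1u << 12)

static uint32_t to_drv_flags(uint32_t create_flags)
{
	uint32_t f = 0;

	if (create_flags & QP_CREATE_LSO)           f |= DRV_QP_LSO;
	if (create_flags & QP_CREATE_BLOCK_MC_LB)   f |= DRV_QP_BLOCK_MC_LB;
	if (create_flags & QP_CREATE_CROSS_CHANNEL) f |= DRV_QP_CROSS_CHANNEL;
	if (create_flags & QP_CREATE_MANAGED_SEND)  f |= DRV_QP_MANAGED_SEND;
	if (create_flags & QP_CREATE_MANAGED_RECV)  f |= DRV_QP_MANAGED_RECV;
	return f;
}

int main(void)
{
	printf("0x%x\n", to_drv_flags(QP_CREATE_CROSS_CHANNEL | QP_CREATE_MANAGED_RECV));
	return 0;
}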
*/ if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_CAP_CROSS_CHANNEL | + MLX4_IB_QP_CAP_MANAGED_SEND | + MLX4_IB_QP_CAP_MANAGED_RECV | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF)) @@ -1397,19 +1408,30 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } - if (init_attr->create_flags && - (udata || - ((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) && + if ((mlx4_qp_flags & + (MLX4_IB_QP_CAP_CROSS_CHANNEL | + MLX4_IB_QP_CAP_MANAGED_SEND | + MLX4_IB_QP_CAP_MANAGED_RECV)) && + !(to_mdev(device)->dev->caps.flags & + MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)) { + pr_debug("%s Does not support cross-channel operations\n", + to_mdev(device)->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + if ((init_attr->create_flags & + ~(IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) && + (((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) && init_attr->qp_type != IB_QPT_UD) || ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) return ERR_PTR(-EINVAL); -#ifdef __linux__ err = check_qpg_attr(to_mdev(device), init_attr); if (err) return ERR_PTR(err); -#endif switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: @@ -1559,32 +1581,42 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); } -static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, - struct mlx4_ib_qp *qp, struct mlx4_qp_path *path, - u8 port, int is_primary) +static int ib_rate_to_mlx4(struct mlx4_ib_dev *dev, u8 rate) +{ + if (rate == IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX4_STAT_RATE_OFFSET) & + dev->dev->caps.stat_rate_support)) + --rate; + } + + return rate + MLX4_STAT_RATE_OFFSET; +} + +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + u8 *smac, u16 vlan_id, struct mlx4_ib_qp *qp, + struct mlx4_qp_path *path, u8 port, int is_primary) { - struct net_device *ndev; - int err; int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; - u8 mac[6]; - int is_mcast; u16 vlan_tag; int vidx; int smac_index; + int err; u64 u64_mac; - u8 *smac; struct mlx4_roce_smac_vlan_info *smac_info; path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); - if (ah->static_rate) { - path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET; - while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && - !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) - --path->static_rate; - } else - path->static_rate = 0; + + err = ib_rate_to_mlx4(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { @@ -1614,7 +1646,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, else smac_info = &qp->alt; - vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); + vlan_tag = vlan_id; if (vlan_tag < 0x1000) { if (smac_info->vid < 0x1000) { /* both valid vlan ids */ @@ -1653,28 +1685,13 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } } - err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); - if (err) - return err; /* get smac_index for RoCE use. * If no smac was yet assigned, register one. 
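The new ib_rate_to_mlx4 helper above replaces the open-coded loop: IB_RATE_PORT_CURRENT means no static rate limit, out-of-range values are rejected, and anything else is walked down until the device's stat_rate_support bitmap advertises it. A standalone sketch with hypothetical enum encodings and offset:

#include <stdio.h>

/* hypothetical encodings, mirroring the shape of the real enums */
enum { RATE_PORT_CURRENT = 0, RATE_2_5_GBPS = 2, RATE_300_GBPS = 18 };
#define STAT_RATE_OFFSET 5

static int rate_to_hw(int rate, unsigned int stat_rate_support)
{
	if (rate == RATE_PORT_CURRENT)
		return 0;                      /* no static rate limit */
	if (rate < RATE_2_5_GBPS || rate > RATE_300_GBPS)
		return -1;                     /* -EINVAL in the driver */
	/* fall back to the nearest lower rate the device advertises */
	while (rate != RATE_2_5_GBPS &&
	       !((1u << (rate + STAT_RATE_OFFSET)) & stat_rate_support))
		--rate;
	return rate + STAT_RATE_OFFSET;
}

int main(void)
{
	/* device claims support only for the 2.5 Gb/s encoding */
	printf("%d\n", rate_to_hw(5, 1u << (RATE_2_5_GBPS + STAT_RATE_OFFSET)));
	return 0;
}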
* If one was already assigned, but the new mac differs, * unregister the old one and register the new one. */ - spin_lock(&dev->iboe.lock); - ndev = dev->iboe.netdevs[port - 1]; - if (ndev) { -#ifdef __linux__ - smac = ndev->dev_addr; /* fixme: cache this value */ -#else - smac = IF_LLADDR(ndev); /* fixme: cache this value */ -#endif - u64_mac = mlx4_mac_to_u64(smac); - } else - u64_mac = dev->dev->caps.def_mac[port]; - spin_unlock(&dev->iboe.lock); if (!smac_info->smac || smac_info->smac != u64_mac) { /* register candidate now, unreg if needed, after success */ @@ -1688,7 +1705,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } else smac_index = smac_info->smac_index; - memcpy(path->dmac, mac, 6); + memcpy(path->dmac, ah->dmac, 6); path->ackto = MLX4_IB_LINK_TYPE_ETH; /* put MAC table smac index for IBoE */ path->grh_mylmc = (u8) (smac_index) | 0x80 ; @@ -1712,24 +1729,21 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } -static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, const u8 *smac, struct mlx4_qp_context *context) { struct net_device *ndev; u64 u64_mac; - u8 *smac; int smac_index; + ndev = dev->iboe.netdevs[qp->port - 1]; if (ndev) { -#ifdef __linux__ - smac = ndev->dev_addr; /* fixme: cache this value */ -#else - smac = IF_LLADDR(ndev); /* fixme: cache this value */ -#endif + smac = IF_LLADDR(ndev); u64_mac = mlx4_mac_to_u64(smac); - } else + } else { u64_mac = dev->dev->caps.def_mac[qp->port]; + } context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); if (!qp->pri.smac) { @@ -1783,6 +1797,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } + if (qp->max_inlr_data) + context->param3 |= cpu_to_be32(1 << 25); + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (ibqp->qp_type == IB_QPT_RAW_PACKET) @@ -1834,12 +1851,13 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { - if (dev->counters[qp->port - 1] != -1) { + if (dev->counters[qp->port - 1].counter_index != -1) { context->pri_path.counter_index = - dev->counters[qp->port - 1]; + dev->counters[qp->port - 1].counter_index; optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; - } else + } else { context->pri_path.counter_index = 0xff; + } if (qp->flags & MLX4_IB_QP_NETIF && (qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) { @@ -1855,8 +1873,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; } - if (attr_mask & IB_QP_AV) { - if (mlx4_set_path(dev, &attr->ah_attr, qp, &context->pri_path, + if ((attr_mask & IB_QP_AV) && (ibqp->qp_type != IB_QPT_RAW_PACKET)) { + if (mlx4_set_path(dev, &attr->ah_attr, (u8 *)attr->smac, + attr_mask & IB_QP_VID ? + attr->vlan_id : 0xffff , + qp, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, 1)) goto out; @@ -1879,12 +1900,16 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; - if (mlx4_set_path(dev, &attr->alt_ah_attr, qp, &context->alt_path, + if (mlx4_set_path(dev, &attr->alt_ah_attr, (u8 *)attr->smac, + attr_mask & IB_QP_ALT_VID ? 
+ attr->alt_vlan_id : 0xffff, + qp, &context->alt_path, attr->alt_port_num, 0)) goto out; context->alt_path.pkey_index = attr->alt_pkey_index; context->alt_path.ackto = attr->alt_timeout << 3; + context->alt_path.counter_index = dev->counters[attr->alt_port_num - 1].counter_index; optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } @@ -1943,6 +1968,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_M_EXT_CLASS_3) context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + context->params2 |= (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL ? + cpu_to_be32(MLX4_QP_BIT_COLL_MASTER) : 0); + context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND ? + cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_SQ) : 0); + context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV ? + cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_RQ) : 0); + } + if (ibqp->srq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); @@ -1997,6 +2031,12 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; } + if (ibqp->qp_type == IB_QPT_RAW_PACKET && + (attr_mask & IB_QP_AV)) { + context->pri_path.sched_queue |= + ((attr->ah_attr.sl & 0xf) << 3); + context->pri_path.feup = 1 << 6; + } is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET; if (is_eth) { @@ -2007,13 +2047,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { - err = handle_eth_ud_smac_index(dev, qp, context); + err = handle_eth_ud_smac_index(dev, qp, (const u8 *)attr->smac, context); if (err) return -EINVAL; } } } + if (ibqp->qp_type == IB_QPT_UD) + if (is_eth && (new_state == IB_QPS_RTR)) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; @@ -2072,7 +2118,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, for (i = 0; i < qp->sq.wqe_cnt; ++i) { ctrl = get_send_wqe(qp, i); - ctrl->owner_opcode = cpu_to_be32(1U << 31); + ctrl->owner_opcode = cpu_to_be32(1 << 31); if (qp->sq_max_wqes_per_wr == 1) ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); @@ -2080,6 +2126,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } + if ((qp->port && rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) && (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)) + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), to_mlx4_state(new_state), context, optpar, sqd_event, &qp->mqp); @@ -2268,14 +2319,22 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; + int ll; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + ll = IB_LINK_LAYER_UNSPECIFIED; + } else { + int port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask & ~IB_M_QP_MOD_VEND_MASK)) { + attr_mask & ~IB_M_QP_MOD_VEND_MASK, ll)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", @@ -2299,11 +2358,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && - (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != - IB_LINK_LAYER_ETHERNET)) - goto out; - if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { @@ -2421,11 +2475,11 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { - inl->byte_count = cpu_to_be32(1U << 31 | header_size); + inl->byte_count = cpu_to_be32(1 << 31 | header_size); memcpy(inl + 1, sqp->header_buf, header_size); i = 1; } else { - inl->byte_count = cpu_to_be32(1U << 31 | spc); + inl->byte_count = cpu_to_be32(1 << 31 | spc); memcpy(inl + 1, sqp->header_buf, spc); inl = (void *) (inl + 1) + spc; @@ -2444,7 +2498,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, * of 16 mod 64. */ wmb(); - inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc)); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); i = 2; } @@ -2470,7 +2524,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, int is_eth; int is_vlan = 0; int is_grh; - u16 vlan = 0; + u16 uninitialized_var(vlan); int err = 0; send_size = 0; @@ -2497,8 +2551,10 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, return err; } - vlan = rdma_get_vlan_id(&sgid); - is_vlan = vlan < 0x1000; + if (is_eth && ah->av.eth.vlan != 0xffff) { + vlan = cpu_to_be16(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } } ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); @@ -2565,7 +2621,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, } if (is_eth) { - u8 smac[6]; + u8 *smac; struct in6_addr in6; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; @@ -2577,8 +2633,13 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); - rdma_get_ll_mac(&in6, smac); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) + smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); + else + smac = ah->av.eth.s_mac; /* use the src mac of the tunnel */ memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); if (!is_vlan) { @@ -2628,11 +2689,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { - inl->byte_count = cpu_to_be32(1U << 31 | header_size); + inl->byte_count = cpu_to_be32(1 << 31 | header_size); memcpy(inl + 1, sqp->header_buf, header_size); i = 1; } else { - inl->byte_count = cpu_to_be32(1U << 31 | spc); + inl->byte_count = cpu_to_be32(1 << 31 | spc); memcpy(inl + 1, sqp->header_buf, spc); inl = 
(void *) (inl + 1) + spc; @@ -2651,7 +2712,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, * of 16 mod 64. */ wmb(); - inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc)); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); i = 2; } @@ -2679,9 +2740,12 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq static __be32 convert_access(int acc) { - return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | - (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | - (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); } @@ -2707,6 +2771,24 @@ static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) fseg->reserved[1] = 0; } +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr) +{ + bseg->flags1 = + convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) & + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); + bseg->flags2 = 0; + if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); + if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); + bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey); + bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey); + bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr); + bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length); +} + static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { iseg->mem_key = cpu_to_be32(rkey); @@ -2792,23 +2874,25 @@ static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_ hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = cpu_to_be16(ah->av.eth.vlan); spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (sizeof (hdr) <= spc) { memcpy(inl + 1, &hdr, sizeof (hdr)); wmb(); - inl->byte_count = cpu_to_be32(1U << 31 | sizeof (hdr)); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); i = 1; } else { memcpy(inl + 1, &hdr, spc); wmb(); - inl->byte_count = cpu_to_be32(1U << 31 | spc); + inl->byte_count = cpu_to_be32(1 << 31 | spc); inl = (void *) (inl + 1) + spc; memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); wmb(); - inl->byte_count = cpu_to_be32(1U << 31 | (sizeof (hdr) - spc)); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); i = 2; } @@ -2833,7 +2917,7 @@ static void set_mlx_icrc_seg(void *dseg) */ wmb(); - iseg->byte_count = cpu_to_be32((1U << 31) | 4); + iseg->byte_count = cpu_to_be32((1 << 31) | 4); } static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) @@ -2901,7 +2985,7 @@ static void add_zero_len_inline(void *wqe) { struct mlx4_wqe_inline_seg *inl = wqe; memset(wqe, 0, 16); - inl->byte_count = cpu_to_be32(1U 
<< 31); + inl->byte_count = cpu_to_be32(1 << 31); } static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, @@ -3102,6 +3186,12 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_fmr_seg) / 16; break; + case IB_WR_BIND_MW: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_bind_seg); + size += sizeof(struct mlx4_wqe_bind_seg) / 16; default: /* No extra segments required for sends */ break; @@ -3246,14 +3336,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, */ wmb(); - if (wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { *bad_wr = wr; err = -EINVAL; goto out; } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | - (ind & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0) | blh; + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); @@ -3576,6 +3666,15 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + if (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + + if (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + + if (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + qp_init_attr->qpg_type = ibqp->qpg_type; if (ibqp->qpg_type == IB_QPG_PARENT) qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz; @@ -3586,4 +3685,3 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr mutex_unlock(&qp->mutex); return err; } - diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c b/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c index 6837b86daba4..df4549ff3f29 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c @@ -56,8 +56,8 @@ static ssize_t show_admin_alias_guid(struct device *dev, record_num = mlx4_ib_iov_dentry->entry_num / 8 ; guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; - return sprintf(buf, "%llx\n", (long long) - be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. + return sprintf(buf, "%llx\n", + (long long)be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. ports_guid[port->num - 1]. all_rec_per_port[record_num]. 
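The post-send path above now range-checks wr->opcode before using it to index mlx4_ib_opcode, returning -EINVAL instead of reading past the table. A minimal sketch of that guard over a hypothetical opcode table:

#include <errno.h>
#include <stdio.h>

static const unsigned int opcode_tbl[] = { 0x0a, 0x08, 0x10, 0x11 }; /* hypothetical */
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static int lookup_opcode(int wr_opcode, unsigned int *hw_opcode)
{
	/* reject negative and out-of-range opcodes before the table lookup */
	if (wr_opcode < 0 || (size_t)wr_opcode >= ARRAY_SIZE(opcode_tbl))
		return -EINVAL;
	*hw_opcode = opcode_tbl[wr_opcode];
	return 0;
}

int main(void)
{
	unsigned int hw;

	printf("%d\n", lookup_opcode(7, &hw));   /* out of range -> -EINVAL */
	return 0;
}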
all_recs[8 * guid_index_in_rec])); diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c index 088e4407b82d..3fed07c5d56c 100644 --- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c @@ -672,8 +672,8 @@ static int mthca_destroy_qp(struct ib_qp *qp) return 0; } -static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, - int comp_vector, +static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, + struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata) { @@ -681,6 +681,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, struct mthca_cq *cq; int nent; int err; + int entries = attr->cqe; if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) return ERR_PTR(-EINVAL); @@ -1010,12 +1011,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata, int mr_id) { struct mthca_dev *dev = to_mdev(pd->device); - struct ib_umem_chunk *chunk; + struct scatterlist *sg; struct mthca_mr *mr; struct mthca_reg_mr ucmd; u64 *pages; int shift, n, len; - int i, j, k; + int i, k, entry; int err = 0; int write_mtt_size; @@ -1044,10 +1045,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mr->umem->page_size) - 1; - n = 0; - list_for_each_entry(chunk, &mr->umem->chunk_list, list) - n += chunk->nents; - + n = mr->umem->nmap;; mr->mtt = mthca_alloc_mtt(dev, n); if (IS_ERR(mr->mtt)) { err = PTR_ERR(mr->mtt); @@ -1064,25 +1062,27 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); - list_for_each_entry(chunk, &mr->umem->chunk_list, list) - for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - mr->umem->page_size * k; - /* - * Be friendly to write_mtt and pass it chunks - * of appropriate size. - */ - if (i == write_mtt_size) { - err = mthca_write_mtt(dev, mr->mtt, n, pages, i); - if (err) - goto mtt_done; - n += i; - i = 0; - } + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + mr->umem->page_size * k; + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. + */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; } } + } + + + if (i) err = mthca_write_mtt(dev, mr->mtt, n, pages, i); diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c index 2264bcdb9041..b4c70b415317 100644 --- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c @@ -870,7 +870,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile b/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile deleted file mode 100644 index 3090100f0de7..000000000000 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o - -ib_ipoib-y := ipoib_main.o \ - ipoib_ib.o \ - ipoib_multicast.o \ - ipoib_verbs.o \ - ipoib_vlan.o \ - ipoib_ethtool.o -ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o -ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o - diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h index 7d5e17550c1e..eb269a414a54 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h @@ -80,6 +80,7 @@ #include #include #include +#include #include @@ -313,6 +314,7 @@ struct ipoib_ethtool_st { */ struct ipoib_dev_priv { spinlock_t lock; + spinlock_t drain_lock; struct ifnet *dev; diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 4fb39b410cd8..814938c101bc 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -383,6 +383,7 @@ ipoib_poll(struct ipoib_dev_priv *priv) int n, i; poll_more: + spin_lock(&priv->drain_lock); for (;;) { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); @@ -401,6 +402,7 @@ ipoib_poll(struct ipoib_dev_priv *priv) if (n != IPOIB_NUM_WC) break; } + spin_unlock(&priv->drain_lock); if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) @@ -707,6 +709,7 @@ void ipoib_drain_cq(struct ipoib_dev_priv *priv) { int i, n; + spin_lock(&priv->drain_lock); do { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { @@ -727,6 +730,7 @@ void ipoib_drain_cq(struct ipoib_dev_priv *priv) ipoib_ib_handle_rx_wc(priv, priv->ibwc + i); } } while (n == IPOIB_NUM_WC); + spin_unlock(&priv->drain_lock); spin_lock(&priv->lock); while (ipoib_poll_tx(priv)) diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c index 695621f16a84..35e16417a5f6 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -832,6 +832,7 @@ ipoib_priv_alloc(void) priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); + spin_lock_init(&priv->drain_lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 9c7bcec1a6f9..4c04da1533d0 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -466,12 +466,20 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct ifnet *dev = priv->dev; + struct ib_port_attr attr; ipoib_dbg_mcast(priv, "Running join task. 
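The ipoib hunks above add a second spinlock, drain_lock, so the interrupt-driven ipoib_poll loop and ipoib_drain_cq never process receive completions concurrently. A reduced userspace sketch of the pattern, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for priv->drain_lock */
static int cq_entries = 16;        /* pretend 16 receive completions are queued */

/* both the poll path and the drain path consume completions under drain_lock */
static void poll_recv_cq(const char *who)
{
	pthread_mutex_lock(&drain_lock);
	while (cq_entries > 0)
		cq_entries--;      /* "handle" one completion */
	pthread_mutex_unlock(&drain_lock);
	printf("%s: CQ empty\n", who);
}

int main(void)
{
	poll_recv_cq("ipoib_poll");
	poll_recv_cq("ipoib_drain_cq");
	return 0;
}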
flags 0x%lX\n", priv->flags); if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n", + __func__, attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else diff --git a/sys/ofed/include/rdma/ib_addr.h b/sys/ofed/include/rdma/ib_addr.h index b71151004347..b564415b1ed1 100644 --- a/sys/ofed/include/rdma/ib_addr.h +++ b/sys/ofed/include/rdma/ib_addr.h @@ -31,17 +31,20 @@ * SOFTWARE. */ -#if !defined(IB_ADDR_H) +#ifndef IB_ADDR_H #define IB_ADDR_H #include #include #include #include +#include #include +#include #include #include -#include +#include +#include struct rdma_addr_client { atomic_t refcount; @@ -72,7 +75,8 @@ struct rdma_dev_addr { * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. */ -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr); +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, + u16 *vlan_id); /** * rdma_resolve_ip - Resolve source and destination IP addresses to @@ -101,6 +105,9 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr); +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); +int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac, + u16 *vlan_id); static inline int ip_addr_size(struct sockaddr *addr) { @@ -130,50 +137,56 @@ static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } -static inline void iboe_mac_vlan_to_ll(union ib_gid *gid, u8 *mac, u16 vid) -{ - memset(gid->raw, 0, 16); - *((u32 *)gid->raw) = cpu_to_be32(0xfe800000); - if (vid < 0x1000) { - gid->raw[12] = vid & 0xff; - gid->raw[11] = vid >> 8; - } else { - gid->raw[12] = 0xfe; - gid->raw[11] = 0xff; - } - - memcpy(gid->raw + 13, mac + 3, 3); - memcpy(gid->raw + 8, mac, 3); - gid->raw[8] ^= 2; -} - static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { -#ifdef __linux__ - return dev->priv_flags & IFF_802_1Q_VLAN ? 
- vlan_dev_vlan_id(dev) : 0xffff; -#else uint16_t tag; if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0) return 0xffff; return tag; -#endif } +static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) +{ + switch (addr->sa_family) { + case AF_INET: + ipv6_addr_set_v4mapped(((struct sockaddr_in *)addr)->sin_addr.s_addr, + (struct in6_addr *)gid); + break; + case AF_INET6: + memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, + 16); + break; + default: + return -EINVAL; + } + return 0; +} + +/* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ +static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid) +{ + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + struct sockaddr_in *out_in = (struct sockaddr_in *)out; + memset(out_in, 0, sizeof(*out_in)); + out_in->sin_len = sizeof(*out_in); + out_in->sin_family = AF_INET; + memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); + } else { + struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; + memset(out_in, 0, sizeof(*out_in)); + out_in->sin6_family = AF_INET6; + memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); + } + return 0; +} + +/* This func is called only in loopback ip address (127.0.0.1) + * case in which sgid is not relevant + */ static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - struct net_device *dev; - u16 vid = 0xffff; - - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (dev) { - vid = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - } - - iboe_mac_vlan_to_ll(gid, dev_addr->src_dev_addr, vid); } static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) @@ -223,27 +236,6 @@ static inline enum ib_mtu iboe_get_mtu(int mtu) return 0; } -#ifdef __linux__ -static inline int iboe_get_rate(struct net_device *dev) -{ - struct ethtool_cmd cmd; - - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings || - dev->ethtool_ops->get_settings(dev, &cmd)) - return IB_RATE_PORT_CURRENT; - - if (cmd.speed >= 40000) - return IB_RATE_40_GBPS; - else if (cmd.speed >= 30000) - return IB_RATE_30_GBPS; - else if (cmd.speed >= 20000) - return IB_RATE_20_GBPS; - else if (cmd.speed >= 10000) - return IB_RATE_10_GBPS; - else - return IB_RATE_PORT_CURRENT; -} -#else static inline int iboe_get_rate(struct net_device *dev) { if (dev->if_baudrate >= IF_Gbps(40)) @@ -257,11 +249,10 @@ static inline int iboe_get_rate(struct net_device *dev) else return IB_RATE_PORT_CURRENT; } -#endif static inline int rdma_link_local_addr(struct in6_addr *addr) { - if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) && + if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; @@ -280,6 +271,20 @@ static inline int rdma_is_multicast_addr(struct in6_addr *addr) return addr->s6_addr[0] == 0xff; } +static inline void resolve_mcast_mac(struct in6_addr *addr, u8 *mac) +{ + if (addr->s6_addr[0] != 0xff) + return; + +#ifdef DUAL_MODE_MCAST_MAC + if (addr->s6_addr[1] == 0x0e) /* IPv4 */ + ip_eth_mc_map(addr->s6_addr32[3], mac); + else +#endif + ipv6_eth_mc_map(addr, mac); +} + + static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; @@ -300,12 +305,7 @@ static inline u16 rdma_get_vlan_id(union ib_gid *dgid) static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { -#ifdef __linux__ - return dev->priv_flags & IFF_802_1Q_VLAN ? 
- vlan_dev_real_dev(dev) : 0; -#else return VLAN_TRUNKDEV(__DECONST(struct ifnet *, dev)); -#endif } #endif /* IB_ADDR_H */ diff --git a/sys/ofed/include/rdma/ib_cache.h b/sys/ofed/include/rdma/ib_cache.h index 00a2b8ec327f..ad9a3c280944 100644 --- a/sys/ofed/include/rdma/ib_cache.h +++ b/sys/ofed/include/rdma/ib_cache.h @@ -100,6 +100,22 @@ int ib_find_cached_pkey(struct ib_device *device, u16 pkey, u16 *index); +/** + * ib_find_exact_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_exact_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + /** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. diff --git a/sys/ofed/include/rdma/ib_cm.h b/sys/ofed/include/rdma/ib_cm.h index 40c24b6683a4..a7ffaf950391 100644 --- a/sys/ofed/include/rdma/ib_cm.h +++ b/sys/ofed/include/rdma/ib_cm.h @@ -497,7 +497,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, * message. * @cm_id: Connection identifier associated with the connection message. * @service_timeout: The lower 5-bits specify the maximum time required for - * the sender to reply to to the connection message. The upper 3-bits + * the sender to reply to the connection message. The upper 3-bits * specify additional control flags. * @private_data: Optional user-defined private data sent with the * message receipt acknowledgement. 
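+ *
+ * As an illustration of the service_timeout layout described above (the
+ * names t and f are examples only), a reply timeout t combined with
+ * control flags f would be encoded as:
+ *
+ *      service_timeout = (t & 0x1f) | ((f & 0x07) << 5);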
@@ -601,4 +601,6 @@ struct ib_cm_sidr_rep_param { int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param); +int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac); + #endif /* IB_CM_H */ diff --git a/sys/ofed/include/rdma/ib_mad.h b/sys/ofed/include/rdma/ib_mad.h index 32f81141efd3..3d81b90cc315 100644 --- a/sys/ofed/include/rdma/ib_mad.h +++ b/sys/ofed/include/rdma/ib_mad.h @@ -77,6 +77,15 @@ #define IB_MGMT_MAX_METHODS 128 +/* MAD Status field bit masks */ +#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000 +#define IB_MGMT_MAD_STATUS_BUSY 0x0001 +#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002 +#define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c +#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c + /* RMPP information */ #define IB_MGMT_RMPP_VERSION 1 diff --git a/sys/ofed/include/rdma/ib_pack.h b/sys/ofed/include/rdma/ib_pack.h index af615a477ffd..1678be7f047d 100644 --- a/sys/ofed/include/rdma/ib_pack.h +++ b/sys/ofed/include/rdma/ib_pack.h @@ -263,7 +263,5 @@ int ib_ud_header_pack(struct ib_ud_header *header, int ib_ud_header_unpack(void *buf, struct ib_ud_header *header); -int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf); -int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh); #endif /* IB_PACK_H */ diff --git a/sys/ofed/include/rdma/ib_peer_mem.h b/sys/ofed/include/rdma/ib_peer_mem.h new file mode 100644 index 000000000000..b2a8a4a3f246 --- /dev/null +++ b/sys/ofed/include/rdma/ib_peer_mem.h @@ -0,0 +1,59 @@ +#if !defined(IB_PEER_MEM_H) +#define IB_PEER_MEM_H + +#include + + +struct invalidation_ctx; +struct ib_ucontext; + +struct ib_peer_memory_statistics { + unsigned long num_alloc_mrs; + unsigned long num_dealloc_mrs; + unsigned long num_reg_pages; + unsigned long num_dereg_pages; + unsigned long num_free_callbacks; +}; + +struct ib_peer_memory_client { + const struct peer_memory_client *peer_mem; + + struct list_head core_peer_list; + struct list_head core_ticket_list; + unsigned long last_ticket; +#ifdef __FreeBSD__ + int holdcount; + int needwakeup; + struct cv peer_cv; +#else + struct srcu_struct peer_srcu; +#endif + struct mutex lock; + struct kobject *kobj; + struct attribute_group peer_mem_attr_group; + struct ib_peer_memory_statistics stats; +}; + +struct core_ticket { + unsigned long key; + void *context; + struct list_head ticket_list; +}; + +struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr, + size_t size, void **peer_client_context, + int *srcu_key); + +void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, + void *peer_client_context, + int srcu_key); + +unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, + void *context); +int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client, + unsigned long key); +struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client, + unsigned long key); +#endif + + diff --git a/sys/ofed/include/rdma/ib_sa.h b/sys/ofed/include/rdma/ib_sa.h index 61588d9ccdf8..65f1a006ee52 100644 --- a/sys/ofed/include/rdma/ib_sa.h +++ b/sys/ofed/include/rdma/ib_sa.h @@ -154,6 +154,9 @@ struct ib_sa_path_rec { u8 packet_life_time_selector; u8 packet_life_time; u8 preference; + u8 smac[ETH_ALEN]; + u8 dmac[6]; + __be16 vlan_id; }; #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) @@ -251,127 +254,6 @@ struct ib_sa_service_rec { u64 data64[2]; 
}; -enum { - IB_SA_EVENT_TYPE_FATAL = 0x0, - IB_SA_EVENT_TYPE_URGENT = 0x1, - IB_SA_EVENT_TYPE_SECURITY = 0x2, - IB_SA_EVENT_TYPE_SM = 0x3, - IB_SA_EVENT_TYPE_INFO = 0x4, - IB_SA_EVENT_TYPE_EMPTY = 0x7F, - IB_SA_EVENT_TYPE_ALL = 0xFFFF -}; - -enum { - IB_SA_EVENT_PRODUCER_TYPE_CA = 0x1, - IB_SA_EVENT_PRODUCER_TYPE_SWITCH = 0x2, - IB_SA_EVENT_PRODUCER_TYPE_ROUTER = 0x3, - IB_SA_EVENT_PRODUCER_TYPE_CLASS_MANAGER = 0x4, - IB_SA_EVENT_PRODUCER_TYPE_ALL = 0xFFFFFF -}; - -enum { - IB_SA_SM_TRAP_GID_IN_SERVICE = 64, - IB_SA_SM_TRAP_GID_OUT_OF_SERVICE = 65, - IB_SA_SM_TRAP_CREATE_MC_GROUP = 66, - IB_SA_SM_TRAP_DELETE_MC_GROUP = 67, - IB_SA_SM_TRAP_PORT_CHANGE_STATE = 128, - IB_SA_SM_TRAP_LINK_INTEGRITY = 129, - IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN = 130, - IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED = 131, - IB_SA_SM_TRAP_BAD_M_KEY = 256, - IB_SA_SM_TRAP_BAD_P_KEY = 257, - IB_SA_SM_TRAP_BAD_Q_KEY = 258, - IB_SA_SM_TRAP_SWITCH_BAD_P_KEY = 259, - IB_SA_SM_TRAP_ALL = 0xFFFF -}; - -struct ib_sa_inform { - union ib_gid gid; - __be16 lid_range_begin; - __be16 lid_range_end; - u8 is_generic; - u8 subscribe; - __be16 type; - union { - struct { - __be16 trap_num; - __be32 qpn; - u8 resp_time; - __be32 producer_type; - } generic; - struct { - __be16 device_id; - __be32 qpn; - u8 resp_time; - __be32 vendor_id; - } vendor; - } trap; -}; - -struct ib_sa_notice { - u8 is_generic; - u8 type; - union { - struct { - __be32 producer_type; - __be16 trap_num; - } generic; - struct { - __be32 vendor_id; - __be16 device_id; - } vendor; - } trap; - __be16 issuer_lid; - __be16 notice_count; - u8 notice_toggle; - /* - * Align data 16 bits off 64 bit field to match InformInfo definition. - * Data contained within this field will then align properly. - * See IB spec 1.2, sections 13.4.8.2 and 14.2.5.1. - */ - u8 reserved[5]; - u8 data_details[54]; - union ib_gid issuer_gid; -}; - -/* - * SM notice data details for: - * - * IB_SA_SM_TRAP_GID_IN_SERVICE = 64 - * IB_SA_SM_TRAP_GID_OUT_OF_SERVICE = 65 - * IB_SA_SM_TRAP_CREATE_MC_GROUP = 66 - * IB_SA_SM_TRAP_DELETE_MC_GROUP = 67 - */ -struct ib_sa_notice_data_gid { - u8 reserved[6]; - u8 gid[16]; - u8 padding[32]; -}; - -/* - * SM notice data details for: - * - * IB_SA_SM_TRAP_PORT_CHANGE_STATE = 128 - */ -struct ib_sa_notice_data_port_change { - __be16 lid; - u8 padding[52]; -}; - -/* - * SM notice data details for: - * - * IB_SA_SM_TRAP_LINK_INTEGRITY = 129 - * IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN = 130 - * IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED = 131 - */ -struct ib_sa_notice_data_port_error { - u8 reserved[2]; - __be16 lid; - u8 port_num; - u8 padding[49]; -}; - #define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) #define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) #define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) @@ -528,56 +410,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, */ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); -struct ib_inform_info { - void *context; - int (*callback)(int status, - struct ib_inform_info *info, - struct ib_sa_notice *notice); - u16 trap_number; -}; - -/** - * ib_sa_register_inform_info - Registers to receive notice events. - * @device: Device associated with the registration. - * @port_num: Port on the specified device to associate with the registration. - * @trap_number: InformInfo trap number to register for. - * @gfp_mask: GFP mask for memory allocations. - * @callback: User callback invoked once the registration completes and to - * report noticed events. 
- * @context: User specified context stored with the ib_inform_reg structure. - * - * This call initiates a registration request with the SA for the specified - * trap number. If the operation is started successfully, it returns - * an ib_inform_info structure that is used to track the registration operation. - * Users must free this structure by calling ib_unregister_inform_info, - * even if the operation later fails. (The callback status is non-zero.) - * - * If the registration fails; status will be non-zero. If the registration - * succeeds, the callback status will be zero, but the notice parameter will - * be NULL. If the notice parameter is not NULL, a trap or notice is being - * reported to the user. - * - * A status of -ENETRESET indicates that an error occurred which requires - * reregisteration. - */ -struct ib_inform_info * -ib_sa_register_inform_info(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u16 trap_number, gfp_t gfp_mask, - int (*callback)(int status, - struct ib_inform_info *info, - struct ib_sa_notice *notice), - void *context); - -/** - * ib_sa_unregister_inform_info - Releases an InformInfo registration. - * @info: InformInfo registration tracking structure. - * - * This call blocks until the registration request is destroyed. It may - * not be called from within the registration callback. - */ -void ib_sa_unregister_inform_info(struct ib_inform_info *info); - +/* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, @@ -588,6 +421,4 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void *context), void *context, struct ib_sa_query **sa_query); - - #endif /* IB_SA_H */ diff --git a/sys/ofed/include/rdma/ib_umem.h b/sys/ofed/include/rdma/ib_umem.h index a825111918ab..82f6cfaa5b7a 100644 --- a/sys/ofed/include/rdma/ib_umem.h +++ b/sys/ofed/include/rdma/ib_umem.h @@ -37,9 +37,26 @@ #include #include #include +#include +#include struct ib_ucontext; -struct vm_area_struct; +struct ib_umem; + +typedef void (*umem_invalidate_func_t)(void *invalidation_cookie, + struct ib_umem *umem, + unsigned long addr, size_t size); + +struct invalidation_ctx { + struct ib_umem *umem; + umem_invalidate_func_t func; + void *cookie; + unsigned long context_ticket; + int peer_callback; + int inflight_invalidation; + int peer_invalidated; + struct completion comp; +}; struct ib_umem { struct ib_ucontext *context; @@ -48,55 +65,29 @@ struct ib_umem { int page_size; int writable; int hugetlb; - struct list_head chunk_list; -#ifdef __linux__ struct work_struct work; - struct mm_struct *mm; -#else - unsigned long start; -#endif unsigned long diff; -}; - -struct ib_cmem { - - struct ib_ucontext *context; - size_t length; - /* Link list of contiguous blocks being part of that cmem */ - struct list_head ib_cmem_block; - - /* Order of cmem block, 2^ block_order will equal number - of physical pages per block - */ - unsigned long block_order; - /* Refernce counter for that memory area - - When value became 0 pages will be returned to the kernel. 
- */ - struct kref refcount; -}; - - -struct ib_umem_chunk { - struct list_head list; - int nents; + unsigned long start; + struct sg_table sg_head; int nmap; - struct dma_attrs attrs; - struct scatterlist page_list[0]; + int npages; + /* peer memory that manages this umem*/ + struct ib_peer_memory_client *ib_peer_mem; + struct invalidation_ctx *invalidation_ctx; + int peer_mem_srcu_key; + /* peer memory private context */ + void *peer_mem_client_context; }; struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync); +struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync, + int invalidation_supported); +void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *cookie); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); -int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem, - struct vm_area_struct *vma); -struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context, - unsigned long total_size, - unsigned long page_size_order); -void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem); -int ib_umem_map_to_vma(struct ib_umem *umem, - struct vm_area_struct *vma); - - #endif /* IB_UMEM_H */ diff --git a/sys/ofed/include/rdma/ib_user_verbs.h b/sys/ofed/include/rdma/ib_user_verbs.h index 670d6e8d6db2..a07de88a5d0d 100644 --- a/sys/ofed/include/rdma/ib_user_verbs.h +++ b/sys/ofed/include/rdma/ib_user_verbs.h @@ -43,6 +43,13 @@ * compatibility are made. */ #define IB_USER_VERBS_ABI_VERSION 6 +#define IB_USER_VERBS_CMD_THRESHOLD 50 + +/* + * To support 6 legacy commands using the old extension style + */ +#define IB_USER_VERBS_LEGACY_CMD_FIRST 52 +#define IB_USER_VERBS_LEGACY_EX_CMD_LAST 56 enum { IB_USER_VERBS_CMD_GET_CONTEXT, @@ -85,17 +92,15 @@ enum { IB_USER_VERBS_CMD_OPEN_XRCD, IB_USER_VERBS_CMD_CLOSE_XRCD, IB_USER_VERBS_CMD_CREATE_XSRQ, - IB_USER_VERBS_CMD_OPEN_QP, - IB_USER_VERBS_CMD_ATTACH_FLOW, - IB_USER_VERBS_CMD_DETACH_FLOW, - IB_USER_VERBS_CMD_CREATE_XRC_SRQ, - IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP, - IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP, - IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP, - IB_USER_VERBS_CMD_REG_XRC_RCV_QP, - IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP, + IB_USER_VERBS_CMD_OPEN_QP }; +enum { + IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_EX_CMD_DESTROY_FLOW +}; + + /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to @@ -125,12 +130,33 @@ struct ib_uverbs_comp_event_desc { * the rest of the command struct based on these value. 
*/ +#define IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, field) \ + ((ibv_type *)((void *)(ex_ptr) + offsetof(ex_type, \ + field) + sizeof((ex_ptr)->field))) + +#define IBV_RESP_TO_VERBS_RESP_EX(ex_ptr, ex_type, ibv_type) \ + IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, comp_mask) + + +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u +#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 + +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80 + struct ib_uverbs_cmd_hdr { __u32 command; __u16 in_words; __u16 out_words; }; +struct ib_uverbs_ex_cmd_hdr { + __u64 response; + __u16 provider_in_words; + __u16 provider_out_words; + __u32 cmd_hdr_reserved; +}; + struct ib_uverbs_get_context { __u64 response; __u64 driver_data[0]; @@ -146,6 +172,11 @@ struct ib_uverbs_query_device { __u64 driver_data[0]; }; +struct ib_uverbs_query_device_ex { + __u64 comp_mask; + __u64 driver_data[0]; +}; + struct ib_uverbs_query_device_resp { __u64 fw_ver; __be64 node_guid; @@ -269,6 +300,22 @@ struct ib_uverbs_dereg_mr { __u32 mr_handle; }; +struct ib_uverbs_alloc_mw { + __u64 response; + __u32 pd_handle; + __u8 mw_type; + __u8 reserved[3]; +}; + +struct ib_uverbs_alloc_mw_resp { + __u32 mw_handle; + __u32 rkey; +}; + +struct ib_uverbs_dealloc_mw { + __u32 mw_handle; +}; + struct ib_uverbs_create_comp_channel { __u64 response; }; @@ -292,6 +339,30 @@ struct ib_uverbs_create_cq_resp { __u32 cqe; }; +enum ib_uverbs_create_cq_ex_comp_mask { + IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS = (u64)1 << 0, +}; + +struct ib_uverbs_create_cq_ex { + __u64 comp_mask; + __u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __u64 create_flags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_modify_cq_ex { + __u64 comp_mask; + __u32 cq_handle; + __u32 attr_mask; + __u16 cq_count; + __u16 cq_period; + __u32 cq_cap_flags; +}; + struct ib_uverbs_resize_cq { __u64 response; __u32 cq_handle; @@ -543,6 +614,42 @@ struct ib_uverbs_modify_qp { __u64 driver_data[0]; }; +enum ib_uverbs_modify_qp_ex_comp_mask { + IB_UVERBS_QP_ATTR_DCT_KEY = 1ULL << 0, +}; + +struct ib_uverbs_modify_qp_ex { + __u32 comp_mask; + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __u64 dct_key; + __u64 driver_data[0]; +}; + struct ib_uverbs_modify_qp_resp { }; @@ -599,16 +706,6 @@ struct ib_uverbs_send_wr { } wr; }; -struct ibv_uverbs_flow_spec { - __u32 type; - __be32 src_ip; - __be32 dst_ip; - __be16 src_port; - __be16 dst_port; - __u8 l4_protocol; - __u8 block_mc_loopback; -}; - struct ib_uverbs_post_send { __u64 response; __u32 qp_handle; @@ -686,43 +783,117 @@ struct ib_uverbs_detach_mcast { __u64 driver_data[0]; }; -struct ibv_kern_flow_spec { +struct ib_uverbs_flow_spec_hdr { __u32 type; - __u32 reserved1; + __u16 size; + __u16 reserved; + /* followed by flow_spec */ + __u64 flow_spec_data[0]; +}; + +struct ib_kern_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; +}; + +struct ib_uverbs_flow_spec_eth { union { + struct ib_uverbs_flow_spec_hdr 
hdr; struct { - __be16 ethertype; - __be16 vlan; - __u8 vlan_present; - __u8 mac[6]; - __u8 port; - } eth; + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_kern_eth_filter val; + struct ib_kern_eth_filter mask; +}; + +struct ib_kern_ib_filter { + __be32 l3_type_qpn; + __u8 dst_gid[16]; +}; + +struct ib_uverbs_flow_spec_ib { + union { + struct ib_uverbs_flow_spec_hdr hdr; struct { - __be32 qpn; - } ib_uc; - struct { - __u8 mgid[16]; - } ib_mc; - } l2_id; + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_kern_ib_filter val; + struct ib_kern_ib_filter mask; +}; + +struct ib_kern_ipv4_filter { __be32 src_ip; __be32 dst_ip; - __be16 src_port; +}; + +struct ib_uverbs_flow_spec_ipv4 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_kern_ipv4_filter val; + struct ib_kern_ipv4_filter mask; +}; + +struct ib_kern_tcp_udp_filter { __be16 dst_port; - __u8 l4_protocol; - __u8 block_mc_loopback; + __be16 src_port; +}; + +struct ib_uverbs_flow_spec_tcp_udp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_kern_tcp_udp_filter val; + struct ib_kern_tcp_udp_filter mask; +}; + +struct ib_uverbs_flow_attr { + __u32 type; + __u16 size; + __u16 priority; + __u8 num_of_specs; __u8 reserved[2]; + __u8 port; + __u32 flags; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ + struct ib_uverbs_flow_spec_hdr flow_specs[0]; }; -struct ib_uverbs_attach_flow { +struct ib_uverbs_create_flow { + __u32 comp_mask; __u32 qp_handle; - __u32 priority; - struct ibv_kern_flow_spec spec; + struct ib_uverbs_flow_attr flow_attr; }; -struct ib_uverbs_detach_flow { - __u32 qp_handle; - __u32 priority; - struct ibv_kern_flow_spec spec; +struct ib_uverbs_create_flow_resp { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_destroy_flow { + __u32 comp_mask; + __u32 flow_handle; }; struct ib_uverbs_create_srq { @@ -788,95 +959,22 @@ struct ib_uverbs_destroy_srq_resp { __u32 events_reported; }; -struct ib_uverbs_open_xrc_domain { + +/* + * Legacy extended verbs related structures + */ +struct ib_uverbs_ex_cmd_hdr_legacy { + __u32 command; + __u16 in_words; + __u16 out_words; + __u16 provider_in_words; + __u16 provider_out_words; + __u32 cmd_hdr_reserved; +}; + +struct ib_uverbs_ex_cmd_resp1_legacy { + __u64 comp_mask; __u64 response; - __u32 fd; - __u32 oflags; - __u64 driver_data[0]; }; -struct ib_uverbs_open_xrc_domain_resp { - __u32 xrcd_handle; -}; - -struct ib_uverbs_close_xrc_domain { - __u64 response; - __u32 xrcd_handle; - __u32 reserved; - __u64 driver_data[0]; -}; - -struct ib_uverbs_create_xrc_rcv_qp { - __u64 response; - __u64 user_handle; - __u32 xrc_domain_handle; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u8 sq_sig_all; - __u8 qp_type; - __u8 reserved[6]; - __u64 driver_data[0]; -}; - -struct ib_uverbs_create_xrc_rcv_qp_resp { - __u32 qpn; - __u32 reserved; -}; - -struct ib_uverbs_modify_xrc_rcv_qp { - __u32 xrc_domain_handle; - __u32 qp_num; - struct ib_uverbs_qp_dest dest; - struct ib_uverbs_qp_dest alt_dest; - __u32 attr_mask; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 qp_state; - __u8 cur_qp_state; - __u8 path_mtu; - __u8 path_mig_state; - __u8 en_sqd_async_notify; - __u8 
max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 reserved[6]; - __u64 driver_data[0]; -}; - -struct ib_uverbs_query_xrc_rcv_qp { - __u64 response; - __u32 xrc_domain_handle; - __u32 qp_num; - __u32 attr_mask; - __u32 reserved; - __u64 driver_data[0]; -}; - -struct ib_uverbs_reg_xrc_rcv_qp { - __u32 xrc_domain_handle; - __u32 qp_num; - __u64 driver_data[0]; -}; - -struct ib_uverbs_unreg_xrc_rcv_qp { - __u32 xrc_domain_handle; - __u32 qp_num; - __u64 driver_data[0]; -}; - - #endif /* IB_USER_VERBS_H */ diff --git a/sys/ofed/include/rdma/ib_user_verbs_exp.h b/sys/ofed/include/rdma/ib_user_verbs_exp.h new file mode 100644 index 000000000000..557d4ba7cece --- /dev/null +++ b/sys/ofed/include/rdma/ib_user_verbs_exp.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_VERBS_EXP_H +#define IB_USER_VERBS_EXP_H + +#include + +enum { + IB_USER_VERBS_EXP_CMD_FIRST = 64 +}; + +enum { + IB_USER_VERBS_EXP_CMD_CREATE_QP, + IB_USER_VERBS_EXP_CMD_MODIFY_CQ, + IB_USER_VERBS_EXP_CMD_MODIFY_QP, + IB_USER_VERBS_EXP_CMD_CREATE_CQ, + IB_USER_VERBS_EXP_CMD_QUERY_DEVICE, + IB_USER_VERBS_EXP_CMD_CREATE_DCT, + IB_USER_VERBS_EXP_CMD_DESTROY_DCT, + IB_USER_VERBS_EXP_CMD_QUERY_DCT, +}; + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. 
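+ *
+ * For example, a userspace buffer address handed to the kernel through one
+ * of these structures would be carried as
+ *
+ *      (__u64)(unsigned long)buf
+ *
+ * rather than as a pointer-typed member (buf here is an illustrative name).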
+ */ + +enum ib_uverbs_exp_create_qp_comp_mask { + IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS = (1ULL << 0), + IB_UVERBS_EXP_CREATE_QP_INL_RECV = (1ULL << 1), + IB_UVERBS_EXP_CREATE_QP_QPG = (1ULL << 2) +}; + +struct ib_uverbs_qpg_init_attrib { + __u32 tss_child_count; + __u32 rss_child_count; +}; + +struct ib_uverbs_qpg { + __u32 qpg_type; + union { + struct { + __u32 parent_handle; + __u32 reserved; + }; + struct ib_uverbs_qpg_init_attrib parent_attrib; + }; + __u32 reserved2; +}; + +struct ib_uverbs_exp_create_qp { + __u64 comp_mask; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 qp_cap_flags; + __u32 max_inl_recv; + __u32 reserved1; + struct ib_uverbs_qpg qpg; + __u64 driver_data[0]; +}; + +enum ib_uverbs_exp_create_qp_resp_comp_mask { + IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV = (1ULL << 0), +}; + +struct ib_uverbs_exp_create_qp_resp { + __u64 comp_mask; + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 max_inl_recv; +}; + +struct ib_uverbs_create_dct { + __u64 comp_mask; + __u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 srq_handle; + __u32 access_flags; + __u32 flow_label; + __u64 dc_key; + __u8 min_rnr_timer; + __u8 tclass; + __u8 port; + __u8 pkey_index; + __u8 gid_index; + __u8 hop_limit; + __u8 mtu; + __u8 rsvd; + __u32 create_flags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_dct_resp { + __u32 dct_handle; + __u32 dctn; +}; + +struct ib_uverbs_destroy_dct { + __u64 comp_mask; + __u64 user_handle; +}; + +struct ib_uverbs_destroy_dct_resp { + __u64 reserved; +}; + +struct ib_uverbs_query_dct { + __u64 comp_mask; + __u64 dct_handle; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_dct_resp { + __u64 dc_key; + __u32 access_flags; + __u32 flow_label; + __u32 key_violations; + __u8 port; + __u8 min_rnr_timer; + __u8 tclass; + __u8 mtu; + __u8 pkey_index; + __u8 gid_index; + __u8 hop_limit; + __u8 state; + __u32 rsvd; + __u64 driver_data[0]; +}; + +struct ib_uverbs_exp_query_device { + __u64 comp_mask; + __u64 driver_data[0]; +}; + +struct ib_uverbs_exp_query_device_resp { + __u64 comp_mask; + struct ib_uverbs_query_device_resp base; + __u64 timestamp_mask; + __u64 hca_core_clock; + __u64 device_cap_flags2; + __u32 dc_rd_req; + __u32 dc_rd_res; + __u32 inline_recv_sz; + __u32 max_rss_tbl_sz; +}; + +#endif /* IB_USER_VERBS_EXP_H */ diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h index d167e42fa4ac..d2607c805811 100644 --- a/sys/ofed/include/rdma/ib_verbs.h +++ b/sys/ofed/include/rdma/ib_verbs.h @@ -48,10 +48,10 @@ #include #include #include +#include +#include #include -#include -#include extern struct workqueue_struct *ib_wq; @@ -68,12 +68,14 @@ enum rdma_node_type { RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, - RDMA_NODE_RNIC + RDMA_NODE_RNIC, + RDMA_NODE_MIC }; enum rdma_transport_type { RDMA_TRANSPORT_IB, - RDMA_TRANSPORT_IWARP + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_SCIF }; enum rdma_transport_type @@ -83,6 +85,7 @@ enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, + IB_LINK_LAYER_SCIF }; enum ib_device_cap_flags { @@ -120,7 +123,29 @@ enum ib_device_cap_flags { IB_DEVICE_SHARED_MR = (1<<24), IB_DEVICE_QPG = (1<<25), 
IB_DEVICE_UD_RSS = (1<<26), - IB_DEVICE_UD_TSS = (1<<27) + IB_DEVICE_UD_TSS = (1<<27), + IB_DEVICE_CROSS_CHANNEL = (1<<28), + IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), + /* + * Devices can set either IB_DEVICE_MEM_WINDOW_TYPE_2A or + * IB_DEVICE_MEM_WINDOW_TYPE_2B if it supports type 2A or type 2B + * memory windows. It can set neither to indicate it doesn't support + * type 2 windows at all. + */ + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<30), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<31), + IB_DEVICE_SIGNATURE_HANDOVER = (1LL<<32) +}; + +enum ib_signature_prot_cap { + IB_PROT_T10DIF_TYPE_1 = 1, + IB_PROT_T10DIF_TYPE_2 = 1 << 1, + IB_PROT_T10DIF_TYPE_3 = 1 << 2, +}; + +enum ib_signature_guard_cap { + IB_GUARD_T10DIF_CRC = 1, + IB_GUARD_T10DIF_CSUM = 1 << 1, }; enum ib_atomic_cap { @@ -129,6 +154,12 @@ enum ib_atomic_cap { IB_ATOMIC_GLOB }; +enum ib_cq_create_flags { + IB_CQ_CREATE_CROSS_CHANNEL = 1 << 0, + IB_CQ_TIMESTAMP = 1 << 1, + IB_CQ_TIMESTAMP_TO_SYS_TIME = 1 << 2 +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; @@ -139,7 +170,7 @@ struct ib_device_attr { u32 hw_ver; int max_qp; int max_qp_wr; - int device_cap_flags; + u64 device_cap_flags; int max_sge; int max_sge_rd; int max_cq; @@ -171,6 +202,16 @@ struct ib_device_attr { int max_rss_tbl_sz; u16 max_pkeys; u8 local_ca_ack_delay; + int comp_mask; + uint64_t timestamp_mask; + uint64_t hca_core_clock; + unsigned int sig_prot_cap; + unsigned int sig_guard_cap; +}; + +enum ib_device_attr_comp_mask { + IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1, + IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2 }; enum ib_mtu { @@ -199,7 +240,8 @@ enum ib_port_state { IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, - IB_PORT_ACTIVE_DEFER = 5 + IB_PORT_ACTIVE_DEFER = 5, + IB_PORT_DUMMY = -1 /* force enum signed */ }; enum ib_port_cap_flags { @@ -326,7 +368,6 @@ struct ib_port_attr { u8 active_width; u8 active_speed; u8 phys_state; - enum rdma_link_layer link_layer; }; enum ib_device_modify_flags { @@ -373,10 +414,6 @@ enum ib_event_type { IB_EVENT_GID_CHANGE, }; -enum ib_event_flags { - IB_XRC_QP_EVENT_FLAG = 0x80000000, -}; - struct ib_event { struct ib_device *device; union { @@ -384,7 +421,6 @@ struct ib_event { struct ib_qp *qp; struct ib_srq *srq; u8 port_num; - u32 xrc_qp_num; } element; enum ib_event_type event; }; @@ -450,6 +486,22 @@ enum ib_rate { IB_RATE_300_GBPS = 18 }; +enum ib_mr_create_flags { + IB_MR_SIGNATURE_EN = 1, +}; + +/** + * ib_mr_init_attr - Memory region init attributes passed to routine + * ib_create_mr. + * @max_reg_descriptors: max number of registration descriptors that + * may be used with registration work requests. + * @flags: MR creation flags bit mask. + */ +struct ib_mr_init_attr { + int max_reg_descriptors; + u32 flags; +}; + /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be @@ -465,6 +517,120 @@ int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; */ int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; +struct ib_cq_init_attr { + int cqe; + int comp_vector; + u32 flags; +}; + +enum ib_signature_type { + IB_SIG_TYPE_T10_DIF, +}; + +/** + * T10-DIF Signature types + * T10-DIF types are defined by SCSI + * specifications. + */ +enum ib_t10_dif_type { + IB_T10DIF_NONE, + IB_T10DIF_TYPE1, + IB_T10DIF_TYPE2, + IB_T10DIF_TYPE3 +}; + +/** + * Signature T10-DIF block-guard types + * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. + * IB_T10DIF_CSUM: Corresponds to IP checksum rules. 
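+ *
+ * As an example of how these guard types combine with the signature domain
+ * definitions below (all field values are illustrative only):
+ *
+ *      struct ib_sig_domain dom = {
+ *              .sig_type = IB_SIG_TYPE_T10_DIF,
+ *              .sig.dif = {
+ *                      .type    = IB_T10DIF_TYPE1,
+ *                      .bg_type = IB_T10DIF_CRC,
+ *              },
+ *      };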
+ */ +enum ib_t10_dif_bg_type { + IB_T10DIF_CRC, + IB_T10DIF_CSUM +}; + +/** + * struct ib_t10_dif_domain - Parameters specific for T10-DIF + * domain. + * @type: T10-DIF type (0|1|2|3) + * @bg_type: T10-DIF block guard type (CRC|CSUM) + * @pi_interval: protection information interval. + * @bg: seed of guard computation. + * @app_tag: application tag of guard block + * @ref_tag: initial guard block reference tag. + * @type3_inc_reftag: T10-DIF type 3 does not state + * about the reference tag, it is the user + * choice to increment it or not. + */ +struct ib_t10_dif_domain { + enum ib_t10_dif_type type; + enum ib_t10_dif_bg_type bg_type; + u32 pi_interval; + u16 bg; + u16 app_tag; + u32 ref_tag; + bool type3_inc_reftag; +}; + +/** + * struct ib_sig_domain - Parameters for signature domain + * @sig_type: specific signauture type + * @sig: union of all signature domain attributes that may + * be used to set domain layout. + */ +struct ib_sig_domain { + enum ib_signature_type sig_type; + union { + struct ib_t10_dif_domain dif; + } sig; +}; + +/** + * struct ib_sig_attrs - Parameters for signature handover operation + * @check_mask: bitmask for signature byte check (8 bytes) + * @mem: memory domain layout desciptor. + * @wire: wire domain layout desciptor. + */ +struct ib_sig_attrs { + u8 check_mask; + struct ib_sig_domain mem; + struct ib_sig_domain wire; +}; + +enum ib_sig_err_type { + IB_SIG_BAD_GUARD, + IB_SIG_BAD_REFTAG, + IB_SIG_BAD_APPTAG, +}; + +/** + * struct ib_sig_err - signature error descriptor + */ +struct ib_sig_err { + enum ib_sig_err_type err_type; + u32 expected; + u32 actual; + u64 sig_err_offset; + u32 key; +}; + +enum ib_mr_status_check { + IB_MR_CHECK_SIG_STATUS = 1, +}; + +/** + * struct ib_mr_status - Memory region status container + * + * @fail_status: Bitmask of MR checks status. For each + * failed check a corresponding status bit is set. + * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS + * failure. + */ +struct ib_mr_status { + u32 fail_status; + struct ib_sig_err sig_err; +}; + /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. @@ -480,6 +646,8 @@ struct ib_ah_attr { u8 static_rate; u8 ah_flags; u8 port_num; + u8 dmac[6]; + u16 vlan_id; }; enum ib_wc_status { @@ -532,6 +700,11 @@ enum ib_wc_flags { IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), + IB_WC_WITH_SL = (1<<4), + IB_WC_WITH_SLID = (1<<5), + IB_WC_WITH_TIMESTAMP = (1<<6), + IB_WC_WITH_SMAC = (1<<7), + IB_WC_WITH_VLAN = (1<<8), }; struct ib_wc { @@ -553,6 +726,11 @@ struct ib_wc { u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ int csum_ok; + struct { + uint64_t timestamp; /* timestamp = 0 indicates error*/ + } ts; + u8 smac[6]; + u16 vlan_id; }; enum ib_cq_notify_flags { @@ -618,19 +796,37 @@ enum ib_qp_type { IB_QPT_RC, IB_QPT_UC, IB_QPT_UD, - IB_QPT_XRC, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = 8, IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, + IB_QPT_DC_INI, IB_QPT_MAX, + /* Reserve a range for qp types internal to the low level driver. 
+ * These qp types will not be visible at the IB core layer, so the + * IB_QPT_MAX usages should not be affected in the core layer + */ + IB_QPT_RESERVED1 = 0x1000, + IB_QPT_RESERVED2, + IB_QPT_RESERVED3, + IB_QPT_RESERVED4, + IB_QPT_RESERVED5, + IB_QPT_RESERVED6, + IB_QPT_RESERVED7, + IB_QPT_RESERVED8, + IB_QPT_RESERVED9, + IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, - IB_QP_CREATE_NETIF_QP = 1 << 2, + IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, + IB_QP_CREATE_MANAGED_SEND = 1 << 3, + IB_QP_CREATE_MANAGED_RECV = 1 << 4, + IB_QP_CREATE_NETIF_QP = 1 << 5, + IB_QP_CREATE_SIGNATURE_EN = 1 << 6, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -659,7 +855,7 @@ struct ib_qp_init_attr { union { struct ib_qp *qpg_parent; /* see qpg_type */ struct ib_qpg_init_attrib parent_attrib; - } pp; + }; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; @@ -667,6 +863,43 @@ struct ib_qp_init_attr { u8 port_num; /* special QP types only */ }; +enum { + IB_DCT_CREATE_FLAG_RCV_INLINE = 1 << 0, + IB_DCT_CREATE_FLAGS_MASK = IB_DCT_CREATE_FLAG_RCV_INLINE, +}; + +struct ib_dct_init_attr { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_srq *srq; + u64 dc_key; + u8 port; + u32 access_flags; + u8 min_rnr_timer; + u8 tclass; + u32 flow_label; + enum ib_mtu mtu; + u8 pkey_index; + u8 gid_index; + u8 hop_limit; + u32 create_flags; +}; + +struct ib_dct_attr { + u64 dc_key; + u8 port; + u32 access_flags; + u8 min_rnr_timer; + u8 tclass; + u32 flow_label; + enum ib_mtu mtu; + u8 pkey_index; + u8 gid_index; + u8 hop_limit; + u32 key_violations; + u8 state; +}; + struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; @@ -731,7 +964,12 @@ enum ib_qp_attr_mask { IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), - IB_QP_GROUP_RSS = (1<<21) + IB_QP_GROUP_RSS = (1<<21), + IB_QP_DC_KEY = (1<<22), + IB_QP_SMAC = (1<<23), + IB_QP_ALT_SMAC = (1<<24), + IB_QP_VID = (1<<25), + IB_QP_ALT_VID = (1<<26) }; enum ib_qp_state { @@ -741,7 +979,8 @@ enum ib_qp_state { IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, - IB_QPS_ERR + IB_QPS_ERR, + IB_QPS_DUMMY = -1 /* force enum signed */ }; enum ib_mig_state { @@ -750,6 +989,11 @@ enum ib_mig_state { IB_MIG_ARMED }; +enum ib_mw_type { + IB_MW_TYPE_1 = 1, + IB_MW_TYPE_2 = 2 +}; + struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; @@ -776,6 +1020,40 @@ struct ib_qp_attr { u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; + u8 smac[ETH_ALEN]; + u8 alt_smac[ETH_ALEN]; + u16 vlan_id; + u16 alt_vlan_id; + +}; + +struct ib_qp_attr_ex { + enum ib_qp_state qp_state; + enum ib_qp_state cur_qp_state; + enum ib_mtu path_mtu; + enum ib_mig_state path_mig_state; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + u32 dest_qp_num; + int qp_access_flags; + struct ib_qp_cap cap; + struct ib_ah_attr ah_attr; + struct ib_ah_attr alt_ah_attr; + u16 pkey_index; + u16 alt_pkey_index; + u8 en_sqd_async_notify; + u8 sq_draining; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 min_rnr_timer; + u8 port_num; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 alt_port_num; + u8 alt_timeout; + u64 dct_key; }; enum ib_wr_opcode { @@ -787,13 +1065,27 @@ enum ib_wr_opcode { IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, - IB_WR_BIG_LSO, IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, IB_WR_FAST_REG_MR, 
IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_BIND_MW, + IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. + * These values will not be used at all in the ib core layer. + */ + IB_WR_RESERVED1 = 0xf0, + IB_WR_RESERVED2, + IB_WR_RESERVED3, + IB_WR_RESERVED4, + IB_WR_RESERVED5, + IB_WR_RESERVED6, + IB_WR_RESERVED7, + IB_WR_RESERVED8, + IB_WR_RESERVED9, + IB_WR_RESERVED10, }; enum ib_send_flags { @@ -801,21 +1093,12 @@ enum ib_send_flags { IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), - IB_SEND_IP_CSUM = (1<<4) -}; + IB_SEND_IP_CSUM = (1<<4), -enum ib_flow_types { - IB_FLOW_ETH = 0, - IB_FLOW_IB_UC = 1, - IB_FLOW_IB_MC_IPV4 = 2, - IB_FLOW_IB_MC_IPV6 = 3 -}; - -enum { - IB_FLOW_L4_NONE = 0, - IB_FLOW_L4_OTHER = 3, - IB_FLOW_L4_UDP = 5, - IB_FLOW_L4_TCP = 6 + /* reserve bits 26-31 for low level drivers' internal use */ + IB_SEND_RESERVED_START = (1 << 26), + IB_SEND_RESERVED_END = (1 << 31), + IB_SEND_UMR_UNREG = (1<<5) }; struct ib_sge { @@ -830,6 +1113,23 @@ struct ib_fast_reg_page_list { unsigned int max_page_list_len; }; +/** + * struct ib_mw_bind_info - Parameters for a memory window bind operation. + * @mr: A memory region to bind the memory window to. + * @addr: The address where the memory window should begin. + * @length: The length of the memory window, in bytes. + * @mw_access_flags: Access flags from enum ib_access_flags for the window. + * + * This struct contains the shared parameters for type 1 and type 2 + * memory window bind operations. + */ +struct ib_mw_bind_info { + struct ib_mr *mr; + u64 addr; + u64 length; + int mw_access_flags; +}; + struct ib_send_wr { struct ib_send_wr *next; u64 wr_id; @@ -874,10 +1174,26 @@ struct ib_send_wr { u32 rkey; } fast_reg; struct { - struct ib_unpacked_lrh *lrh; - u32 eth_type; - u8 static_rate; - } raw_ety; + int npages; + int access_flags; + u32 mkey; + struct ib_pd *pd; + u64 virt_addr; + u64 length; + int page_shift; + } umr; + struct { + struct ib_mw *mw; + /* The new rkey for the memory window. */ + u32 rkey; + struct ib_mw_bind_info bind_info; + } bind_mw; + struct { + struct ib_sig_attrs *sig_attrs; + struct ib_mr *sig_mr; + int access_flags; + struct ib_sge *prot; + } sig_handover; } wr; u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; @@ -896,13 +1212,7 @@ enum ib_access_flags { IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), IB_ACCESS_ALLOCATE_MR = (1<<5), - IB_ACCESS_SHARED_MR_USER_READ = (1<<6), - IB_ACCESS_SHARED_MR_USER_WRITE = (1<<7), - IB_ACCESS_SHARED_MR_GROUP_READ = (1<<8), - IB_ACCESS_SHARED_MR_GROUP_WRITE = (1<<9), - IB_ACCESS_SHARED_MR_OTHER_READ = (1<<10), - IB_ACCESS_SHARED_MR_OTHER_WRITE = (1<<11) - + IB_ZERO_BASED = (1<<13) }; struct ib_phys_buf { @@ -925,13 +1235,16 @@ enum ib_mr_rereg_flags { IB_MR_REREG_ACCESS = (1<<2) }; +/** + * struct ib_mw_bind - Parameters for a type 1 memory window bind operation. + * @wr_id: Work request id. + * @send_flags: Flags from ib_send_flags enum. + * @bind_info: More parameters of the bind operation. 
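+ *
+ * A type 1 bind using this structure would typically be filled in and
+ * posted as follows (qp, mw, mr, addr, len and bind are illustrative
+ * local names):
+ *
+ *      struct ib_mw_bind bind = { .wr_id = 1, .send_flags = 0 };
+ *
+ *      bind.bind_info.mr = mr;
+ *      bind.bind_info.addr = addr;
+ *      bind.bind_info.length = len;
+ *      bind.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
+ *      ret = ib_bind_mw(qp, mw, &bind);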
+ */ struct ib_mw_bind { - struct ib_mr *mr; u64 wr_id; - u64 addr; - u32 length; int send_flags; - int mw_access_flags; + struct ib_mw_bind_info bind_info; }; struct ib_fmr_attr { @@ -950,7 +1263,11 @@ struct ib_ucontext { struct list_head srq_list; struct list_head ah_list; struct list_head xrcd_list; + struct list_head rule_list; + struct list_head dct_list; int closing; + void *peer_mem_private_data; + char *peer_mem_name; }; struct ib_uobject { @@ -964,19 +1281,22 @@ struct ib_uobject { int live; }; +struct ib_udata; +struct ib_udata_ops { + int (*copy_from)(void *dest, struct ib_udata *udata, + size_t len); + int (*copy_to)(struct ib_udata *udata, void *src, + size_t len); +}; + struct ib_udata { + struct ib_udata_ops *ops; void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; -struct ib_uxrc_rcv_object { - struct list_head list; /* link to context's list */ - u32 qp_num; - u32 domain_handle; -}; - struct ib_pd { struct ib_device *device; struct ib_uobject *uobject; @@ -985,10 +1305,8 @@ struct ib_pd { struct ib_xrcd { struct ib_device *device; - struct ib_uobject *uobject; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; - struct rb_node node; struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; @@ -1000,6 +1318,23 @@ struct ib_ah { struct ib_uobject *uobject; }; +enum ib_cq_attr_mask { + IB_CQ_MODERATION = (1 << 0), + IB_CQ_CAP_FLAGS = (1 << 1) +}; + +enum ib_cq_cap_flags { + IB_CQ_IGNORE_OVERRUN = (1 << 0) +}; + +struct ib_cq_attr { + struct { + u16 cq_count; + u16 cq_period; + } moderation; + u32 cq_cap_flags; +}; + typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); struct ib_cq { @@ -1038,7 +1373,8 @@ struct ib_qp { struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; - atomic_t usecnt; /* count times opened, mcast attaches */ + /* count times opened, mcast attaches, flow attaches */ + atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uobject *uobject; @@ -1047,6 +1383,16 @@ struct ib_qp { u32 qp_num; enum ib_qp_type qp_type; enum ib_qpg_type qpg_type; + u8 port_num; +}; + +struct ib_dct { + struct ib_device *device; + struct ib_uobject *uobject; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_srq *srq; + u32 dct_num; }; struct ib_mr { @@ -1063,6 +1409,7 @@ struct ib_mw { struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; + enum ib_mw_type type; }; struct ib_fmr { @@ -1073,30 +1420,128 @@ struct ib_fmr { u32 rkey; }; -struct ib_flow_spec { - enum ib_flow_types type; - union { - struct { - __be16 ethertype; - __be16 vlan; - u8 vlan_present; - u8 mac[6]; - u8 port; - } eth; - struct { - __be32 qpn; - } ib_uc; - struct { - u8 mgid[16]; - } ib_mc; - } l2_id; +/* Supported steering options */ +enum ib_flow_attr_type { + /* steering according to rule specifications */ + IB_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IB_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IB_FLOW_ATTR_MC_DEFAULT = 0x2, + /* sniffer rule - receive all port traffic */ + IB_FLOW_ATTR_SNIFFER = 0x3 +}; + +/* Supported steering header types */ +enum ib_flow_spec_type { + /* L2 headers*/ + IB_FLOW_SPEC_ETH = 0x20, + IB_FLOW_SPEC_IB = 0x21, + /* L3 header*/ + IB_FLOW_SPEC_IPV4 = 0x30, + /* L4 headers*/ + IB_FLOW_SPEC_TCP = 0x40, + IB_FLOW_SPEC_UDP = 0x41 +}; + +#define IB_FLOW_SPEC_SUPPORT_LAYERS 4 + +/* 
Flow steering rule priority is set according to it's domain. + * Lower domain value means higher priority. + */ +enum ib_flow_domain { + IB_FLOW_DOMAIN_USER, + IB_FLOW_DOMAIN_ETHTOOL, + IB_FLOW_DOMAIN_RFS, + IB_FLOW_DOMAIN_NIC, + IB_FLOW_DOMAIN_NUM /* Must be last */ +}; + +enum ib_flow_flags { + IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1 +}; + +struct ib_flow_eth_filter { + u8 dst_mac[6]; + u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; +}; + +struct ib_flow_spec_eth { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_eth_filter val; + struct ib_flow_eth_filter mask; +}; + +struct ib_flow_ib_filter { + __be32 l3_type_qpn; + u8 dst_gid[16]; +}; + +struct ib_flow_spec_ib { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ib_filter val; + struct ib_flow_ib_filter mask; +}; + +struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; - __be16 src_port; +}; + +struct ib_flow_spec_ipv4 { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ipv4_filter val; + struct ib_flow_ipv4_filter mask; +}; + +struct ib_flow_tcp_udp_filter { __be16 dst_port; - u8 l4_protocol; - u8 block_mc_loopback; - u8 rule_type; + __be16 src_port; +}; + +struct ib_flow_spec_tcp_udp { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_tcp_udp_filter val; + struct ib_flow_tcp_udp_filter mask; +}; + +union ib_flow_spec { + struct { + enum ib_flow_spec_type type; + u16 size; + }; + struct ib_flow_spec_ib ib; + struct ib_flow_spec_eth eth; + struct ib_flow_spec_ipv4 ipv4; + struct ib_flow_spec_tcp_udp tcp_udp; +}; + +struct ib_flow_attr { + enum ib_flow_attr_type type; + u16 size; + u16 priority; + u8 num_of_specs; + u8 port; + u32 flags; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ +}; + +struct ib_flow { + struct ib_qp *qp; + struct ib_uobject *uobject; }; struct ib_mad; @@ -1125,6 +1570,15 @@ struct ib_cache { u8 *lmc_cache; }; +enum verbs_values_mask { + IBV_VALUES_HW_CLOCK = 1 << 0 +}; + +struct ib_device_values { + int values_mask; + uint64_t hwclock; +}; + struct ib_dma_mapping_ops { int (*mapping_error)(struct ib_device *dev, u64 dma_addr); @@ -1169,6 +1623,8 @@ struct ib_dma_mapping_ops { }; struct iw_cm_verbs; +struct ib_exp_device_attr; +struct ib_exp_qp_init_attr; struct ib_device { struct device *dma_device; @@ -1257,12 +1713,13 @@ struct ib_device { int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); - struct ib_cq * (*create_cq)(struct ib_device *device, int cqe, - int comp_vector, + struct ib_cq * (*create_cq)(struct ib_device *device, + struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); - int (*modify_cq)(struct ib_cq *cq, u16 cq_count, - u16 cq_period); + int (*modify_cq)(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask); int (*destroy_cq)(struct ib_cq *cq); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); @@ -1289,6 +1746,9 @@ struct ib_device { int (*query_mr)(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); + int (*destroy_mr)(struct ib_mr *mr); + struct ib_mr * (*create_mr)(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, @@ -1301,7 +1761,8 @@ struct ib_device { int num_phys_buf, int mr_access_flags, u64 *iova_start); - struct ib_mw * (*alloc_mw)(struct ib_pd *pd); + 
struct ib_mw * (*alloc_mw)(struct ib_pd *pd, + enum ib_mw_type type); int (*bind_mw)(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind); @@ -1327,43 +1788,28 @@ struct ib_device { struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad); - struct ib_srq * (*create_xrc_srq)(struct ib_pd *pd, - struct ib_cq *xrc_cq, - struct ib_xrcd *xrcd, - struct ib_srq_init_attr *srq_init_attr, - struct ib_udata *udata); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd); - int (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr, - u32 *qp_num); - int (*modify_xrc_rcv_qp)(struct ib_xrcd *xrcd, - u32 qp_num, - struct ib_qp_attr *attr, - int attr_mask); - int (*query_xrc_rcv_qp)(struct ib_xrcd *xrcd, - u32 qp_num, - struct ib_qp_attr *attr, - int attr_mask, - struct ib_qp_init_attr *init_attr); - int (*reg_xrc_rcv_qp)(struct ib_xrcd *xrcd, - void *context, - u32 qp_num); - int (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd, - void *context, - u32 qp_num); - int (*attach_flow)(struct ib_qp *qp, - struct ib_flow_spec *spec, - int priority); - int (*detach_flow)(struct ib_qp *qp, - struct ib_flow_spec *spec, - int priority); + struct ib_flow * (*create_flow)(struct ib_qp *qp, + struct ib_flow_attr + *flow_attr, + int domain); + int (*destroy_flow)(struct ib_flow *flow_id); + int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status); unsigned long (*get_unmapped_area)(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); + int (*ioctl)(struct ib_ucontext *context, + unsigned int cmd, + unsigned long arg); + int (*query_values)(struct ib_device *device, + int q_values, + struct ib_device_values *values); struct ib_dma_mapping_ops *dma_ops; struct module *owner; @@ -1379,14 +1825,33 @@ struct ib_device { int uverbs_abi_ver; u64 uverbs_cmd_mask; + u64 uverbs_ex_cmd_mask; char node_desc[64]; __be64 node_guid; u32 local_dma_lkey; u8 node_type; u8 phys_port_cnt; - struct rb_root ib_uverbs_xrcd_table; - struct mutex xrcd_table_mutex; + int cmd_perf; + u64 cmd_avg; + u32 cmd_n; + spinlock_t cmd_perf_lock; + + /* + * Experimental data and functions + */ + int (*exp_query_device)(struct ib_device *device, + struct ib_exp_device_attr *device_attr); + struct ib_qp * (*exp_create_qp)(struct ib_pd *pd, + struct ib_exp_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + struct ib_dct * (*exp_create_dct)(struct ib_pd *pd, + struct ib_dct_init_attr *attr, + struct ib_udata *udata); + int (*exp_destroy_dct)(struct ib_dct *dct); + int (*exp_query_dct)(struct ib_dct *dct, struct ib_dct_attr *attr); + + u64 uverbs_exp_cmd_mask; }; struct ib_client { @@ -1414,12 +1879,12 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { - return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; + return udata->ops->copy_from(dest, udata, len); } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { - return copy_to_user(udata->outbuf, src, len) ? 
-EFAULT : 0; + return udata->ops->copy_to(udata, src, len); } /** @@ -1430,6 +1895,7 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes + * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It @@ -1438,7 +1904,8 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len * and that the attribute mask supplied is allowed for the transition. */ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask); + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll); int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); @@ -1551,26 +2018,6 @@ int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); */ int ib_destroy_ah(struct ib_ah *ah); -/** - * ib_create_xrc_srq - Creates an XRC SRQ associated with the specified - * protection domain, cq, and xrc domain. - * @pd: The protection domain associated with the SRQ. - * @xrc_cq: The cq to be associated with the XRC SRQ. - * @xrcd: The XRC domain to be associated with the XRC SRQ. - * @srq_init_attr: A list of initial attributes required to create the - * XRC SRQ. If XRC SRQ creation succeeds, then the attributes are updated - * to the actual capabilities of the created XRC SRQ. - * - * srq_attr->max_wr and srq_attr->max_sge are read the determine the - * requested size of the XRC SRQ, and set to the actual values allocated - * on return. If ib_create_xrc_srq() succeeds, then max_wr and max_sge - * will always be at least as large as the requested values. - */ -struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, - struct ib_cq *xrc_cq, - struct ib_xrcd *xrcd, - struct ib_srq_init_attr *srq_init_attr); - /** * ib_create_srq - Creates a SRQ associated with the specified protection * domain. @@ -1732,13 +2179,6 @@ static inline int ib_post_recv(struct ib_qp *qp, return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } -/* - * IB_CQ_VECTOR_LEAST_ATTACHED: The constant specifies that - * the CQ will be attached to the completion vector that has - * the least number of CQs already attached to it. - */ -#define IB_CQ_VECTOR_LEAST_ATTACHED 0xffffffff - /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. @@ -1769,13 +2209,16 @@ struct ib_cq *ib_create_cq(struct ib_device *device, int ib_resize_cq(struct ib_cq *cq, int cqe); /** - * ib_modify_cq - Modifies moderation params of the CQ + * ib_modify_cq - Modifies the attributes for the specified CQ and then + * transitions the CQ to the given state. * @cq: The CQ to modify. - * @cq_count: number of CQEs that will trigger an event - * @cq_period: max period of time in usec before triggering an event - * + * @cq_attr: specifies the CQ attributes to modify. + * @cq_attr_mask: A bit-mask used to specify which attributes of the CQ + * are being modified. */ -int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int ib_modify_cq(struct ib_cq *cq, + struct ib_cq_attr *cq_attr, + int cq_attr_mask); /** * ib_destroy_cq - Destroys the specified CQ. @@ -2179,9 +2622,30 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. 
* @mr: The memory region to deregister. + * + * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr(struct ib_mr *mr); + +/** + * ib_create_mr - Allocates a memory region that may be used for + * signature handover operations. + * @pd: The protection domain associated with the region. + * @mr_init_attr: memory region init attributes. + */ +struct ib_mr *ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); + +/** + * ib_destroy_mr - Destroys a memory region that was created using + * ib_create_mr and removes it from HW translation tables. + * @mr: The memory region to destroy. + * + * This function can fail, if the memory region has memory windows bound to it. + */ +int ib_destroy_mr(struct ib_mr *mr); + /** * ib_alloc_fast_reg_mr - Allocates memory region usable with the * IB_WR_FAST_REG_MR send work request. @@ -2230,11 +2694,23 @@ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) mr->rkey = (mr->rkey & 0xffffff00) | newkey; } +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} + /** * ib_alloc_mw - Allocates a memory window. * @pd: The protection domain associated with the memory window. + * @type: The type of the memory window (1 or 2). */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd); +struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); /** * ib_bind_mw - Posts a work request to the send queue of the specified @@ -2244,6 +2720,10 @@ struct ib_mw *ib_alloc_mw(struct ib_pd *pd); * @mw: The memory window to bind. * @mw_bind: Specifies information about the memory window, including * its address range, remote access rights, and associated memory region. + * + * If there is no immediate error, the function will update the rkey member + * of the mw parameter to its new value. The bind operation can still fail + * asynchronously. 
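/*
 * Illustrative sketch, not part of the patch: allocate a type 2 memory
 * window with the extended ib_alloc_mw() and derive the rkey to use for
 * its next binding with ib_inc_rkey() above.  How the new rkey is handed
 * to the hardware (for example in a bind work request) is driver and
 * transport specific and omitted here; error handling is abbreviated.
 */
static int example_alloc_type2_mw(struct ib_pd *pd, struct ib_mw **mwp,
				  u32 *next_rkey)
{
	struct ib_mw *mw;

	mw = ib_alloc_mw(pd, IB_MW_TYPE_2);
	if (IS_ERR(mw))
		return PTR_ERR(mw);

	/* The consumer chooses the key portion of a type 2 window's rkey. */
	*next_rkey = ib_inc_rkey(mw->rkey);
	*mwp = mw;
	return 0;
}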
*/ static inline int ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, @@ -2334,7 +2814,77 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); */ int ib_dealloc_xrcd(struct ib_xrcd *xrcd); -int ib_attach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); -int ib_detach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, int domain); +int ib_destroy_flow(struct ib_flow *flow_id); + +struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr, + struct ib_udata *udata); +int ib_destroy_dct(struct ib_dct *dct); +int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr); + +int ib_query_values(struct ib_device *device, + int q_values, struct ib_device_values *values); + +static inline void ib_active_speed_enum_to_rate(u8 active_speed, + int *rate, + char **speed) +{ + switch (active_speed) { + case IB_SPEED_DDR: + *speed = " DDR"; + *rate = 50; + break; + case IB_SPEED_QDR: + *speed = " QDR"; + *rate = 100; + break; + case IB_SPEED_FDR10: + *speed = " FDR10"; + *rate = 100; + break; + case IB_SPEED_FDR: + *speed = " FDR"; + *rate = 140; + break; + case IB_SPEED_EDR: + *speed = " EDR"; + *rate = 250; + break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + *rate = 25; + break; + } + +} + +static inline int ib_check_mr_access(int flags) +{ + /* + * Local write permission is required if remote write or + * remote atomic permission is also requested. + */ + if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && + !(flags & IB_ACCESS_LOCAL_WRITE)) + return -EINVAL; + + return 0; +} + +/** + * ib_check_mr_status: lightweight check of MR status. + * This routine may provide status checks on a selected + * ib_mr. first use is for signature status check. + * + * @mr: A memory region. + * @check_mask: Bitmask of which checks to perform from + * ib_mr_status_check enumeration. + * @mr_status: The container of relevant status checks. + * failed checks will be indicated in the status bitmask + * and the relevant info shall be in the error item. + */ +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status); #endif /* IB_VERBS_H */ diff --git a/sys/ofed/include/rdma/ib_verbs_exp.h b/sys/ofed/include/rdma/ib_verbs_exp.h new file mode 100644 index 000000000000..ca5b84b5c76d --- /dev/null +++ b/sys/ofed/include/rdma/ib_verbs_exp.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
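/*
 * Illustrative sketch, not part of the patch: rejecting inconsistent
 * access flags with ib_check_mr_access() (added above in ib_verbs.h) at
 * the start of a driver's reg_user_mr path instead of open-coding the
 * rule that remote write/atomic access requires local write.  The
 * function name and body are hypothetical.
 */
static struct ib_mr *example_reg_user_mr(struct ib_pd *pd, u64 start,
    u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata)
{
	int err;

	err = ib_check_mr_access(access_flags);
	if (err)
		return ERR_PTR(err);

	/* ... pin the user pages and program the HCA as usual ... */
	return ERR_PTR(-ENOSYS);	/* placeholder in this sketch */
}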
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_VERBS_EXP_H +#define IB_VERBS_EXP_H + +#include + + +enum ib_exp_device_cap_flags2 { + IB_EXP_DEVICE_DC_TRANSPORT = 1 << 0, + IB_EXP_DEVICE_QPG = 1 << 1, + IB_EXP_DEVICE_UD_RSS = 1 << 2, + IB_EXP_DEVICE_UD_TSS = 1 << 3 +}; + +enum ib_exp_device_attr_comp_mask { + IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1, + IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2, + IB_EXP_DEVICE_ATTR_CAP_FLAGS2 = 1ULL << 3, + IB_EXP_DEVICE_ATTR_DC_REQ_RD = 1ULL << 4, + IB_EXP_DEVICE_ATTR_DC_RES_RD = 1ULL << 5, + IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ = 1ULL << 6, + IB_EXP_DEVICE_ATTR_RSS_TBL_SZ = 1ULL << 7, +}; + +struct ib_exp_device_attr { + struct ib_device_attr base; + /* Use IB_EXP_DEVICE_ATTR_... 
for exp_comp_mask */ + uint32_t exp_comp_mask; + uint64_t device_cap_flags2; + uint32_t dc_rd_req; + uint32_t dc_rd_res; + uint32_t inline_recv_sz; + uint32_t max_rss_tbl_sz; +}; + +struct ib_exp_qp_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct ib_qp_cap cap; + union { + struct ib_qp *qpg_parent; /* see qpg_type */ + struct ib_qpg_init_attrib parent_attrib; + }; + enum ib_sig_type sq_sig_type; + enum ib_qp_type qp_type; + enum ib_qp_create_flags create_flags; + enum ib_qpg_type qpg_type; + u8 port_num; /* special QP types only */ + u32 max_inl_recv; +}; + + +int ib_exp_query_device(struct ib_device *device, + struct ib_exp_device_attr *device_attr); + + + + +#endif /* IB_VERBS_EXP_H */ diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h index 412320e0899a..271c2f832eef 100644 --- a/sys/ofed/include/rdma/iw_cm.h +++ b/sys/ofed/include/rdma/iw_cm.h @@ -46,24 +46,17 @@ enum iw_cm_event_type { IW_CM_EVENT_CLOSE /* close complete */ }; -enum iw_cm_event_status { - IW_CM_EVENT_STATUS_OK = 0, /* request successful */ - IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted */ - IW_CM_EVENT_STATUS_REJECTED, /* connect request rejected */ - IW_CM_EVENT_STATUS_TIMEOUT, /* the operation timed out */ - IW_CM_EVENT_STATUS_RESET, /* reset from remote peer */ - IW_CM_EVENT_STATUS_EINVAL, /* asynchronous failure for bad parm */ -}; - struct iw_cm_event { enum iw_cm_event_type event; - enum iw_cm_event_status status; + int status; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; void *private_data; - u8 private_data_len; void *provider_data; + u8 private_data_len; struct socket *so; + u8 ord; + u8 ird; }; /** diff --git a/sys/ofed/include/rdma/peer_mem.h b/sys/ofed/include/rdma/peer_mem.h new file mode 100644 index 000000000000..85658831299c --- /dev/null +++ b/sys/ofed/include/rdma/peer_mem.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
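/*
 * Illustrative sketch, not part of the patch: querying the experimental
 * device attributes declared above and testing the DC transport
 * capability bit.  "example_has_dc" is a hypothetical helper.
 */
static int example_has_dc(struct ib_device *device)
{
	struct ib_exp_device_attr attr;
	int err;

	err = ib_exp_query_device(device, &attr);
	if (err)
		return err;

	if ((attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) &&
	    (attr.device_cap_flags2 & IB_EXP_DEVICE_DC_TRANSPORT))
		return 1;	/* DC transport supported */

	return 0;
}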
+ */ + +#if !defined(PEER_MEM_H) +#define PEER_MEM_H + +#include +#include +#include +#include +#include + + +#define IB_PEER_MEMORY_NAME_MAX 64 +#define IB_PEER_MEMORY_VER_MAX 16 + +struct peer_memory_client { + char name[IB_PEER_MEMORY_NAME_MAX]; + char version[IB_PEER_MEMORY_VER_MAX]; + /* acquire return code: 1 mine, 0 - not mine */ + int (*acquire) (unsigned long addr, size_t size, void *peer_mem_private_data, + char *peer_mem_name, void **client_context); + int (*get_pages) (unsigned long addr, + size_t size, int write, int force, + struct sg_table *sg_head, + void *client_context, void *core_context); + int (*dma_map) (struct sg_table *sg_head, void *client_context, + struct device *dma_device, int dmasync, int *nmap); + int (*dma_unmap) (struct sg_table *sg_head, void *client_context, + struct device *dma_device); + void (*put_pages) (struct sg_table *sg_head, void *client_context); + unsigned long (*get_page_size) (void *client_context); + void (*release) (void *client_context); + +}; + +typedef int (*invalidate_peer_memory)(void *reg_handle, + void *core_context); + +void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, + invalidate_peer_memory *invalidate_callback); +void ib_unregister_peer_memory_client(void *reg_handle); + +#endif diff --git a/sys/ofed/include/rdma/rdma_cm.h b/sys/ofed/include/rdma/rdma_cm.h index c6b2962315b3..d69926172492 100644 --- a/sys/ofed/include/rdma/rdma_cm.h +++ b/sys/ofed/include/rdma/rdma_cm.h @@ -59,15 +59,26 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT + RDMA_CM_EVENT_TIMEWAIT_EXIT, + RDMA_CM_EVENT_ALT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ALT_ROUTE_ERROR, + RDMA_CM_EVENT_LOAD_ALT_PATH, + RDMA_CM_EVENT_ALT_PATH_LOADED, }; enum rdma_port_space { RDMA_PS_SDP = 0x0001, RDMA_PS_IPOIB = 0x0002, + RDMA_PS_IB = 0x013F, RDMA_PS_TCP = 0x0106, RDMA_PS_UDP = 0x0111, - RDMA_PS_SCTP = 0x0183 +}; + +enum alt_path_type { + RDMA_ALT_PATH_NONE, + RDMA_ALT_PATH_PORT, + RDMA_ALT_PATH_LID, + RDMA_ALT_PATH_BEST }; struct rdma_addr { @@ -101,6 +112,7 @@ struct rdma_ud_param { struct ib_ah_attr ah_attr; u32 qp_num; u32 qkey; + u8 alt_path_index; }; struct rdma_cm_event { @@ -112,6 +124,20 @@ struct rdma_cm_event { } param; }; +enum rdma_cm_state { + RDMA_CM_IDLE, + RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY, + RDMA_CM_ROUTE_RESOLVED, + RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT, + RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN, + RDMA_CM_DEVICE_REMOVAL, + RDMA_CM_DESTROYING +}; + struct rdma_cm_id; /** @@ -131,7 +157,9 @@ struct rdma_cm_id { rdma_cm_event_handler event_handler; struct rdma_route route; enum rdma_port_space ps; + enum ib_qp_type qp_type; u8 port_num; + void *ucontext; }; /** @@ -141,9 +169,11 @@ struct rdma_cm_id { * returned rdma_id. * @context: User specified context associated with the id. * @ps: RDMA port space. + * @qp_type: type of queue pair associated with the id. */ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps); + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type); /** * rdma_destroy_id - Destroys an RDMA identifier. @@ -191,6 +221,19 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, */ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +/** + * rdma_enable_apm - Get ready to use APM for the given ID. 
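/*
 * Illustrative sketch, not part of the patch: the registration sequence
 * for the peer memory interface declared in peer_mem.h above.  The
 * client name and all callback implementations are hypothetical; only
 * the wiring through ib_register_peer_memory_client() and
 * ib_unregister_peer_memory_client() is shown.
 */
static struct peer_memory_client example_peer_client = {
	.name		= "example_peer",
	.version	= "1.0",
	/*
	 * .acquire, .get_pages, .dma_map, .dma_unmap, .put_pages,
	 * .get_page_size and .release would point at the client's
	 * implementations; they are omitted from this sketch.
	 */
};

static invalidate_peer_memory example_invalidate;
static void *example_reg_handle;

static int example_peer_register(void)
{
	example_reg_handle = ib_register_peer_memory_client(
	    &example_peer_client, &example_invalidate);
	return example_reg_handle != NULL ? 0 : -EINVAL;
}

static void example_peer_unregister(void)
{
	ib_unregister_peer_memory_client(example_reg_handle);
}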
+ * Actual Alternate path discovery and load will take place only + * after a connection has been established. + * + * Calling this function only has an effect on the connection's client side. + * It should be called after rdma_resolve_route and before rdma_connect. + * + * @id: RDMA identifier. + * @alt_type: Alternate path type to resolve. + */ +int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type); + /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. @@ -330,4 +373,32 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); */ void rdma_set_service_type(struct rdma_cm_id *id, int tos); +/** + * rdma_set_reuseaddr - Allow the reuse of local addresses when binding + * the rdma_cm_id. + * @id: Communication identifier to configure. + * @reuse: Value indicating if the bound address is reusable. + * + * Reuse must be set before an address is bound to the id. + */ +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); + +/** + * rdma_set_afonly - Specify that listens are restricted to the + * bound address family only. + * @id: Communication identifer to configure. + * @afonly: Value indicating if listens are restricted. + * + * Must be set before identifier is in the listening state. + */ +int rdma_set_afonly(struct rdma_cm_id *id, int afonly); + +/** + * rdma_set_timeout - Set the QP timeout associated with a connection + * identifier. + * @id: Communication identifier to associated with service type. + * @timeout: QP timeout + */ +void rdma_set_timeout(struct rdma_cm_id *id, int timeout); + #endif /* RDMA_CM_H */ diff --git a/sys/ofed/include/rdma/rdma_user_cm.h b/sys/ofed/include/rdma/rdma_user_cm.h index 1d165022c02d..4d9909994ddd 100644 --- a/sys/ofed/include/rdma/rdma_user_cm.h +++ b/sys/ofed/include/rdma/rdma_user_cm.h @@ -77,7 +77,8 @@ struct rdma_ucm_create_id { __u64 uid; __u64 response; __u16 ps; - __u8 reserved[6]; + __u8 qp_type; + __u8 reserved[5]; }; struct rdma_ucm_create_id_resp { @@ -222,7 +223,11 @@ enum { /* Option details */ enum { RDMA_OPTION_ID_TOS = 0, - RDMA_OPTION_IB_PATH = 1 + RDMA_OPTION_ID_REUSEADDR = 1, + RDMA_OPTION_ID_AFONLY = 2, + + RDMA_OPTION_IB_PATH = 1, + RDMA_OPTION_IB_APM = 2, }; struct rdma_ucm_set_option {
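/*
 * Illustrative sketch, not part of the patch: a client-side connection
 * sequence using the extended rdma_create_id() signature and arming APM
 * with rdma_enable_apm() after route resolution and before connecting,
 * as the comments above require.  rdma_connect() and struct
 * rdma_conn_param are assumed from elsewhere in rdma_cm.h.  In a real
 * consumer each step completes asynchronously and the next call is made
 * from the event handler; the calls are strung together here only to
 * show their ordering.
 */
static int example_client_connect(struct sockaddr *dst,
				  rdma_cm_event_handler handler, void *ctx)
{
	struct rdma_cm_id *id;
	struct rdma_conn_param param;
	int ret;

	id = rdma_create_id(handler, ctx, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id))
		return PTR_ERR(id);

	ret = rdma_resolve_addr(id, NULL, dst, 2000 /* ms */);
	if (ret)
		goto err;

	ret = rdma_resolve_route(id, 2000 /* ms */);
	if (ret)
		goto err;

	/* APM must be armed after route resolution and before connect. */
	ret = rdma_enable_apm(id, RDMA_ALT_PATH_BEST);
	if (ret)
		goto err;

	memset(&param, 0, sizeof(param));
	param.responder_resources = 1;
	param.initiator_depth = 1;
	param.retry_count = 7;
	ret = rdma_connect(id, &param);
	if (ret)
		goto err;
	return 0;
err:
	rdma_destroy_id(id);
	return ret;
}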