diff --git a/sys/conf/files b/sys/conf/files index a370cfa534be..7ba7e7c0e172 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3542,6 +3542,18 @@ ofed/drivers/infiniband/ulp/sdp/sdp_tx.c optional sdp inet \ no-depend \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" +ofed/drivers/infiniband/hw/mlx4/alias_GUID.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/mcg.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/sysfs.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/cm.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" ofed/drivers/infiniband/hw/mlx4/ah.c optional mlx4ib \ no-depend obj-prefix "mlx4ib_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" @@ -3624,7 +3636,10 @@ ofed/drivers/net/mlx4/sense.c optional mlx4ib | mlxen \ ofed/drivers/net/mlx4/srq.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" -ofed/drivers/net/mlx4/xrcd.c optional mlx4ib | mlxen \ +ofed/drivers/net/mlx4/resource_tracker.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/sys_tune.c optional mlx4ib | mlxen \ no-depend obj-prefix "mlx4_" \ compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" diff --git a/sys/contrib/rdma/ib_umem.h b/sys/contrib/rdma/ib_umem.h index 50dd5dac867d..9e1abbbe294a 100644 --- a/sys/contrib/rdma/ib_umem.h +++ b/sys/contrib/rdma/ib_umem.h @@ -28,21 +28,17 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $FreeBSD$ */ #ifndef IB_UMEM_H #define IB_UMEM_H -struct ib_ucontext; +#include +#include +#include +#include -struct ib_umem_chunk { - TAILQ_ENTRY(ib_umem_chunk) entry; - int nents; - int nmap; - struct rdma_scatterlist page_list[0]; -}; +struct ib_ucontext; struct ib_umem { struct ib_ucontext *context; @@ -50,28 +46,76 @@ struct ib_umem { int offset; int page_size; int writable; - TAILQ_HEAD(, ib_umem_chunk) chunk_list; -#ifdef notyet + int hugetlb; + struct list_head chunk_list; struct work_struct work; struct mm_struct *mm; -#endif unsigned long diff; }; +/* contiguous memory structure */ +struct ib_cmem { + + struct ib_ucontext *context; + size_t length; + /* Link list of contiguous blocks being part of that cmem */ + struct list_head ib_cmem_block; + + /* Order of cmem block, 2^ block_order will equal number + of physical pages per block + */ + unsigned long block_order; + /* Refernce counter for that memory area + - When value became 0 pages will be returned to the kernel. + */ + struct kref refcount; +}; + + +struct ib_cmem_block { + + struct list_head list; + /* page will point to the page struct of the head page + in the current compound page. + block order is saved once as part of ib_cmem. + */ + struct page *page; +}; + + + +struct ib_umem_chunk { + struct list_head list; + int nents; + int nmap; + struct dma_attrs attrs; + struct scatterlist page_list[0]; +}; + #ifdef CONFIG_INFINIBAND_USER_MEM struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access); + size_t size, int access, int dmasync); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem, + struct vm_area_struct *vma); +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context, + unsigned long total_size, + unsigned long page_size_order); +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem); +int ib_umem_map_to_vma(struct ib_umem *umem, + struct vm_area_struct *vma); + #else /* CONFIG_INFINIBAND_USER_MEM */ +#include static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, - int access) { - return ERR_PTR(EINVAL); + int access, int dmasync) { + return ERR_PTR(-EINVAL); } static inline void ib_umem_release(struct ib_umem *umem) { } static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c index 9d6944aa1d7c..64ac36c01c10 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c @@ -541,7 +541,8 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *udata) + u64 virt, int acc, struct ib_udata *udata, + int mr_id) { __be64 *pages; int shift, i, n; @@ -1136,7 +1137,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; dev->ibdev.iwcm->get_qp = iwch_get_qp; - ret = ib_register_device(&dev->ibdev); + ret = ib_register_device(&dev->ibdev, NULL); if (ret) goto bail1; diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 2161b8b407af..8e7c895da835 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -128,6 +128,7 @@ SUBDIR= \ hwpmc \ ${_hyperv} \ ${_i2c} \ + ${_ibcore} \ ${_ibcs2} \ ${_ichwd} \ ${_ida} \ @@ -149,6 +150,7 @@ SUBDIR= \ ${_igb} \ ${_iir} \ ${_io} \ + ${_ipoib} \ ${_ipdivert} \ ${_ipfilter} \ ${_ipfw} \ @@ -499,11 +501,17 @@ _fe= fe _glxiic= glxiic _glxsb= glxsb _i2c= i2c +.if ${MK_OFED} != "no" || defined(ALL_MODULES) +_ibcore= ibcore +.endif _ibcs2= ibcs2 _ie= ie _if_ndis= if_ndis _igb= igb _io= io +.if ${MK_OFED} != "no" || defined(ALL_MODULES) +_ipoib= ipoib +.endif _lindev= lindev _linprocfs= linprocfs _linsysfs= linsysfs @@ -675,6 +683,9 @@ _hptrr= hptrr .endif _hyperv= hyperv _i2c= i2c +.if ${MK_OFED} != "no" || defined(ALL_MODULES) +_ibcore= ibcore +.endif _ichwd= ichwd _ida= ida _if_ndis= if_ndis @@ -682,6 +693,9 @@ _igb= igb _iir= iir _io= io _ipmi= ipmi +.if ${MK_OFED} != "no" || defined(ALL_MODULES) +_ipoib= ipoib +.endif _ips= ips _ipw= ipw .if ${MK_SOURCELESS_UCODE} != "no" diff --git a/sys/modules/ibcore/Makefile b/sys/modules/ibcore/Makefile new file mode 100644 index 000000000000..427902c1d332 --- /dev/null +++ b/sys/modules/ibcore/Makefile @@ -0,0 +1,23 @@ +# $FreeBSD$ +.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/core +.PATH: ${.CURDIR}/../../ofed/include/linux + +.include + +KMOD = ibcore +SRCS = addr.c cm_msgs.h iwcm.c mad_rmpp.h sa_query.c ucma.c uverbs_cmd.c +SRCS+= agent.c local_sa.c iwcm.h multicast.c smi.c ud_header.c uverbs_main.c +SRCS+= agent.h core_priv.h mad.c notice.c smi.h umem.c uverbs_marshall.c +SRCS+= cache.c device.c mad_priv.h packer.c sysfs.c user_mad.c verbs.c +SRCS+= cm.c fmr_pool.c mad_rmpp.c sa.h ucm.c uverbs.h cma.c +SRCS+= linux_compat.c linux_radix.c linux_idr.c +SRCS+= vnode_if.h device_if.h bus_if.h pci_if.h opt_inet.h + +CFLAGS+= -I${.CURDIR}/../../ofed/drivers/infiniband/core +CFLAGS+= -I${.CURDIR}/../mlx4ib +CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -DINET6 -DINET -DOFED + +.include + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions diff --git a/sys/modules/ipoib/Makefile b/sys/modules/ipoib/Makefile new file mode 100644 index 000000000000..11d5435c6057 --- /dev/null +++ b/sys/modules/ipoib/Makefile @@ -0,0 +1,31 @@ +# $FreeBSD$ +.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/ulp/ipoib +.PATH: ${.CURDIR}/../../ofed/include/linux + +.include + +KMOD = ipoib +SRCS = device_if.h bus_if.h opt_ofed.h vnode_if.h opt_inet.h opt_inet6.h +SRCS += ipoib_cm.c ipoib_ib.c ipoib_main.c ipoib_multicast.c ipoib_verbs.c ipoib.h +SRCS+= linux_compat.c linux_radix.c linux_idr.c + +CFLAGS+= -I${.CURDIR}/../../ofed/drivers/infiniband/ulp/ipoib +CFLAGS+= -I${.CURDIR}/../ibcore +CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -DINET6 -DINET -DOFED + +.if !defined(KERNBUILDDIR) +.if ${MK_INET_SUPPORT} != "no" +opt_inet.h: + @echo "#define INET 1" > ${.TARGET} +.endif + +.if ${MK_INET6_SUPPORT} != "no" +opt_inet6.h: + @echo "#define INET6 1" > ${.TARGET} +.endif +.endif + +.include + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions diff --git a/sys/modules/mlx4/Makefile b/sys/modules/mlx4/Makefile index ccaf3c3fc144..ce1ac5c60e11 100644 --- a/sys/modules/mlx4/Makefile +++ b/sys/modules/mlx4/Makefile @@ -3,11 +3,11 @@ .include +.PATH: ${.CURDIR}/../../ofed/include/linux KMOD = mlx4 SRCS = device_if.h bus_if.h pci_if.h vnode_if.h -SRCS+= alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c -SRCS+= pd.c port.c profile.c qp.c reset.c sense.c srq.c xrcd.c -SRCS+= opt_inet.h opt_inet6.h +SRCS+= alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c linux_compat.c linux_radix.c linux_idr.c +SRCS+= pd.c port.c profile.c qp.c reset.c sense.c srq.c resource_tracker.c sys_tune.c CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4 CFLAGS+= -I${.CURDIR}/../../ofed/include/ diff --git a/sys/modules/mlx4ib/Makefile b/sys/modules/mlx4ib/Makefile index 3ab6d6f802ad..abe15858d1c7 100644 --- a/sys/modules/mlx4ib/Makefile +++ b/sys/modules/mlx4ib/Makefile @@ -1,14 +1,21 @@ # $FreeBSD$ .PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4 +.PATH: ${.CURDIR}/../../ofed/include/linux .include KMOD = mlx4ib SRCS = device_if.h bus_if.h pci_if.h vnode_if.h -SRCS+= ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c +SRCS+= linux_compat.c linux_radix.c linux_idr.c +SRCS+= alias_GUID.c mcg.c sysfs.c ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c cm.c SRCS+= opt_inet.h opt_inet6.h +#CFLAGS+= -I${.CURDIR}/../../ofed/include/ +#CFLAGS+= -I${.CURDIR}/../../../../include +CFLAGS+= -I${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4 CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM +CFLAGS+= -DINET6 -DINET -DOFED .if !defined(KERNBUILDDIR) .if ${MK_INET_SUPPORT} != "no" diff --git a/sys/ofed/drivers/infiniband/core/addr.c b/sys/ofed/drivers/infiniband/core/addr.c index a4678072d0bc..0048c7c4f394 100644 --- a/sys/ofed/drivers/infiniband/core/addr.c +++ b/sys/ofed/drivers/infiniband/core/addr.c @@ -356,7 +356,7 @@ static int addr_resolve(struct sockaddr *src_in, u_char edst[MAX_ADDR_LEN]; int multi; int bcast; - int error; + int error = 0; /* * Determine whether the address is unicast, multicast, or broadcast diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c index 34419f35efc5..318beb15a3d8 100644 --- a/sys/ofed/drivers/infiniband/core/cma.c +++ b/sys/ofed/drivers/infiniband/core/cma.c @@ -2957,7 +2957,7 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); - } else if ((addr->sa_family == AF_INET6)) { + } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h index 05ac36e6acdb..08c4bbbad168 100644 --- a/sys/ofed/drivers/infiniband/core/core_priv.h +++ b/sys/ofed/drivers/infiniband/core/core_priv.h @@ -38,7 +38,8 @@ #include -int ib_device_register_sysfs(struct ib_device *device); +int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, + u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); int ib_sysfs_setup(void); diff --git a/sys/ofed/drivers/infiniband/core/device.c b/sys/ofed/drivers/infiniband/core/device.c index 9d34bb6bce65..6c2c2f49fa99 100644 --- a/sys/ofed/drivers/infiniband/core/device.c +++ b/sys/ofed/drivers/infiniband/core/device.c @@ -273,7 +273,9 @@ static int read_port_table_lengths(struct ib_device *device) * callback for each device that is added. @device must be allocated * with ib_alloc_device(). */ -int ib_register_device(struct ib_device *device) +int ib_register_device(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) { int ret; @@ -294,8 +296,6 @@ int ib_register_device(struct ib_device *device) INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); - device->ib_uverbs_xrcd_table = RB_ROOT; - mutex_init(&device->xrcd_table_mutex); ret = read_port_table_lengths(device); if (ret) { @@ -304,7 +304,7 @@ int ib_register_device(struct ib_device *device) goto out; } - ret = ib_device_register_sysfs(device); + ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); @@ -752,3 +752,19 @@ static void __exit ib_core_cleanup(void) module_init(ib_core_init); module_exit(ib_core_cleanup); + +#undef MODULE_VERSION +#include +static int +ibcore_evhand(module_t mod, int event, void *arg) +{ + return (0); +} + +static moduledata_t ibcore_mod = { + .name = "ibcore", + .evhand = ibcore_evhand, +}; + +MODULE_VERSION(ibcore, 1); +DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_SMP, SI_ORDER_ANY); diff --git a/sys/ofed/drivers/infiniband/core/sa_query.c b/sys/ofed/drivers/infiniband/core/sa_query.c index 0fc1c0ec5c6d..f36dbd625e39 100644 --- a/sys/ofed/drivers/infiniband/core/sa_query.c +++ b/sys/ofed/drivers/infiniband/core/sa_query.c @@ -1105,6 +1105,27 @@ static void ib_sa_inform_release(struct ib_sa_query *sa_query) kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query)); } +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + // stub function - + // called originally from mad.c under mlx4_ib_init_sriov() + // which calls mlx4_ib_init_alias_guid_service() in alias_GUID.c + // which goes down to this function + + printk("ERROR: function should be called only in SRIOV flow!!!"); + + return 0; +} + /** * ib_sa_informinfo_query - Start an InformInfo registration. * @client:SA client diff --git a/sys/ofed/drivers/infiniband/core/sysfs.c b/sys/ofed/drivers/infiniband/core/sysfs.c index a40640686aa1..7c9b4b298341 100644 --- a/sys/ofed/drivers/infiniband/core/sysfs.c +++ b/sys/ofed/drivers/infiniband/core/sysfs.c @@ -38,6 +38,7 @@ #include #include +#include struct ib_port { struct kobject kobj; @@ -103,7 +104,7 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, return ret; return sprintf(buf, "%d: %s\n", attr.state, - attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? + attr.state < ARRAY_SIZE(state_name) ? state_name[attr.state] : "UNKNOWN"); } @@ -292,118 +293,124 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, return sprintf(buf, "0x%04x\n", pkey); } -#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr, + char *buf, int c_ext) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + struct ib_mad *in_mad = NULL; + struct ib_mad *out_mad = NULL; + ssize_t ret; + + if (!p->ibdev->process_mad) + return -ENXIO; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } + + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + if (c_ext) + in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT; + else + in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS; + + in_mad->data[41] = p->port_num; /* PortSelect field */ + + if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, + p->port_num, NULL, NULL, in_mad, out_mad) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } + + switch (width) { + case 4: + ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + (4 - (offset % 8))) & 0xf); + break; + case 8: + ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + break; + case 16: + ret = sprintf(buf, "%u\n", + be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + break; + case 32: + ret = sprintf(buf, "%u\n", + be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + break; + case 64: + ret = sprintf(buf, "%llu\n", (unsigned long long) + be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); + break; + default: + ret = 0; + } + +out: + kfree(in_mad); + kfree(out_mad); + + return ret; +} + +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ } static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) + char *buf) { - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; - ssize_t ret; - - if (!p->ibdev->process_mad) - return sprintf(buf, "N/A (no PMA)\n"); - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) { - ret = -ENOMEM; - goto out; - } - - in_mad->mad_hdr.base_version = 1; - in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; - in_mad->mad_hdr.class_version = 1; - in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ - - in_mad->data[41] = p->port_num; /* PortSelect field */ - - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, in_mad, out_mad) & - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { - ret = -EINVAL; - goto out; - } - - switch (width) { - case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> - (4 - (offset % 8))) & 0xf); - break; - case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); - break; - case 16: - ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); - break; - case 32: - ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); - break; - default: - ret = 0; - } - -out: - kfree(in_mad); - kfree(out_mad); - - return ret; + return get_pma_counters(p, attr, buf, 0); } -static PORT_PMA_ATTR(symbol_error , 0, 16, 32); -static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); -static PORT_PMA_ATTR(link_downed , 2, 8, 56); -static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); -static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); -static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); -static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); -static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); -static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); -static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); -static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); -/* - * There is no bit allocated for port_xmit_wait in the CounterSelect field - * (IB spec). However, since this bit is ignored when reading - * (show_pma_counter), the _counter field of port_xmit_wait can be set to zero. - */ -static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); static struct attribute *pma_attrs[] = { - &port_pma_attr_symbol_error.attr.attr, - &port_pma_attr_link_error_recovery.attr.attr, - &port_pma_attr_link_downed.attr.attr, - &port_pma_attr_port_rcv_errors.attr.attr, - &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, - &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, - &port_pma_attr_port_xmit_discards.attr.attr, - &port_pma_attr_port_xmit_constraint_errors.attr.attr, - &port_pma_attr_port_rcv_constraint_errors.attr.attr, - &port_pma_attr_local_link_integrity_errors.attr.attr, - &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, - &port_pma_attr_VL15_dropped.attr.attr, - &port_pma_attr_port_xmit_data.attr.attr, - &port_pma_attr_port_rcv_data.attr.attr, - &port_pma_attr_port_xmit_packets.attr.attr, - &port_pma_attr_port_rcv_packets.attr.attr, - &port_pma_attr_port_xmit_wait.attr.attr, - NULL + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + NULL }; static struct attribute_group pma_group = { @@ -411,6 +418,44 @@ static struct attribute_group pma_group = { .attrs = pma_attrs }; +#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +} + +static ssize_t show_pma_counter_ext(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return get_pma_counters(p, attr, buf, 1); +} + +static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256); +static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320); +static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384); +static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448); +static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512); + +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_ext_port_xmit_data_64.attr.attr, + &port_pma_attr_ext_port_rcv_data_64.attr.attr, + &port_pma_attr_ext_port_xmit_packets_64.attr.attr, + &port_pma_attr_ext_port_rcv_packets_64.attr.attr, + &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr, + &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr, + NULL +}; + +static struct attribute_group pma_ext_group = { + .name = "counters_ext", + .attrs = pma_attrs_ext +}; + static void ib_port_release(struct kobject *kobj) { struct ib_port *p = container_of(kobj, struct ib_port, kobj); @@ -503,7 +548,9 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *, return NULL; } -static int add_port(struct ib_device *device, int port_num) +static int add_port(struct ib_device *device, int port_num, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) { struct ib_port *p; struct ib_port_attr attr; @@ -522,7 +569,7 @@ static int add_port(struct ib_device *device, int port_num) p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, + kobject_get(device->ports_parent), "%d", port_num); if (ret) goto err_put; @@ -531,10 +578,14 @@ static int add_port(struct ib_device *device, int port_num) if (ret) goto err_put; + ret = sysfs_create_group(&p->kobj, &pma_ext_group); + if (ret) + goto err_remove_pma; + p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); if (!p->gid_group.attrs) - goto err_remove_pma; + goto err_remove_pma_ext; ret = sysfs_create_group(&p->kobj, &p->gid_group); if (ret) @@ -550,6 +601,12 @@ static int add_port(struct ib_device *device, int port_num) if (ret) goto err_free_pkey; + if (port_callback) { + ret = port_callback(device, port_num, &p->kobj); + if (ret) + goto err_remove_pkey; + } + list_add_tail(&p->kobj.entry, &device->port_list); #ifdef __linux__ @@ -557,6 +614,9 @@ static int add_port(struct ib_device *device, int port_num) #endif return 0; +err_remove_pkey: + sysfs_remove_group(&p->kobj, &p->pkey_group); + err_free_pkey: for (i = 0; i < attr.pkey_tbl_len; ++i) kfree(p->pkey_group.attrs[i]); @@ -572,6 +632,9 @@ static int add_port(struct ib_device *device, int port_num) kfree(p->gid_group.attrs); +err_remove_pma_ext: + sysfs_remove_group(&p->kobj, &pma_ext_group); + err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); @@ -786,16 +849,17 @@ static struct attribute_group iw_stats_group = { .attrs = iw_proto_stats_attrs, }; -int ib_device_register_sysfs(struct ib_device *device) +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, u8, struct kobject *)) { struct device *class_dev = &device->dev; int ret; int i; class_dev->class = &ib_class; - class_dev->driver_data = device; class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); + dev_set_name(class_dev, device->name); + dev_set_drvdata(class_dev, device); INIT_LIST_HEAD(&device->port_list); @@ -810,19 +874,19 @@ int ib_device_register_sysfs(struct ib_device *device) } device->ports_parent = kobject_create_and_add("ports", - &class_dev->kobj); - if (!device->ports_parent) { + kobject_get(&class_dev->kobj)); + if (!device->ports_parent) { ret = -ENOMEM; goto err_put; } if (device->node_type == RDMA_NODE_IB_SWITCH) { - ret = add_port(device, 0); + ret = add_port(device, 0, port_callback); if (ret) goto err_put; } else { for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i); + ret = add_port(device, i, port_callback); if (ret) goto err_put; } @@ -864,10 +928,15 @@ void ib_device_unregister_sysfs(struct ib_device *device) { struct kobject *p, *t; struct ib_port *port; + int i; /* Hold kobject until ib_dealloc_device() */ kobject_get(&device->dev.kobj); + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { + device_remove_file(&device->dev, ib_class_attributes[i]); + } + list_for_each_entry_safe(p, t, &device->port_list, entry) { list_del(&p->entry); port = container_of(p, struct ib_port, kobj); @@ -891,7 +960,7 @@ void ib_sysfs_cleanup(void) class_unregister(&ib_class); } -int ib_sysfs_create_port_files(struct ib_device *device, +/*int ib_sysfs_create_port_files(struct ib_device *device, int (*create)(struct ib_device *dev, u8 port_num, struct kobject *kobj)) { @@ -908,4 +977,4 @@ int ib_sysfs_create_port_files(struct ib_device *device, return ret; } -EXPORT_SYMBOL(ib_sysfs_create_port_files); +EXPORT_SYMBOL(ib_sysfs_create_port_files);*/ diff --git a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c index 3520182a6cec..9946c7129fca 100644 --- a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c +++ b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c @@ -312,7 +312,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); - INIT_LIST_HEAD(&ucontext->xrc_domain_list); + INIT_LIST_HEAD(&ucontext->xrcd_list); ucontext->closing = 0; resp.num_comp_vectors = file->device->num_comp_vectors; @@ -633,7 +633,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, } mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, &udata); + cmd.access_flags, &udata, 0); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto err_put; @@ -1087,7 +1087,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, attr.srq = srq; attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd.qp_type; - attr.xrc_domain = xrcd; + attr.xrcd = xrcd; attr.create_flags = 0; attr.cap.max_send_wr = cmd.max_send_wr; @@ -1115,14 +1115,14 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; - qp->xrcd = attr.xrc_domain; + qp->xrcd = attr.xrcd; atomic_inc(&pd->usecnt); atomic_inc(&attr.send_cq->usecnt); atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); - else if (attr.xrc_domain) - atomic_inc(&attr.xrc_domain->usecnt); + else if (attr.xrcd) + atomic_inc(&attr.xrcd->usecnt); obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); @@ -2032,8 +2032,8 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, srq->uobject = &obj->uobject; srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; - srq->xrc_cq = NULL; - srq->xrcd = NULL; + srq->ext.xrc.cq = NULL; + srq->ext.xrc.xrcd = NULL; atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -2083,7 +2083,7 @@ ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { - struct ib_uverbs_create_xrc_srq cmd; + struct ib_uverbs_create_xsrq cmd; struct ib_uverbs_create_srq_resp resp; struct ib_udata udata; struct ib_uevent_object *obj; @@ -2119,7 +2119,7 @@ ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file, goto err; } - xrc_cq = idr_read_cq(cmd.xrc_cq, file->ucontext, 0); + xrc_cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!xrc_cq) { ret = -EINVAL; goto err_put_pd; @@ -2152,8 +2152,8 @@ ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file, srq->uobject = &obj->uobject; srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; - srq->xrc_cq = xrc_cq; - srq->xrcd = xrcd; + srq->ext.xrc.cq = xrc_cq; + srq->ext.xrc.xrcd = xrcd; atomic_inc(&pd->usecnt); atomic_inc(&xrc_cq->usecnt); atomic_inc(&xrcd->usecnt); @@ -2528,7 +2528,7 @@ ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file, INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list); mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->xrc_domain_list); + list_add_tail(&uobj->list, &file->ucontext->xrcd_list); mutex_unlock(&file->mutex); uobj->live = 1; @@ -2598,7 +2598,7 @@ ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file, if (!ret) { list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) { struct ib_srq *srq = t_uobj->object; - if (srq->xrcd && srq->xrcd == uobj->object) { + if (srq->ext.xrc.xrcd && srq->ext.xrc.xrcd == uobj->object) { ret = -EBUSY; break; } @@ -2702,7 +2702,7 @@ ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file, init_attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_XRC; - init_attr.xrc_domain = xrcd; + init_attr.xrcd = xrcd; init_attr.cap.max_send_wr = 1; init_attr.cap.max_recv_wr = 0; diff --git a/sys/ofed/drivers/infiniband/core/uverbs_main.c b/sys/ofed/drivers/infiniband/core/uverbs_main.c index 380abd314dc6..a0eb4fe53a31 100644 --- a/sys/ofed/drivers/infiniband/core/uverbs_main.c +++ b/sys/ofed/drivers/infiniband/core/uverbs_main.c @@ -110,8 +110,8 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, [IB_USER_VERBS_CMD_CREATE_XRC_SRQ] = ib_uverbs_create_xrc_srq, - [IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN] = ib_uverbs_open_xrc_domain, - [IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN] = ib_uverbs_close_xrc_domain, + [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrc_domain, + [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrc_domain, [IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP] = ib_uverbs_create_xrc_rcv_qp, [IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP] = ib_uverbs_modify_xrc_rcv_qp, [IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP] = ib_uverbs_query_xrc_rcv_qp, @@ -258,7 +258,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, } mutex_lock(&file->device->ib_dev->xrcd_table_mutex); - list_for_each_entry_safe(uobj, tmp, &context->xrc_domain_list, list) { + list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) { struct ib_xrcd *xrcd = uobj->object; struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1; struct ib_uxrcd_object *xrcd_uobj = @@ -629,8 +629,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (hdr.in_words * 4 != count) return -EINVAL; - if (hdr.command < 0 || - hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || + if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || !uverbs_cmd_table[hdr.command] || !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) return -EINVAL; diff --git a/sys/ofed/drivers/infiniband/core/verbs.c b/sys/ofed/drivers/infiniband/core/verbs.c index 90bdeaa91987..023564f814d6 100644 --- a/sys/ofed/drivers/infiniband/core/verbs.c +++ b/sys/ofed/drivers/infiniband/core/verbs.c @@ -250,8 +250,8 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->uobject = NULL; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; - srq->xrc_cq = NULL; - srq->xrcd = NULL; + srq->ext.xrc.cq = NULL; + srq->ext.xrc.xrcd = NULL; atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } @@ -278,8 +278,8 @@ struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, srq->uobject = NULL; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; - srq->xrc_cq = xrc_cq; - srq->xrcd = xrcd; + srq->ext.xrc.cq = xrc_cq; + srq->ext.xrc.xrcd = xrcd; atomic_inc(&pd->usecnt); atomic_inc(&xrcd->usecnt); atomic_inc(&xrc_cq->usecnt); @@ -319,8 +319,8 @@ int ib_destroy_srq(struct ib_srq *srq) return -EBUSY; pd = srq->pd; - xrc_cq = srq->xrc_cq; - xrcd = srq->xrcd; + xrc_cq = srq->ext.xrc.cq; + xrcd = srq->ext.xrc.xrcd; ret = srq->device->destroy_srq(srq); if (!ret) { @@ -355,7 +355,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->qp_context = qp_init_attr->qp_context; qp->qp_type = qp_init_attr->qp_type; qp->xrcd = qp->qp_type == IB_QPT_XRC ? - qp_init_attr->xrc_domain : NULL; + qp_init_attr->xrcd : NULL; atomic_inc(&pd->usecnt); atomic_inc(&qp_init_attr->send_cq->usecnt); atomic_inc(&qp_init_attr->recv_cq->usecnt); @@ -371,8 +371,8 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; - enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETH + 1]; - enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETH + 1]; + enum ib_qp_attr_mask req_param[IB_QPT_RAW_PACKET + 1]; + enum ib_qp_attr_mask opt_param[IB_QPT_RAW_PACKET + 1]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -382,7 +382,7 @@ static const struct { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), - [IB_QPT_RAW_ETH] = IB_QP_PORT, + [IB_QPT_RAW_PACKET] = IB_QP_PORT, [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), @@ -1005,7 +1005,7 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) switch (rdma_node_get_transport(qp->device->node_type)) { case RDMA_TRANSPORT_IB: - if (qp->qp_type == IB_QPT_RAW_ETH) { + if (qp->qp_type == IB_QPT_RAW_PACKET) { /* In raw Etherent mgids the 63 msb's should be 0 */ if (gid->global.subnet_prefix & cpu_to_be64(~1ULL)) return -EINVAL; @@ -1013,7 +1013,7 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return -EINVAL; break; case RDMA_TRANSPORT_IWARP: - if (qp->qp_type != IB_QPT_RAW_ETH) + if (qp->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; break; } @@ -1028,7 +1028,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) switch (rdma_node_get_transport(qp->device->node_type)) { case RDMA_TRANSPORT_IB: - if (qp->qp_type == IB_QPT_RAW_ETH) { + if (qp->qp_type == IB_QPT_RAW_PACKET) { /* In raw Etherent mgids the 63 msb's should be 0 */ if (gid->global.subnet_prefix & cpu_to_be64(~1ULL)) return -EINVAL; @@ -1036,7 +1036,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return -EINVAL; break; case RDMA_TRANSPORT_IWARP: - if (qp->qp_type != IB_QPT_RAW_ETH) + if (qp->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; break; } diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig b/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig index 4175a4bd0c78..24ab11a9ad1e 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig +++ b/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig @@ -1,5 +1,7 @@ config MLX4_INFINIBAND tristate "Mellanox ConnectX HCA support" + depends on NETDEVICES && ETHERNET && PCI + select NET_VENDOR_MELLANOX select MLX4_CORE ---help--- This driver provides low-level InfiniBand support for diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile index ce885a855cff..cbfa7a449328 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile +++ b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile @@ -1,4 +1,31 @@ -obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o +# $FreeBSD$ +#.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4 +#.PATH: ${.CURDIR}/../../../../include/linux -mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o -mlx4_ib-y += wc.o +.include + +KMOD = mlx4ib +SRCS = device_if.h bus_if.h pci_if.h vnode_if.h +#SRCS+= linux_compat.c linux_radix.c +SRCS+= ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c +SRCS+= opt_inet.h opt_inet6.h + +#CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -I${.CURDIR}/../../../../include +CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM + +.if !defined(KERNBUILDDIR) +.if ${MK_INET_SUPPORT} != "no" +opt_inet.h: + @echo "#define INET 1" > ${.TARGET} +.endif + +.if ${MK_INET6_SUPPORT} != "no" +opt_inet6.h: + @echo "#define INET6 1" > ${.TARGET} +.endif +.endif + +.include + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c index 26251b473d3d..47c9aa0c512d 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c @@ -30,25 +30,25 @@ * SOFTWARE. */ -#include "mlx4_ib.h" + +#include +#include +#include #include +#include + +#include #include #include -#include + +#include "mlx4_ib.h" int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast, u8 port) { - struct mlx4_ib_iboe *iboe = &dev->iboe; struct in6_addr in6; *is_mcast = 0; - spin_lock(&iboe->lock); - if (!iboe->netdevs[port - 1]) { - spin_unlock(&iboe->lock); - return -EINVAL; - } - spin_unlock(&iboe->lock); memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); if (rdma_link_local_addr(&in6)) @@ -92,15 +92,15 @@ static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, } static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, - struct mlx4_ib_ah *ah) + struct mlx4_ib_ah *ah) { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; + union ib_gid sgid; u8 mac[6]; int err; int is_mcast; u16 vlan_tag; - union ib_gid sgid; err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num); if (err) @@ -130,7 +130,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr ah->av.ib.dlid = cpu_to_be16(0xc000); memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16); - ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29); return &ah->ibah; } @@ -147,25 +147,24 @@ struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) { if (!(ah_attr->ah_flags & IB_AH_GRH)) { ret = ERR_PTR(-EINVAL); - goto out; } else { - /* TBD: need to handle the case when we get called - in an atomic context and there we might sleep. We - don't expect this currently since we're working with - link local addresses which we can translate without - going to sleep */ + /* + * TBD: need to handle the case when we get + * called in an atomic context and there we + * might sleep. We don't expect this + * currently since we're working with link + * local addresses which we can translate + * without going to sleep. + */ ret = create_iboe_ah(pd, ah_attr, ah); - if (IS_ERR(ret)) - goto out; - else - return ret; } + + if (IS_ERR(ret)) + kfree(ah); + + return ret; } else return create_ib_ah(pd, ah_attr, ah); /* never fails */ - -out: - kfree(ah); - return ret; } int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) @@ -202,4 +201,3 @@ int mlx4_ib_destroy_ah(struct ib_ah *ah) kfree(to_mah(ah)); return 0; } - diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c b/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 000000000000..ae7b558cf139 --- /dev/null +++ b/sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx4_ib.h" + +/* +The driver keeps the current state of all guids, as they are in the HW. +Whenever we receive an smp mad GUIDInfo record, the data will be cached. +*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int port_index = port_num - 1; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* The location of the specific index starts from bit number 4 + * until bit num 11 */ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) { + pr_debug("The last slave: %d\n", slave_id); + return; + } + + /* cache the guid: */ + memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } else + pr_debug("Guid number: %d in block: %d" + " was not updated\n", i, block_num); + } +} + +static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + if (index >= NUM_ALIAS_GUID_PER_PORT) { + pr_err("%s: ERROR: asked for index:%d\n", __func__, index); + return (__force __be64) -1; + } + return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; +} + + +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +/* + * Whenever new GUID is set/unset (guid table change) create event and + * notify the relevant slave (master also should be notified). + * If the GUID value is not as we have in the cache the slave will not be + * updated; in this case it waits for the smp_snoop or the port management + * event to call the function and to update the slave. + * block_number - the index of the block (16 blocks available) + * port_number - 1 or 2 + */ +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* the location of the specific index runs from bits 4..11 */ + if (!(test_bit(i + 4, (unsigned long *)&guid_indexes))) + continue; + + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) + return; + tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, + (NUM_ALIAS_GUID_IN_REC * block_num) + i); + /* + * Check if guid is not the same as in the cache, + * If it is different, wait for the snoop_smp or the port mgmt + * change event to update the slave on its port state change + */ + if (tmp_cur_ag != form_cache_ag) + continue; + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + + /*2 cases: Valid GUID, and Invalid Guid*/ + + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + pr_debug("slave: %d, port: %d prev_port_state: %d," + " new_port_state: %d, gen_event: %d\n", + slave_id, port_num, prev_state, new_state, gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_UP) { + pr_debug("sending PORT_UP event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } + } else { /* request to invalidate GUID */ + set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } +} + +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index ; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags, flags1; + + if (!context) + return; + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[cb_ctx->block_num]; + + if (status) { + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + pr_debug("(port: %d) failed: status = %d\n", + cb_ctx->port, status); + goto out; + } + + if (guid_rec->block_num != cb_ctx->block_num) { + pr_err("block num mismatch: %d != %d\n", + cb_ctx->block_num, guid_rec->block_num); + goto out; + } + + pr_debug("lid/port: %d/%d, block_num: %d\n", + be16_to_cpu(guid_rec->lid), cb_ctx->port, + guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[guid_rec->block_num]; + + rec->status = MLX4_GUID_INFO_STATUS_SET; + rec->method = MLX4_GUID_INFO_RECORD_SET; + + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { + __be64 tmp_cur_ag; + tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE]; + /* check if the SM didn't assign one of the records. + * if it didn't, if it was not sysadmin request: + * ask the SM to give a new GUID, (instead of the driver request). + */ + if (tmp_cur_ag == MLX4_NOT_SET_GUID) { + mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in " + "block_num: %d was declined by SM, " + "ownership by %d (0 = driver, 1=sysAdmin," + " 2=None)\n", __func__, i, + guid_rec->block_num, rec->ownership); + if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) { + /* if it is driver assign, asks for new GUID from SM*/ + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + MLX4_NOT_SET_GUID; + + /* Mark the record as not assigned, and let it + * be sent again in the next work sched.*/ + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + } else { + /* properly assigned record. */ + /* We save the GUID we just got from the SM in the + * admin_guid in order to be persistent, and in the + * request from the sm the process will ask for the same GUID */ + if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN && + tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) { + /* the sysadmin assignment failed.*/ + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept\n", + __func__, i, + guid_rec->block_num, + (long long)be64_to_cpu(*(__be64 *) & + rec->all_recs[i * GUID_REC_SIZE])); + } else { + memcpy(&rec->all_recs[i * GUID_REC_SIZE], + &guid_rec->guid_info_list[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } + } + } + /* + The func is call here to close the cases when the + sm doesn't send smp, so in the sa response the driver + notifies the slave. + */ + mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num, + cb_ctx->port, + guid_rec->guid_info_list); +out: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index]. + alias_guid_work, 0); + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + u64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_IDLE; + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method + = MLX4_GUID_INFO_RECORD_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = + *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].all_recs[GUID_REC_SIZE * i]; + /* + check the admin value: if it's for delete (~00LL) or + it is the first guid of the first record (hw guid) or + the records is not in ownership of the sysadmin and the sm doesn't + need to assign GUIDs, then don't put it up for assignment. + */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && !i) || + MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid. + ports_guid[port - 1].all_rec_per_port[index].ownership) + continue; + comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes = comp_mask; +} + +static int set_guid_rec(struct ib_device *ibdev, + u8 port, int index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + struct list_head *head = + &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + err = __mlx4_ib_query_port(ibdev, port, &attr, 1); + if (err) { + pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + pr_debug("port %d not active...rescheduling\n", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + + memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); + + guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, + GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC); + comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = + ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, + ibdev, port, &guid_info_rec, + comp_mask, rec_det->method, 1000, + GFP_KERNEL, aliasguid_query_handler, + callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + pr_debug("ib_sa_guid_info_rec_query failed, query_id: " + "%d. will reschedule to the next 1 sec.\n", + callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, flags1; + + pr_debug("port %d\n", port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already + queued(not on the timer) the cancel will fail. That is not a problem + because we just want the work started. + */ + cancel_delayed_work(&dev->sriov.alias_guid. + ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) +{ + int j; + unsigned long flags; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status == + MLX4_GUID_INFO_STATUS_IDLE) { + memcpy(&rec->rec_det, + &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j], + sizeof (struct mlx4_sriov_alias_guid_info_rec_det)); + rec->port = port; + rec->block_num = j; + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status = + MLX4_GUID_INFO_STATUS_PENDING; + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return 0; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + } + return -ENOENT; +} + +static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port, + int rec_index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes = + rec_det->guid_indexes; + memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs, + rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status = + rec_det->status; +} + +static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port) +{ + int j; + struct mlx4_sriov_alias_guid_info_rec_det rec_det ; + + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) { + memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); + rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) | + IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 | + IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 | + IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 | + IB_SA_GUIDINFO_REC_GID7; + rec_det.status = MLX4_GUID_INFO_STATUS_IDLE; + set_administratively_guid_record(dev, port, j, &rec_det); + } +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) { + pr_err("alias_guid_work: No Memory\n"); + return; + } + + pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + pr_debug("No more records to update.\n"); + goto out; + } + + set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num, + &rec->rec_det); + +out: + kfree(rec); +} + + +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + + if (!mlx4_is_master(dev->dev)) + return; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < dev->num_ports; i++) { + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); + det = &sriov->alias_guid.ports_guid[i]; + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while (!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); +} + +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j, k; + union ib_gid gid; + + if (!mlx4_is_master(dev->dev)) + return 0; + dev->sriov.alias_guid.sa_client = + kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL); + if (!dev->sriov.alias_guid.sa_client) + return -ENOMEM; + + ib_sa_register_client(dev->sriov.alias_guid.sa_client); + + spin_lock_init(&dev->sriov.alias_guid.ag_work_lock); + + for (i = 1; i <= dev->num_ports; ++i) { + if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) { + ret = -EFAULT; + goto err_unregister; + } + } + + for (i = 0 ; i < dev->num_ports; i++) { + memset(&dev->sriov.alias_guid.ports_guid[i], 0, + sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + if (mlx4_ib_sm_guid_assign) { + dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j]. + ownership = MLX4_GUID_DRIVER_ASSIGN; + continue; + } + dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j]. + ownership = MLX4_GUID_NONE_ASSIGN; + /*mark each val as it was deleted, + till the sysAdmin will give it valid val*/ + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + *(__be64 *)&dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] = + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } + } + INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); + /*prepare the records, set them to be allocated by sm*/ + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) + invalidate_guid_record(dev, i + 1, j); + + dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; + dev->sriov.alias_guid.ports_guid[i].port = i; + if (mlx4_ib_sm_guid_assign) + set_all_slaves_guids(dev, i); + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + dev->sriov.alias_guid.ports_guid[i].wq = + create_singlethread_workqueue(alias_wq_name); + if (!dev->sriov.alias_guid.ports_guid[i].wq) { + ret = -ENOMEM; + goto err_thread; + } + INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work, + alias_guid_work); + } + return 0; + +err_thread: + for (--i; i >= 0; i--) { + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + dev->sriov.alias_guid.ports_guid[i].wq = NULL; + } + +err_unregister: + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); + dev->sriov.alias_guid.sa_client = NULL; + pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cm.c b/sys/ofed/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 000000000000..3745367fe2c0 --- /dev/null +++ b/sys/ofed/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +#include "mlx4_ib.h" + +#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ) + +struct id_map_entry { + struct rb_node node; + + u32 sl_cm_id; + u32 pv_cm_id; + int slave_id; + int scheduled_delete; + struct mlx4_ib_dev *dev; + + struct list_head list; + struct delayed_work timeout; +}; + +struct cm_generic_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; +}; + +struct cm_req_msg { + unsigned char unused[0x60]; + union ib_gid primary_path_sgid; +}; + + +static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); +} + +static u32 get_local_comm_id(struct ib_mad *mad) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + + return be32_to_cpu(msg->local_comm_id); +} + +static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); +} + +static u32 get_remote_comm_id(struct ib_mad *mad) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + + return be32_to_cpu(msg->remote_comm_id); +} + +static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) +{ + struct cm_req_msg *msg = (struct cm_req_msg *)mad; + + return msg->primary_path_sgid; +} + +/* Lock should be taken before called */ +static struct id_map_entry * +id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node *node = sl_id_map->rb_node; + + while (node) { + struct id_map_entry *id_map_entry = + rb_entry(node, struct id_map_entry, node); + + if (id_map_entry->sl_cm_id > sl_cm_id) + node = node->rb_left; + else if (id_map_entry->sl_cm_id < sl_cm_id) + node = node->rb_right; + else if (id_map_entry->slave_id > slave_id) + node = node->rb_left; + else if (id_map_entry->slave_id < slave_id) + node = node->rb_right; + else + return id_map_entry; + } + return NULL; +} + +static void id_map_ent_timeout(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (!db_ent) + goto out; + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (!ent) + goto out; + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + return; + } + + /* Go to the bottom of the tree */ + while (*link) { + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret, id; + static int next_id; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); + if (!ent) { + mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n"); + return ERR_PTR(-ENOMEM); + } + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + do { + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + ret = idr_get_new_above(&sriov->pv_id_table, ent, + next_id, &id); + if (!ret) { + next_id = ((unsigned) id + 1) & MAX_IDR_MASK; + ent->pv_cm_id = (u32)id; + sl_id_map_add(ibdev, ent); + } + + spin_unlock(&sriov->id_map_lock); + } while (ret == -EAGAIN && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL)); + /*the function idr_get_new_above can return -ENOSPC, so don't insert in that case.*/ + if (!ret) { + spin_lock(&sriov->id_map_lock); + list_add_tail(&ent->list, &sriov->cm_list); + spin_unlock(&sriov->id_map_lock); + return ent; + } + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id); + if (ent) + *pv_cm_id = (int) ent->pv_cm_id; + } else + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); +} + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + sl_cm_id = get_local_comm_id(mad); + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { + return 0; + } else { + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad, int is_eth) +{ + u32 pv_cm_id; + struct id_map_entry *id; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { + union ib_gid gid; + + if (is_eth) + return 0; + + gid = gid_from_req_msg(ibdev, mad); + *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + if (*slave < 0) { + mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", + gid.global.interface_id); + return -ENOENT; + } + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + if (!is_eth) + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) +{ + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); + idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 1; + struct id_map_entry *map, *tmp_map; + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush &= !!cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (!need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c index 31cd00db8196..293917a2c682 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c @@ -33,13 +33,14 @@ #include #include -#include +#include #include "mlx4_ib.h" #include "user.h" /* Which firmware version adds support for Resize CQ */ #define MLX4_FW_VER_RESIZE_CQ mlx4_fw_ver(2, 5, 0) +#define MLX4_FW_VER_IGNORE_OVERRUN_CQ mlx4_fw_ver(2, 7, 8200) static void mlx4_ib_cq_comp(struct mlx4_cq *cq) { @@ -53,7 +54,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) struct ib_cq *ibcq; if (type != MLX4_EVENT_TYPE_CQ_ERROR) { - printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + pr_warn("Unexpected event type %d " "on CQ %06x\n", type, cq->cqn); return; } @@ -69,7 +70,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) { - return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe)); + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); } static void *get_cqe(struct mlx4_ib_cq *cq, int n) @@ -80,8 +81,9 @@ static void *get_cqe(struct mlx4_ib_cq *cq, int n) static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) { struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); - return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; } @@ -102,12 +104,13 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * { int err; - err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe), + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, PAGE_SIZE * 2, &buf->buf); if (err) goto out; + buf->entry_size = dev->dev->caps.cqe_size; err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, &buf->mtt); if (err) @@ -123,8 +126,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * mlx4_mtt_cleanup(dev->dev, &buf->mtt); err_buf: - mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe), - &buf->buf); + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); out: return err; @@ -132,7 +134,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) { - mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf); + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); } static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, @@ -140,14 +142,19 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont u64 buf_addr, int cqe) { int err; + int cqe_size = dev->dev->caps.cqe_size; + int shift; + int n; - *umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe), + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); - err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem), - ilog2((*umem)->page_size), &buf->mtt); + n = ib_umem_page_count(*umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n); + err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); + if (err) goto err_buf; @@ -175,12 +182,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector struct mlx4_uar *uar; int err; - if (entries < 1 || entries > dev->dev->caps.max_cqes) { - mlx4_ib_dbg("invalid num of entries: %d", entries); + if (entries < 1 || entries > dev->dev->caps.max_cqes) return ERR_PTR(-EINVAL); - } - cq = kzalloc(sizeof *cq, GFP_KERNEL); + cq = kmalloc(sizeof *cq, GFP_KERNEL); if (!cq) return ERR_PTR(-ENOMEM); @@ -227,10 +232,11 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector uar = &dev->priv_uar; } + if (dev->eq_table) + vector = dev->eq_table[vector % ibdev->num_comp_vectors]; + err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq, - vector == IB_CQ_VECTOR_LEAST_ATTACHED ? - MLX4_LEAST_ATTACHED_VECTOR : vector, 0); + cq->db.dma, &cq->mcq, vector, 0, 0); if (err) goto err_dbmap; @@ -335,16 +341,23 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { struct mlx4_cqe *cqe, *new_cqe; int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, (i + 1) & cq->resize_buf->cqe); - memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe)); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; } ++cq->mcq.cons_index; } @@ -409,7 +422,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) } else { struct mlx4_ib_cq_buf tmp_buf; int tmp_cqe = 0; - + spin_lock_irq(&cq->lock); if (cq->resize_buf) { mlx4_ib_cq_resize_copy_cqes(cq); @@ -445,9 +458,21 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) out: mutex_unlock(&cq->resize_mutex); + return err; } +int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq) +{ + struct mlx4_ib_dev *dev = to_mdev(ibcq->device); + struct mlx4_ib_cq *cq = to_mcq(ibcq); + + if (dev->dev->caps.fw_ver < MLX4_FW_VER_IGNORE_OVERRUN_CQ) + return -ENOSYS; + + return mlx4_cq_ignore_overrun(dev->dev, &cq->mcq); +} + int mlx4_ib_destroy_cq(struct ib_cq *cq) { struct mlx4_ib_dev *dev = to_mdev(cq->device); @@ -473,7 +498,7 @@ static void dump_cqe(void *cqe) { __be32 *buf = cqe; - printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); @@ -483,7 +508,7 @@ static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ib_wc *wc) { if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { - printk(KERN_DEBUG "local QP operation err " + pr_debug("local QP operation err " "(QPN %06x, WQE index %x, vendor syndrome %02x, " "opcode = %02x)\n", be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), @@ -554,6 +579,26 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) checksum == cpu_to_be16(0xffff); } +static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; + wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; + wc->dlid_path_bits = 0; + + return 0; +} + static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) @@ -562,18 +607,20 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_qp *mqp; struct mlx4_ib_wq *wq; struct mlx4_ib_srq *srq; - struct mlx4_srq *msrq; int is_send; int is_error; u32 g_mlpath_rqpn; - int is_xrc_recv = 0; u16 wqe_ctr; + unsigned tail = 0; repoll: cqe = next_cqe_sw(cq); if (!cqe) return -EAGAIN; + if (cq->buf.entry_size == 64) + cqe++; + ++cq->mcq.cons_index; /* @@ -588,7 +635,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP && is_send)) { - printk(KERN_WARNING "Completion for NOP opcode detected!\n"); + pr_warn("Completion for NOP opcode detected!\n"); return -EINVAL; } @@ -608,24 +655,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, goto repoll; } - if ((be32_to_cpu(cqe->vlan_my_qpn) & (1 << 23)) && !is_send) { - /* - * We do not have to take the XRC SRQ table lock here, - * because CQs will be locked while XRC SRQs are removed - * from the table. - */ - msrq = __mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, - be32_to_cpu(cqe->g_mlpath_rqpn) & - 0xffffff); - if (unlikely(!msrq)) { - printk(KERN_WARNING "CQ %06x with entry for unknown " - "XRC SRQ %06x\n", cq->mcq.cqn, - be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff); - return -EINVAL; - } - is_xrc_recv = 1; - srq = to_mibsrq(msrq); - } else if (!*cur_qp || + if (!*cur_qp || (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { /* * We do not have to take the QP table lock here, @@ -635,7 +665,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, be32_to_cpu(cqe->vlan_my_qpn)); if (unlikely(!mqp)) { - printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n", + pr_warn("CQ %06x with entry for unknown QPN %06x\n", cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK); return -EINVAL; } @@ -643,7 +673,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, *cur_qp = to_mibqp(mqp); } - wc->qp = is_xrc_recv ? NULL: &(*cur_qp)->ibqp; + wc->qp = &(*cur_qp)->ibqp; if (is_send) { wq = &(*cur_qp)->sq; @@ -653,10 +683,6 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, } wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; - } else if (is_xrc_recv) { - wqe_ctr = be16_to_cpu(cqe->wqe_index); - wc->wr_id = srq->wrid[wqe_ctr]; - mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else if ((*cur_qp)->ibqp.srq) { srq = to_msrq((*cur_qp)->ibqp.srq); wqe_ctr = be16_to_cpu(cqe->wqe_index); @@ -664,7 +690,8 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; - wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; ++wq->tail; } @@ -747,14 +774,26 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, break; } + if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if ((*cur_qp)->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); + } + wc->slid = be16_to_cpu(cqe->rlid); - wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0; wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; - wc->csum_ok = mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum); + wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, + cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; + if (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) + wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; + else + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; } return 0; @@ -776,8 +815,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) break; } - if (npolled) - mlx4_cq_set_ci(&cq->mcq); + mlx4_cq_set_ci(&cq->mcq); spin_unlock_irqrestore(&cq->lock, flags); @@ -804,10 +842,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) int nfreed = 0; struct mlx4_cqe *cqe, *dest; u8 owner_bit; - int is_xrc_srq = 0; - - if (srq && srq->ibsrq.xrc_cq) - is_xrc_srq = 1; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; /* * First we need to find the current producer index, so we @@ -826,15 +861,16 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) */ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); - if (((be32_to_cpu(cqe->vlan_my_qpn) & 0xffffff) == qpn) || - (is_xrc_srq && - (be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff) == - srq->msrq.srqn)) { + cqe += cqe_inc; + + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); ++nfreed; } else if (nfreed) { dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; memcpy(dest, cqe, sizeof *cqe); dest->owner_sr_opcode = owner_bit | diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c index 2bb87ab7ec50..f130cdc44d8b 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c @@ -32,8 +32,13 @@ #include #include +#include +#include +#include #include +#include +#include #include "mlx4_ib.h" @@ -42,7 +47,62 @@ enum { MLX4_IB_VENDOR_CLASS2 = 0xa }; -int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + + /* Port mgmt change event handling */ + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __packed; + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __packed; + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __packed; + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __packed; + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap); + +__be64 mlx4_ib_gen_node_guid(void) +{ +#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) + return cpu_to_be64(NODE_GUID_HI | random()); +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad) { @@ -69,10 +129,13 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, * Key check traps can't be generated unless we have in_wc to * tell us where to send the trap. */ - if (ignore_mkey || !in_wc) + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) op_modifier |= 0x1; - if (ignore_bkey || !in_wc) + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) op_modifier |= 0x2; + if (mlx4_is_mfunc(dev->dev) && + (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) + op_modifier |= 0x8; if (in_wc) { struct { @@ -105,9 +168,10 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, in_modifier |= in_wc->slid << 16; } - err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, - in_modifier, op_modifier, - MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, + mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); if (!err) memcpy(response_mad, outmailbox->buf, 256); @@ -122,6 +186,7 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) { struct ib_ah *new_ah; struct ib_ah_attr ah_attr; + unsigned long flags; if (!dev->send_agent[port_num - 1][0]) return; @@ -136,53 +201,134 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) if (IS_ERR(new_ah)) return; - spin_lock(&dev->sm_lock); + spin_lock_irqsave(&dev->sm_lock, flags); if (dev->sm_ah[port_num - 1]) ib_destroy_ah(dev->sm_ah[port_num - 1]); dev->sm_ah[port_num - 1] = new_ah; - spin_unlock(&dev->sm_lock); + spin_unlock_irqrestore(&dev->sm_lock, flags); } /* - * Snoop SM MADs for port info and P_Key table sets, so we can - * synthesize LID change and P_Key change events. + * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can + * synthesize LID change, Client-Rereg, GID change, and P_Key change events. */ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, - u16 prev_lid) + u16 prev_lid) { - struct ib_event event; + struct ib_port_info *pinfo; + u16 lid; + __be16 *base; + u32 bn, pkey_change_bitmap; + int i; + + struct mlx4_ib_dev *dev = to_mdev(ibdev); if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && - mad->mad_hdr.method == IB_MGMT_METHOD_SET) { - if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { - struct ib_port_info *pinfo = - (struct ib_port_info *) ((struct ib_smp *) mad)->data; - u16 lid = be16_to_cpu(pinfo->lid); + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch (mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); - update_sm_ah(to_mdev(ibdev), port_num, + update_sm_ah(dev, port_num, be16_to_cpu(pinfo->sm_lid), pinfo->neighbormtu_mastersmsl & 0xf); - event.device = ibdev; - event.element.port_num = port_num; + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); - if (pinfo->clientrereg_resv_subnetto & 0x80) { - event.event = IB_EVENT_CLIENT_REREGISTER; - ib_dispatch_event(&event); + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + break; + + case IB_SMP_ATTR_PKEY_TABLE: + if (!mlx4_is_mfunc(dev->dev)) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + break; } - if (prev_lid != lid) { - event.event = IB_EVENT_LID_CHANGE; - ib_dispatch_event(&event); + /* at this point, we are running in the master. + * Slaves do not receive SMPs. + */ + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + pr_debug("PKEY[%d] = x%x\n", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } } + pr_debug("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); + + if (pkey_change_bitmap) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + if (!dev->sriov.is_going_down) + __propagate_pkey_ev(dev, port_num, bn, + pkey_change_bitmap); + } + break; + + case IB_SMP_ATTR_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + if (mlx4_is_master(dev->dev) && + !dev->sriov.is_going_down) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + } + break; + + default: + break; } +} - if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { - event.device = ibdev; - event.event = IB_EVENT_PKEY_CHANGE; - event.element.port_num = port_num; - ib_dispatch_event(&event); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) +{ + int i, ix, slave, err; + int have_event = 0; + + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == mlx4_master_func_num(dev->dev)) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + pr_debug("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)\n", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; } } } @@ -190,13 +336,15 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, static void node_desc_override(struct ib_device *dev, struct ib_mad *mad) { + unsigned long flags; + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { - spin_lock(&to_mdev(dev)->sm_lock); + spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); - spin_unlock(&to_mdev(dev)->sm_lock); + spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); } } @@ -206,47 +354,357 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; int ret; + unsigned long flags; if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; /* * We rely here on the fact that MLX QPs don't use the * address handle after the send is posted (this is * wrong following the IB spec strictly, but we know * it's OK for our devices). */ - spin_lock(&dev->sm_lock); + spin_lock_irqsave(&dev->sm_lock, flags); memcpy(send_buf->mad, mad, sizeof *mad); if ((send_buf->ah = dev->sm_ah[port_num - 1])) ret = ib_post_send_mad(send_buf, NULL); else ret = -EINVAL; - spin_unlock(&dev->sm_lock); + spin_unlock_irqrestore(&dev->sm_lock, flags); if (ret) ib_free_send_mad(send_buf); } } -static int is_vendor_id(__be16 attr_id) +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) { - return (attr_id & IB_SMP_ATTR_VENDOR_MASK) == IB_SMP_ATTR_VENDOR_MASK; + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; } -static int supported_vendor_id(__be16 attr_id) +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) { - return 1; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct ib_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + /* QP0 forwarding only for Dom0 */ + if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave)) + return -EINVAL; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute P_Key index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah. Just need an empty one with the port num for the post send. + * The driver will set the force loopback bit in post_send */ + memset(&attr, 0, sizeof attr); + attr.port_num = port; + if (is_eth) { + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); + attr.ah_flags = IB_AH_GRH; + } + ah = ib_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto out; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof *grh); + memcpy(&tun_mad->mad, mad, sizeof *mad); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); + tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof (struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; + wr.wr.ud.remote_qpn = dqpn; + wr.next = NULL; + wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + int slave; + u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!wc->wc_flags & IB_WC_GRH) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } + + /* Initially assume that this mad is for us */ + slave = mlx4_master_func_num(dev->dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8 *) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); + if (slave < 0) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth)) + return 0; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + pr_debug("dropping unsupported ingress mad from class:%d " + "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; } static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad) + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid, prev_lid = 0; int err; struct ib_port_attr pattr; + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + (long long)be64_to_cpu(in_grh->sgid.global.subnet_prefix), + (long long) + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + (long long)be64_to_cpu(in_grh->dgid.global.subnet_prefix), + (long long)be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { @@ -262,12 +720,9 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS; /* - * Don't process SMInfo queries or vendor-specific - * MADs -- the SMA can't handle them. + * Don't process SMInfo queries -- the SMA can't handle them. */ - if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO || - (is_vendor_id(in_mad->mad_hdr.attr_id) && - !supported_vendor_id(in_mad->mad_hdr.attr_id))) + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) return IB_MAD_RESULT_SUCCESS; } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || @@ -287,15 +742,19 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, prev_lid = pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), - mad_flags & IB_MAD_IGNORE_MKEY, - mad_flags & IB_MAD_IGNORE_BKEY, + (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | + (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | + MLX4_MAD_IFC_NET_VIEW, port_num, in_wc, in_grh, in_mad, out_mad); if (err) return IB_MAD_RESULT_FAILURE; if (!out_mad->mad_hdr.status) { - smp_snoop(ibdev, port_num, in_mad, prev_lid); - node_desc_override(ibdev, out_mad); + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)) + smp_snoop(ibdev, port_num, in_mad, prev_lid); + /* slaves get node desc from FW */ + if (!mlx4_is_slave(to_mdev(ibdev)->dev)) + node_desc_override(ibdev, out_mad); } /* set return bit in status of directed route responses */ @@ -309,72 +768,235 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } -static __be32 be64_to_be32(__be64 b64) +static void edit_counter_ext(struct mlx4_if_stat_extended *cnt, void *counters, + __be16 attr_id) { - return cpu_to_be32(be64_to_cpu(b64) & 0xffffffff); + switch (attr_id) { + case IB_PMA_PORT_COUNTERS: + { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)counters; + pma_cnt->port_xmit_data = + cpu_to_be32((be64_to_cpu(cnt->counters[0]. + IfTxUnicastOctets) + + be64_to_cpu(cnt->counters[0]. + IfTxMulticastOctets) + + be64_to_cpu(cnt->counters[0]. + IfTxBroadcastOctets) + + be64_to_cpu(cnt->counters[0]. + IfTxDroppedOctets)) >> 2); + pma_cnt->port_rcv_data = + cpu_to_be32((be64_to_cpu(cnt->counters[0]. + IfRxUnicastOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxMulticastOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxBroadcastOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxNoBufferOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxErrorOctets)) >> 2); + pma_cnt->port_xmit_packets = + cpu_to_be32(be64_to_cpu(cnt->counters[0]. + IfTxUnicastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxMulticastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxBroadcastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxDroppedFrames)); + pma_cnt->port_rcv_packets = + cpu_to_be32(be64_to_cpu(cnt->counters[0]. + IfRxUnicastFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxMulticastFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxBroadcastFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxNoBufferFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxErrorFrames)); + pma_cnt->port_rcv_errors = cpu_to_be32(be64_to_cpu(cnt-> + counters[0]. + IfRxErrorFrames)); + break; + } + + case IB_PMA_PORT_COUNTERS_EXT: + { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)counters; + + pma_cnt_ext->port_xmit_data = + cpu_to_be64((be64_to_cpu(cnt->counters[0]. + IfTxUnicastOctets) + + be64_to_cpu(cnt->counters[0]. + IfTxMulticastOctets) + + be64_to_cpu(cnt->counters[0]. + IfTxBroadcastOctets) + + be64_to_cpu(cnt->counters[0]. + IfTxDroppedOctets)) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64((be64_to_cpu(cnt->counters[0]. + IfRxUnicastOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxMulticastOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxBroadcastOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxNoBufferOctets) + + be64_to_cpu(cnt->counters[0]. + IfRxErrorOctets)) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(be64_to_cpu(cnt->counters[0]. + IfTxUnicastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxMulticastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxBroadcastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxDroppedFrames)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(be64_to_cpu(cnt->counters[0]. + IfRxUnicastFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxMulticastFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxBroadcastFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxNoBufferFrames) + + be64_to_cpu(cnt->counters[0]. + IfRxErrorFrames)); + pma_cnt_ext->port_unicast_xmit_packets = cnt->counters[0]. + IfTxUnicastFrames; + pma_cnt_ext->port_unicast_rcv_packets = cnt->counters[0]. + IfRxUnicastFrames; + pma_cnt_ext->port_multicast_xmit_packets = + cpu_to_be64(be64_to_cpu(cnt->counters[0]. + IfTxMulticastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxBroadcastFrames)); + pma_cnt_ext->port_multicast_rcv_packets = + cpu_to_be64(be64_to_cpu(cnt->counters[0]. + IfTxMulticastFrames) + + be64_to_cpu(cnt->counters[0]. + IfTxBroadcastFrames)); + + break; + } + + default: + pr_warn("Unsupported attr_id 0x%x\n", attr_id); + break; + } + } -static void edit_counters(struct mlx4_counters *cnt, void *data) +static void edit_counter(struct mlx4_if_stat_basic *cnt, void *counters, + __be16 attr_id) { - *(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_bytes); - *(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_bytes); - *(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_frames); - *(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_frames); + switch (attr_id) { + case IB_PMA_PORT_COUNTERS: + { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *) counters; + pma_cnt->port_xmit_data = + cpu_to_be32(be64_to_cpu( + cnt->counters[0].IfTxOctets) >> 2); + pma_cnt->port_rcv_data = + cpu_to_be32(be64_to_cpu( + cnt->counters[0].IfRxOctets) >> 2); + pma_cnt->port_xmit_packets = + cpu_to_be32(be64_to_cpu(cnt->counters[0].IfTxFrames)); + pma_cnt->port_rcv_packets = + cpu_to_be32(be64_to_cpu(cnt->counters[0].IfRxFrames)); + break; + } + case IB_PMA_PORT_COUNTERS_EXT: + { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *) counters; + + pma_cnt_ext->port_xmit_data = + cpu_to_be64((be64_to_cpu(cnt->counters[0]. + IfTxOctets) >> 2)); + pma_cnt_ext->port_rcv_data = + cpu_to_be64((be64_to_cpu(cnt->counters[0]. + IfRxOctets) >> 2)); + pma_cnt_ext->port_xmit_packets = cnt->counters[0].IfTxFrames; + pma_cnt_ext->port_rcv_packets = cnt->counters[0].IfRxFrames; + break; + } + default: + pr_warn("Unsupported attr_id 0x%x\n", attr_id); + break; + } } -static void edit_ext_counters(struct mlx4_counters_ext *cnt, void *data) -{ - *(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_uni_bytes); - *(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_uni_bytes); - *(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_uni_frames); - *(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_uni_frames); - *(__be32 *)(data + 40 + 8) = be64_to_be32(cnt->rx_err_frames); -} - -static int rdmaoe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad) +int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, + union mlx4_counter *counter, u8 clear) { struct mlx4_cmd_mailbox *mailbox; - struct mlx4_ib_dev *dev = to_mdev(ibdev); int err; - u32 inmod = dev->counters[port_num - 1] & 0xffff; - int mode; - - if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) - return -EINVAL; + u32 inmod = counter_index | ((clear & 1) << 31); mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return IB_MAD_RESULT_FAILURE; err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, - MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C); - if (err) - err = IB_MAD_RESULT_FAILURE; - else { - memset(out_mad->data, 0, sizeof out_mad->data); - mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf; - switch (mode) { - case 0: - edit_counters(mailbox->buf, out_mad->data); - err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; - break; - case 1: - edit_ext_counters(mailbox->buf, out_mad->data); - err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; - break; - default: - err = IB_MAD_RESULT_FAILURE; - } - } + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_WRAPPED); + if (!err) + memcpy(counter, mailbox->buf, MLX4_IF_STAT_SZ(1)); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } -int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, +static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + u32 counter_index = dev->counters[port_num - 1] & 0xffff; + u8 mode; + char counter_buf[MLX4_IF_STAT_SZ(1)]; + union mlx4_counter *counter = (union mlx4_counter *) + counter_buf; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return -EINVAL; + + if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) { + err = IB_MAD_RESULT_FAILURE; + } else { + memset(out_mad->data, 0, sizeof out_mad->data); + mode = counter->control.cnt_mode & 0xFF; + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + switch (mode & 0xf) { + case 0: + edit_counter((void *)counter, + (void *)(out_mad->data + 40), + in_mad->mad_hdr.attr_id); + break; + case 1: + edit_counter_ext((void *)counter, + (void *)(out_mad->data + 40), + in_mad->mad_hdr.attr_id); + break; + default: + err = IB_MAD_RESULT_FAILURE; + } + } + + + return err; +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { @@ -383,7 +1005,7 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return ib_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); case IB_LINK_LAYER_ETHERNET: - return rdmaoe_process_mad(ibdev, mad_flags, port_num, in_wc, + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); default: return -EINVAL; @@ -393,6 +1015,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { + if (mad_send_wc->send_buf->context[0]) + ib_destroy_ah(mad_send_wc->send_buf->context[0]); ib_free_send_mad(mad_send_wc->send_buf); } @@ -450,3 +1074,1221 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) ib_destroy_ah(dev->sm_ah[p]); } } + +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); +} + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + /* re-configure the alias-guid and mcg's */ + if (mlx4_is_master(dev->dev)) { + mlx4_ib_invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); + } + } + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_eqe *eqe) +{ + __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), + GET_MASK_FROM_EQE(eqe)); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); + goto out; + } + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, + MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, + port_num, NULL, NULL, in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); + goto out; + } + + mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + } + +out: + kfree(in_mad); + kfree(out_mad); + return; +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + u32 tbl_block; + u32 change_bitmap; + + switch (eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes so that MADs can be sent to the SM */ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + update_sm_ah(dev, port, lid, sl); + } + + /* Check if it is a lid change event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) + handle_lid_change_event(dev, port); + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify all slaves*/ + if (mlx4_is_master(dev->dev)) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) + handle_client_rereg_event(dev, port); + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + propagate_pkey_ev(dev, port, eqe); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + else if (!dev->sriov.is_going_down) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } + break; + default: + pr_warn("Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = type; + + ib_dispatch_event(&event); +} + +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? + sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->mr->lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; + + return (qpn >= proxy_start && qpn <= proxy_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + unsigned wire_tx_ix = 0; + int ret = 0; + u16 wire_pkey_ix; + int src_qpnum; + u8 sgid_index; + + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + /* QP0 forwarding only for Dom0 */ + if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave)) + return -EINVAL; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + sgid_index = attr->grh.sgid_index; + attr->grh.sgid_index = 0; + ah = ib_create_ah(sqp_ctx->pd, attr); + if (IS_ERR(ah)) + return -ENOMEM; + attr->grh.sgid_index = sgid_index; + to_mah(ah)->av.ib.gid_index = sgid_index; + /* get rid of force-loopback bit */ + to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof *mad); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof (struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.pkey_index = wire_pkey_ix; + wr.wr.ud.remote_qkey = qkey; + wr.wr.ud.remote_qpn = remote_qpn; + wr.next = NULL; + wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(send_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + int gids; + int vfs; + + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = dev->dev->num_vfs; + + if (slave == 0) + return 0; + if (slave <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); +} + +static int get_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) { + ah_attr->grh.sgid_index = slave; + return 0; + } + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); + return 0; +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct ib_ah_attr ah_attr; + u8 *slave_id; + int slave; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || + wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "non-master trying to send QP0 packets\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof (struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET && + tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) + return; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if (ah_attr.ah_flags & IB_AH_GRH) + if (get_real_sgid_index(dev, slave, ctx->port, &ah_attr)) + return; + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + be16_to_cpu(tunnel->hdr.pkey_index), + be32_to_cpu(tunnel->hdr.remote_qpn), + be32_to_cpu(tunnel->hdr.qkey), + &ah_attr, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof (struct mlx4_ib_tun_tx_buf), + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + ib_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + pr_err("Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: + pr_debug("received tunnel send completion:" + "wrid=0x%llx, status=0x%x\n", + wc.wr_id, wc.status); + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! */ + pr_err("Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_TUNNEL_QP; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + pr_err("Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + pr_err("Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + pr_err("Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + pr_err("Couldn't change %s qp state to RTS (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + pr_err(" mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + pr_err("Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + BUG_ON(1); + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) { + pr_err("failed allocating pv resource context " + "for port %d, slave %d\n", port, slave); + return -ENOMEM; + } + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + + if (ctx->state != DEMUX_PV_STATE_DOWN) + return -EEXIST; + + ctx->state = DEMUX_PV_STATE_STARTING; + /* have QP0 only on port owner, and only if link layer is IB */ + if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) && + rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) + cq_size *= 2; + + ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, + NULL, ctx, cq_size, 0); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + pr_err("Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + pr_err("Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(ctx->mr)) { + ret = PTR_ERR(ctx->mr); + pr_err("Couldn't get tunnel DMA MR (%d)\n", ret); + goto err_pd; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_mr; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + pr_err("Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_mr: + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, + int port, int do_init) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port - 1], slave); + /* for master, destroy real sqp resources */ + if (slave == mlx4_master_func_num(dev->dev)) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + /* destroy the tunnel qp resources */ + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + /* create the tunnel qp resources */ + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + + /* for master, create the real sqp resources */ + if (!ret && slave == mlx4_master_func_num(dev->dev)) + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + dev->sriov.sqps[port - 1]); + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, + dmxw->do_init); + kfree(dmxw); + return; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = kcalloc(dev->dev->caps.sqp_demux, + sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + pr_err("Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof name, "mlx4_ibt%d", port); + ctx->wq = create_singlethread_workqueue(name); + if (!ctx->wq) { + pr_err("Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof name, "mlx4_ibud%d", port); + ctx->ud_wq = create_singlethread_workqueue(name); + if (!ctx->ud_wq) { + pr_err("Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dereg_mr(sqp_ctx->mr); + sqp_ctx->mr = NULL; + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wq); + } +} + +static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!mlx4_is_master(dev->dev)) + return; + /* initialize or tear down tunnel QPs for the master */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); + return; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n"); + + if (mlx4_is_slave(dev->dev)) { + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + return 0; + } + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (i == mlx4_master_func_num(dev->dev)) + mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); + else + mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); + } + + err = mlx4_ib_init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + union ib_gid gid; + err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); + if (err) + goto demux_err; + dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; + err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto demux_err; + } + mlx4_ib_master_tunnels(dev, 1); + return 0; + +demux_err: + while (i > 0) { + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + --i; + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + mlx4_ib_destroy_alias_guid_service(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (mlx4_is_master(dev->dev)) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + mlx4_ib_destroy_alias_guid_service(dev); + mlx4_ib_device_unregister_sysfs(dev); + } +} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/main.c b/sys/ofed/drivers/infiniband/hw/mlx4/main.c index bc99414ce3ba..328bb5a77c67 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/main.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/main.c @@ -32,12 +32,20 @@ */ #include + +#ifdef __linux__ +#include +#endif + #include +#include #include #include #include #include #include +#include +#include #include #include @@ -45,45 +53,63 @@ #include #include - +#include #include "mlx4_ib.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME -#define DRV_VERSION "1.0-ofed1.5.2" -#define DRV_RELDATE "August 4, 2010" +#define DRV_VERSION "1.0" +#define DRV_RELDATE "April 4, 2008" + +#define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" +#define MLX4_IB_MRS_PROC_DIR_NAME "mrs" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); -#ifdef CONFIG_MLX4_DEBUG +int mlx4_ib_sm_guid_assign = 1; -int mlx4_ib_debug_level = 0; -module_param_named(debug_level, mlx4_ib_debug_level, int, 0644); -MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +#ifdef __linux__ +struct proc_dir_entry *mlx4_mrs_dir_entry; +static struct proc_dir_entry *mlx4_ib_driver_dir_entry; +#endif -#endif /* CONFIG_MLX4_DEBUG */ +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); + +static char dev_assign_str[512]; +//module_param_string(dev_assign_str, dev_assign_str, sizeof(dev_assign_str), 0644); +MODULE_PARM_DESC(dev_assign_str, "Map all device function numbers to " + "IB device numbers following the pattern: " + "bb:dd.f-0,bb:dd.f-1,... (all numbers are hexadecimals)." + " Max supported devices - 32"); static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; -static void *get_ibdev(struct mlx4_dev *dev, void *ctx, u8 port) -{ - struct mlx4_ib_dev *mlxibdev = ctx; - return &mlxibdev->ib_dev; -} - struct update_gid_work { - struct work_struct work; - union ib_gid gids[128]; - int port; - struct mlx4_ib_dev *dev; + struct work_struct work; + union ib_gid gids[128]; + struct mlx4_ib_dev *dev; + int port; }; +struct dev_rec { + int bus; + int dev; + int func; + int nr; +}; + +#define MAX_DR 32 +static struct dev_rec dr[MAX_DR]; + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); + static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) @@ -112,7 +138,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, + 1, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -123,7 +150,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | + IB_DEVICE_SHARED_MR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) @@ -144,42 +173,45 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; - if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY) - props->max_raw_ethy_qp = dev->ib_dev.phys_port_cnt; + props->device_cap_flags |= IB_DEVICE_QPG; + if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { + props->device_cap_flags |= IB_DEVICE_UD_RSS; + props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; + } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; - props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; + props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; + props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; - props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; + props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; + props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; - props->max_fast_reg_page_list_len = MAX_FAST_REG_PAGES; + props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; - props->masked_atomic_cap = IB_ATOMIC_HCA; + props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; - props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1; + props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; out: kfree(in_mad); @@ -197,10 +229,33 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } -static void ib_link_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props, - struct ib_smp *out_mad) +static int ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) { + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); @@ -208,7 +263,10 @@ static void ib_link_query_port(struct ib_device *ibdev, u8 port, props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); - props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + if (netw_view) + props->gid_tbl_len = out_mad->data[50]; + else + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); @@ -220,39 +278,46 @@ static void ib_link_query_port(struct ib_device *ibdev, u8 port, props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; - props->link_layer = IB_LINK_LAYER_INFINIBAND; -} -#ifdef notyet -static int eth_to_ib_width(int w) -{ - switch (w) { - case 4: - return IB_WIDTH_4X; - case 8: - case 16: - return IB_WIDTH_8X; - case 32: - return IB_WIDTH_12X; - default: - return IB_WIDTH_1X; - } -} + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; -static int eth_to_ib_speed(int s) -{ - switch (s) { - case 256: - return 1; - case 512: - return 2; - case 1024: - return 4; - default: - return 1; + switch (ext_active_speed) { + case 1: + props->active_speed = IB_SPEED_FDR; + break; + case 2: + props->active_speed = IB_SPEED_EDR; + break; + } } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == IB_SPEED_QDR) { + init_query_mad(in_mad); + in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = IB_SPEED_FDR10; + } + + /* Avoid wrong speed value returned by FW if the IB link is down. */ + if (props->state == IB_PORT_DOWN) + props->active_speed = IB_SPEED_SDR; + +out: + kfree(in_mad); + kfree(out_mad); + return err; } -#endif static u8 state_to_phys_state(enum ib_port_state state) { @@ -260,88 +325,90 @@ static u8 state_to_phys_state(enum ib_port_state state) } static int eth_link_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props, - struct ib_smp *out_mad) + struct ib_port_attr *props, int netw_view) { - struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe; + + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; + struct mlx4_cmd_mailbox *mailbox; + int err = 0; - props->active_width = IB_WIDTH_4X; - props->active_speed = 1; + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? + IB_WIDTH_4X : IB_WIDTH_1X; + props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; - props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; - props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; + if (netw_view) + props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; + else + props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; + + props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; - props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); - props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); - props->max_mtu = IB_MTU_2048; - props->subnet_timeout = 0; - props->max_vl_num = out_mad->data[37] >> 4; - props->init_type_reply = 0; - props->link_layer = IB_LINK_LAYER_ETHERNET; + props->max_mtu = IB_MTU_4096; + props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock(&iboe->lock); ndev = iboe->netdevs[port - 1]; if (!ndev) - goto out; + goto out_unlock; -#ifdef __linux__ - tmp = iboe_get_mtu(ndev->mtu); -#else tmp = iboe_get_mtu(ndev->if_mtu); -#endif props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; - props->state = netif_carrier_ok(ndev) && netif_oper_up(ndev) ? + + props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); - -out: +out_unlock: spin_unlock(&iboe->lock); - return 0; +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + int err; + + memset(props, 0, sizeof *props); + + err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? + ib_link_query_port(ibdev, port, props, netw_view) : + eth_link_query_port(ibdev, port, props, netw_view); + + return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { - struct ib_smp *in_mad = NULL; - struct ib_smp *out_mad = NULL; - int err = -ENOMEM; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) - goto out; - - memset(props, 0, sizeof *props); - - init_query_mad(in_mad); - in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; - in_mad->attr_mod = cpu_to_be32(port); - - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); - if (err) - goto out; - - mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? - ib_link_query_port(ibdev, port, props, out_mad) : - eth_link_query_port(ibdev, port, props, out_mad); - -out: - kfree(in_mad); - kfree(out_mad); - - return err; + /* returns host view */ + return __mlx4_ib_query_port(ibdev, port, props, 0); } -static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, - union ib_gid *gid) +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int clear = 0; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -352,30 +419,45 @@ static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (mlx4_is_mfunc(dev->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); + if (mlx4_is_mfunc(dev->dev) && !netw_view) { + if (index) { + /* For any index > 0, return the null guid */ + err = 0; + clear = 1; + goto out; + } + } + init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: + if (clear) + memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, - union ib_gid *gid) + union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); @@ -388,16 +470,17 @@ static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) - return __mlx4_ib_query_gid(ibdev, port, index, gid); + return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } -static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); @@ -409,7 +492,11 @@ static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index / 32); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); if (err) goto out; @@ -421,11 +508,16 @@ static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, return err; } +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); +} + static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; - int err; + unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; @@ -433,12 +525,16 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; - spin_lock(&to_mdev(ibdev)->sm_lock); - memcpy(ibdev->node_desc, props->node_desc, 64); - spin_unlock(&to_mdev(ibdev)->sm_lock); + if (mlx4_is_slave(to_mdev(ibdev)->dev)) + return -EOPNOTSUPP; - /* if possible, pass node desc to FW, so it can generate - * a 144 trap. If cmd fails, just ignore. + spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); + memcpy(ibdev->node_desc, props->node_desc, 64); + spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) @@ -446,10 +542,8 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); - err = mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, - MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A); - if (err) - mlx4_ib_dbg("SET_NODE command failed (%d)", err); + mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); @@ -478,7 +572,7 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; @@ -514,23 +608,36 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - resp.qp_tab_size = dev->dev->caps.num_qps; - - if (mlx4_wc_enabled()) { - resp.bf_reg_size = dev->dev->caps.bf_reg_size; - resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + if (mlx4_wc_enabled()) { + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp_v3.bf_reg_size = 0; + resp_v3.bf_regs_per_page = 0; + } } else { - resp.bf_reg_size = 0; - resp.bf_regs_per_page = 0; + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + if (mlx4_wc_enabled()) { + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.bf_reg_size = 0; + resp.bf_regs_per_page = 0; + } + resp.cqe_size = dev->dev->caps.cqe_size; } - context = kzalloc(sizeof *context, GFP_KERNEL); + context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); @@ -543,7 +650,11 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - err = ib_copy_to_udata(udata, &resp, sizeof resp); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); @@ -562,22 +673,82 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } +#ifdef __linux__ +static unsigned long mlx4_ib_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long start_addr; + unsigned long page_size_order; + unsigned long command; + + mm = current->mm; + if (addr) + return current->mm->get_unmapped_area(file, addr, len, + pgoff, flags); + + /* Last 8 bits hold the command others are data per that command */ + command = pgoff & MLX4_IB_MMAP_CMD_MASK; + if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) + return current->mm->get_unmapped_area(file, addr, len, + pgoff, flags); + + page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; + /* code is based on the huge-pages get_unmapped_area code */ + start_addr = mm->free_area_cache; + + if (len <= mm->cached_hole_size) + start_addr = TASK_UNMAPPED_BASE; + + +full_search: + addr = ALIGN(start_addr, 1 << page_size_order); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, 1 << page_size_order); + } +} static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); + int err; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; + /* Last 8 bits hold the command others are data per that command */ + unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; - if (vma->vm_pgoff == 0) { + if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { + /* compatability handling for commands 0 & 1*/ + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + } + if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; - } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { + } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && + dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, @@ -585,11 +756,31 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; + } else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { + /* Getting contiguous physical pages */ + unsigned long total_size = vma->vm_end - vma->vm_start; + unsigned long page_size_order = (vma->vm_pgoff) >> + MLX4_IB_MMAP_CMD_BITS; + struct ib_cmem *ib_cmem; + ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size, + page_size_order); + if (IS_ERR(ib_cmem)) { + err = PTR_ERR(ib_cmem); + return err; + } + + err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma); + if (err) { + ib_cmem_release_contiguous_pages(ib_cmem); + return err; + } + return 0; } else return -EINVAL; return 0; } +#endif static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, @@ -598,7 +789,7 @@ static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct mlx4_ib_pd *pd; int err; - pd = kzalloc(sizeof *pd, GFP_KERNEL); + pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); @@ -626,11 +817,62 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd) return 0; } +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + int err; + + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); + if (err) + goto err1; + + xrcd->pd = ib_alloc_pd(ibdev); + if (IS_ERR(xrcd->pd)) { + err = PTR_ERR(xrcd->pd); + goto err2; + } + + xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); + if (IS_ERR(xrcd->cq)) { + err = PTR_ERR(xrcd->cq); + goto err3; + } + + return &xrcd->ibxrcd; + +err3: + ib_dealloc_pd(xrcd->pd); +err2: + mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); +err1: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + ib_destroy_cq(to_mxrcd(xrcd)->cq); + ib_dealloc_pd(to_mxrcd(xrcd)->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); - struct gid_entry *ge; + struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) @@ -658,11 +900,13 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, if (!mqp->port) return 0; + spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); + if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); @@ -675,37 +919,269 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, return ret; } +struct mlx4_ib_steering { + struct list_head list; + u64 reg_id; + union ib_gid gid; +}; + static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); + u64 reg_id; + struct mlx4_ib_steering *ib_steering = NULL; - err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags & - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - (ibqp->qp_type == IB_QPT_RAW_ETH) ? - MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB); + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + return -ENOMEM; + } + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + MLX4_PROT_IB_IPV6, ®_id); if (err) - return err; + goto err_malloc; err = add_gid_entry(ibqp, gid); if (err) goto err_add; + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + ib_steering->reg_id = reg_id; + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + } return 0; err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - (ibqp->qp_type == IB_QPT_RAW_ETH) ? - MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB); + MLX4_PROT_IB_IPV6, reg_id); +err_malloc: + kfree(ib_steering); + return err; } -static struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +enum { + IBV_FLOW_L4_NONE = 0, + IBV_FLOW_L4_OTHER = 3, + IBV_FLOW_L4_UDP = 5, + IBV_FLOW_L4_TCP = 6 +}; + +struct mlx4_cm_steering { + struct list_head list; + u64 reg_id; + struct ib_flow_spec spec; +}; + +static int flow_spec_to_net_rule(struct ib_device *dev, struct ib_flow_spec *flow_spec, + struct list_head *rule_list_h) { - struct gid_entry *ge; - struct gid_entry *tmp; - struct gid_entry *ret = NULL; + struct mlx4_spec_list *spec_l2, *spec_l3, *spec_l4; + u64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL); + if (!spec_l2) + return -ENOMEM; + + switch (flow_spec->type) { + case IB_FLOW_ETH: + spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_l2->eth.dst_mac, flow_spec->l2_id.eth.mac, ETH_ALEN); + memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN); + spec_l2->eth.ether_type = flow_spec->l2_id.eth.ethertype; + if (flow_spec->l2_id.eth.vlan_present) { + spec_l2->eth.vlan_id = flow_spec->l2_id.eth.vlan; + spec_l2->eth.vlan_id_msk = cpu_to_be16(0x0fff); + } + break; + case IB_FLOW_IB_UC: + spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; + if(flow_spec->l2_id.ib_uc.qpn) { + spec_l2->ib.r_u_qpn = cpu_to_be32(flow_spec->l2_id.ib_uc.qpn); + spec_l2->ib.qpn_msk = cpu_to_be32(0xffffff); + } + break; + case IB_FLOW_IB_MC_IPV4: + case IB_FLOW_IB_MC_IPV6: + spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; + memcpy(spec_l2->ib.dst_gid, flow_spec->l2_id.ib_mc.mgid, 16); + memset(spec_l2->ib.dst_gid_msk, 0xff, 16); + break; + } + + + list_add_tail(&spec_l2->list, rule_list_h); + + if (flow_spec->l2_id.eth.ethertype == cpu_to_be16(ETH_P_IP) || + flow_spec->type != IB_FLOW_ETH) { + spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL); + if (!spec_l3) + return -ENOMEM; + + spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; + spec_l3->ipv4.src_ip = flow_spec->src_ip; + if (flow_spec->type != IB_FLOW_IB_MC_IPV4 && + flow_spec->type != IB_FLOW_IB_MC_IPV6) + spec_l3->ipv4.dst_ip = flow_spec->dst_ip; + + if (spec_l3->ipv4.src_ip) + spec_l3->ipv4.src_ip_msk = MLX4_BE_WORD_MASK; + if (spec_l3->ipv4.dst_ip) + spec_l3->ipv4.dst_ip_msk = MLX4_BE_WORD_MASK; + + list_add_tail(&spec_l3->list, rule_list_h); + } + + if (flow_spec->l4_protocol) { + spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL); + if (!spec_l4) + return -ENOMEM; + + spec_l4->tcp_udp.src_port = flow_spec->src_port; + spec_l4->tcp_udp.dst_port = flow_spec->dst_port; + if (spec_l4->tcp_udp.src_port) + spec_l4->tcp_udp.src_port_msk = + MLX4_BE_SHORT_MASK; + if (spec_l4->tcp_udp.dst_port) + spec_l4->tcp_udp.dst_port_msk = + MLX4_BE_SHORT_MASK; + + switch (flow_spec->l4_protocol) { + case IBV_FLOW_L4_UDP: + spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP; + break; + case IBV_FLOW_L4_TCP: + spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP; + break; + default: + dev_err(dev->dma_device, + "Unsupported l4 protocol.\n"); + kfree(spec_l4); + return -EPROTONOSUPPORT; + } + list_add_tail(&spec_l4->list, rule_list_h); + } + return 0; +} + +static int __mlx4_ib_flow_attach(struct mlx4_ib_dev *mdev, + struct mlx4_ib_qp *mqp, + struct ib_flow_spec *flow_spec, + int priority, int lock_qp) +{ + u64 reg_id = 0; + int err = 0; + struct mlx4_cm_steering *cm_flow; + struct mlx4_spec_list *spec, *tmp_spec; + + struct mlx4_net_trans_rule rule = + { .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + }; + + rule.promisc_mode = flow_spec->rule_type; + rule.port = mqp->port; + rule.qpn = mqp->mqp.qpn; + INIT_LIST_HEAD(&rule.list); + + cm_flow = kmalloc(sizeof(*cm_flow), GFP_KERNEL); + if (!cm_flow) + return -ENOMEM; + + if (rule.promisc_mode == MLX4_FS_REGULAR) { + rule.allow_loopback = !flow_spec->block_mc_loopback; + rule.priority = MLX4_DOMAIN_UVERBS | priority; + err = flow_spec_to_net_rule(&mdev->ib_dev, flow_spec, + &rule.list); + if (err) + goto free_list; + } + + err = mlx4_flow_attach(mdev->dev, &rule, ®_id); + if (err) + goto free_list; + + memcpy(&cm_flow->spec, flow_spec, sizeof(*flow_spec)); + cm_flow->reg_id = reg_id; + + if (lock_qp) + mutex_lock(&mqp->mutex); + list_add(&cm_flow->list, &mqp->rules_list); + if (lock_qp) + mutex_unlock(&mqp->mutex); + +free_list: + list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) { + list_del(&spec->list); + kfree(spec); + } + if (err) { + kfree(cm_flow); + dev_err(mdev->ib_dev.dma_device, + "Fail to attach flow steering rule\n"); + } + return err; +} + +static int __mlx4_ib_flow_detach(struct mlx4_ib_dev *mdev, + struct mlx4_ib_qp *mqp, + struct ib_flow_spec *spec, int priority, + int lock_qp) +{ + struct mlx4_cm_steering *cm_flow; + int ret; + + if (lock_qp) + mutex_lock(&mqp->mutex); + list_for_each_entry(cm_flow, &mqp->rules_list, list) { + if (!memcmp(&cm_flow->spec, spec, sizeof(*spec))) { + list_del(&cm_flow->list); + break; + } + } + if (lock_qp) + mutex_unlock(&mqp->mutex); + + if (&cm_flow->list == &mqp->rules_list) { + dev_err(mdev->ib_dev.dma_device, "Couldn't find reg_id for flow spec. " + "Steering rule is left attached\n"); + return -EINVAL; + } + + ret = mlx4_flow_detach(mdev->dev, cm_flow->reg_id); + + kfree(cm_flow); + return ret; +} + +static int mlx4_ib_flow_attach(struct ib_qp *qp, struct ib_flow_spec *flow_spec, + int priority) +{ + return __mlx4_ib_flow_attach(to_mdev(qp->device), to_mqp(qp), + flow_spec, priority, 1); +} + +static int mlx4_ib_flow_detach(struct ib_qp *qp, struct ib_flow_spec *spec, + int priority) +{ + return __mlx4_ib_flow_detach(to_mdev(qp->device), to_mqp(qp), + spec, priority, 1); +} + +static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +{ + struct mlx4_ib_gid_entry *ge; + struct mlx4_ib_gid_entry *tmp; + struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { @@ -724,11 +1200,31 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_qp *mqp = to_mqp(ibqp); u8 mac[6]; struct net_device *ndev; - struct gid_entry *ge; + struct mlx4_ib_gid_entry *ge; + u64 reg_id = 0; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &mqp->steering_rules, list) { + if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { + list_del(&ib_steering->list); + break; + } + } + mutex_unlock(&mqp->mutex); + if (&ib_steering->list == &mqp->steering_rules) { + pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); + return -EINVAL; + } + reg_id = ib_steering->reg_id; + kfree(ib_steering); + } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - (ibqp->qp_type == IB_QPT_RAW_ETH) ? - MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB); + MLX4_PROT_IB_IPV6, reg_id); if (err) return err; @@ -750,91 +1246,18 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) list_del(&ge->list); kfree(ge); } else - printk(KERN_WARNING "could not find mgid entry\n"); + pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return 0; } -static void mlx4_dummy_comp_handler(struct ib_cq *cq, void *cq_context) -{ -} - -static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) -{ - struct mlx4_ib_xrcd *xrcd; - struct mlx4_ib_dev *mdev = to_mdev(ibdev); - struct ib_pd *pd; - struct ib_cq *cq; - int err; - - if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) - return ERR_PTR(-ENOSYS); - - xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); - if (!xrcd) - return ERR_PTR(-ENOMEM); - - err = mlx4_xrcd_alloc(mdev->dev, &xrcd->xrcdn); - if (err) - goto err_xrcd; - - pd = mlx4_ib_alloc_pd(ibdev, NULL, NULL); - if (IS_ERR(pd)) { - err = PTR_ERR(pd); - goto err_pd; - } - pd->device = ibdev; - - cq = mlx4_ib_create_cq(ibdev, 1, 0, NULL, NULL); - if (IS_ERR(cq)) { - err = PTR_ERR(cq); - goto err_cq; - } - cq->device = ibdev; - cq->comp_handler = mlx4_dummy_comp_handler; - - if (context) - if (ib_copy_to_udata(udata, &xrcd->xrcdn, sizeof(__u32))) { - err = -EFAULT; - goto err_copy; - } - - xrcd->cq = cq; - xrcd->pd = pd; - return &xrcd->ibxrcd; - -err_copy: - mlx4_ib_destroy_cq(cq); -err_cq: - mlx4_ib_dealloc_pd(pd); -err_pd: - mlx4_xrcd_free(mdev->dev, xrcd->xrcdn); -err_xrcd: - kfree(xrcd); - return ERR_PTR(err); -} - -static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) -{ - struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd); - - mlx4_ib_destroy_cq(mxrcd->cq); - mlx4_ib_dealloc_pd(mxrcd->pd); - mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); - kfree(xrcd); - - return 0; -} - - static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); @@ -844,8 +1267,10 @@ static int init_node_data(struct mlx4_ib_dev *dev) init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + if (mlx4_is_master(dev->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; - err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -853,7 +1278,7 @@ static int init_node_data(struct mlx4_ib_dev *dev) in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; - err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; @@ -913,144 +1338,14 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; -/* - * create show function and a device_attribute struct pointing to - * the function for _name - */ -#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ -static ssize_t show_rprt_##_name(struct device *dev, \ - struct device_attribute *attr, \ - char *buf){ \ - return show_diag_rprt(dev, buf, _offset, _op_mod); \ -} \ -static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL); - -#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 - -static size_t show_diag_rprt(struct device *device, char *buf, - u32 offset, u8 op_modifier) -{ - size_t ret; - u32 counter_offset = offset; - u32 diag_counter = 0; - struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, - ib_dev.dev); - - ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, - &counter_offset, &diag_counter); - if (ret) - return ret; - - return sprintf(buf,"%d\n", diag_counter); -} - -static ssize_t clear_diag_counters(struct device *device, - struct device_attribute *attr, - const char *buf, size_t length) -{ - size_t ret; - struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, - ib_dev.dev); - - ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, - NULL, NULL); - if (ret) - return ret; - - return length; -} - -DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_leeoe , 0x10, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_leeoe , 0x14, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_rabrte , 0x7C, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_ieecne , 0x84, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_ieecse , 0x8C, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_rsync , 0x110, 2); -DEVICE_DIAG_RPRT_ATTR(sq_num_rsync , 0x114, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); -DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); -DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); -DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); -DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); - -static DEVICE_ATTR(clear_diag, S_IWUGO, NULL, clear_diag_counters); - -static struct attribute *diag_rprt_attrs[] = { - &dev_attr_rq_num_lle.attr, - &dev_attr_sq_num_lle.attr, - &dev_attr_rq_num_lqpoe.attr, - &dev_attr_sq_num_lqpoe.attr, - &dev_attr_rq_num_leeoe.attr, - &dev_attr_sq_num_leeoe.attr, - &dev_attr_rq_num_lpe.attr, - &dev_attr_sq_num_lpe.attr, - &dev_attr_rq_num_wrfe.attr, - &dev_attr_sq_num_wrfe.attr, - &dev_attr_sq_num_mwbe.attr, - &dev_attr_sq_num_bre.attr, - &dev_attr_rq_num_lae.attr, - &dev_attr_sq_num_rire.attr, - &dev_attr_rq_num_rire.attr, - &dev_attr_sq_num_rae.attr, - &dev_attr_rq_num_rae.attr, - &dev_attr_sq_num_roe.attr, - &dev_attr_sq_num_tree.attr, - &dev_attr_sq_num_rree.attr, - &dev_attr_rq_num_rnr.attr, - &dev_attr_sq_num_rnr.attr, - &dev_attr_sq_num_rabrte.attr, - &dev_attr_sq_num_ieecne.attr, - &dev_attr_sq_num_ieecse.attr, - &dev_attr_rq_num_oos.attr, - &dev_attr_sq_num_oos.attr, - &dev_attr_rq_num_mce.attr, - &dev_attr_rq_num_rsync.attr, - &dev_attr_sq_num_rsync.attr, - &dev_attr_rq_num_udsdprd.attr, - &dev_attr_rq_num_ucsdprd.attr, - &dev_attr_num_cqovf.attr, - &dev_attr_num_eqovf.attr, - &dev_attr_num_baddb.attr, - &dev_attr_clear_diag.attr, - NULL -}; - -struct attribute_group diag_counters_group = { - .name = "diag_counters", - .attrs = diag_rprt_attrs -}; - static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) { #ifdef __linux__ memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); #else - memcpy(eui, IF_LLADDR(dev), 3); - memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); + memcpy(eui, IF_LLADDR(dev), 3); + memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); #endif if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; @@ -1069,11 +1364,10 @@ static void update_gids_task(struct work_struct *work) union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; - struct ib_event event; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { - printk(KERN_WARNING "update gid table failed %ld\n", PTR_ERR(mailbox)); + pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); return; } @@ -1081,25 +1375,19 @@ static void update_gids_task(struct work_struct *work) memcpy(gids, gw->gids, sizeof gw->gids); err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, - 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B); + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); if (err) - printk(KERN_WARNING "set port command failed\n"); + pr_warn("set port command failed\n"); else { memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); - event.device = &gw->dev->ib_dev; - event.element.port_num = gw->port; - event.event = IB_EVENT_GID_CHANGE; - ib_dispatch_event(&event); + mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); kfree(gw); } -enum { - MLX4_MAX_EFF_VLANS = 128 - MLX4_VLAN_REGULAR, -}; - static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) { struct net_device *ndev = dev->iboe.netdevs[port - 1]; @@ -1107,40 +1395,42 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) struct net_device *tmp; int i; u8 *hits; - int ret; union ib_gid gid; - int tofree; + int index_free; int found; int need_update = 0; + int max_gids; u16 vid; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; - hits = kzalloc(MLX4_MAX_EFF_VLANS + 1, GFP_ATOMIC); + hits = kzalloc(128, GFP_ATOMIC); if (!hits) { - ret = -ENOMEM; - goto out; + kfree(work); + return -ENOMEM; } + max_gids = dev->dev->caps.gid_table_len[port]; + #ifdef __linux__ - read_lock(&dev_base_lock); - for_each_netdev(&init_net, tmp) { + rcu_read_lock(); + for_each_netdev_rcu(&init_net, tmp) { #else - IFNET_RLOCK(); - TAILQ_FOREACH(tmp, &V_ifnet, if_link) { + IFNET_RLOCK(); + TAILQ_FOREACH(tmp, &V_ifnet, if_link) { #endif if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); vid = rdma_vlan_dev_vlan_id(tmp); mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); found = 0; - tofree = -1; - for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) { - if (tofree < 0 && + index_free = -1; + for (i = 0; i < max_gids; ++i) { + if (index_free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - tofree = i; + index_free = i; if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { hits[i] = 1; found = 1; @@ -1149,33 +1439,36 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) } if (!found) { - if (tmp == ndev && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) { + if (tmp == ndev && + (memcmp(&dev->iboe.gid_table[port - 1][0], + &gid, sizeof gid) || + !memcmp(&dev->iboe.gid_table[port - 1][0], + &zgid, sizeof gid))) { dev->iboe.gid_table[port - 1][0] = gid; ++need_update; hits[0] = 1; - } else if (tofree >= 0) { - dev->iboe.gid_table[port - 1][tofree] = gid; - hits[tofree] = 1; + } else if (index_free >= 0) { + dev->iboe.gid_table[port - 1][index_free] = gid; + hits[index_free] = 1; ++need_update; } } } -#ifdef __linux__ - } - read_unlock(&dev_base_lock); +#ifdef __linux__ + } + rcu_read_unlock(); #else - } - IFNET_RUNLOCK(); + } + IFNET_RUNLOCK(); #endif - for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) + for (i = 0; i < max_gids; ++i) if (!hits[i]) { if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) ++need_update; dev->iboe.gid_table[port - 1][i] = zgid; } - if (need_update) { memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); INIT_WORK(&work->work, update_gids_task); @@ -1187,10 +1480,6 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) kfree(hits); return 0; - -out: - kfree(work); - return ret; } static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) @@ -1239,7 +1528,8 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { oldnd = iboe->netdevs[port - 1]; - iboe->netdevs[port - 1] = mlx4_get_prot_dev(ibdev->dev, MLX4_PROT_EN, port); + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (oldnd != iboe->netdevs[port - 1]) { if (iboe->netdevs[port - 1]) netdev_added(ibdev, port); @@ -1260,20 +1550,352 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event return NOTIFY_DONE; } +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (mlx4_is_master(ibdev->dev)) { + for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping */ + (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : + ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = + (i) ? 0 : 0xFFFF; + } + } +} + +static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + char name[32]; + int eq_per_port = 0; + int added_eqs = 0; + int total_eqs = 0; + int i, j, eq; + + /* Legacy mode or comp_pool is not large enough */ + if (dev->caps.comp_pool == 0 || + dev->caps.num_ports > dev->caps.comp_pool) + return; + + eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ + dev->caps.num_ports); + + /* Init eq table */ + added_eqs = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + added_eqs += eq_per_port; + + total_eqs = dev->caps.num_comp_vectors + added_eqs; + + ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); + if (!ibdev->eq_table) + return; + + ibdev->eq_added = added_eqs; + + eq = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { + for (j = 0; j < eq_per_port; j++) { + //sprintf(name, "mlx4-ib-%d-%d@%s", + // i, j, dev->pdev->bus->conf.pd_name); + /* Set IRQ for specific name (per ring) */ + if (mlx4_assign_eq(dev, name, + &ibdev->eq_table[eq])) { + /* Use legacy (same as mlx4_en driver) */ + pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); + ibdev->eq_table[eq] = + (eq % dev->caps.num_comp_vectors); + } + eq++; + } + } + + /* Fill the reset of the vector with legacy EQ */ + for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) + ibdev->eq_table[eq++] = i; + + /* Advertise the new number of EQs to clients */ + ibdev->ib_dev.num_comp_vectors = total_eqs; +} + +static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i; + + /* no additional eqs were added */ + if (!ibdev->eq_table) + return; + + /* Reset the advertised EQ number */ + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + + /* Free only the added eqs */ + for (i = 0; i < ibdev->eq_added; i++) { + /* Don't free legacy eqs if used */ + if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) + continue; + mlx4_release_eq(dev, ibdev->eq_table[i]); + } + + kfree(ibdev->eq_table); +} + +/* + * create show function and a device_attribute struct pointing to + * the function for _name + */ +#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ +static ssize_t show_rprt_##_name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf){ \ + return show_diag_rprt(dev, buf, _offset, _op_mod); \ +} \ +static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL); + +#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 + +static size_t show_diag_rprt(struct device *device, char *buf, + u32 offset, u8 op_modifier) +{ + size_t ret; + u32 counter_offset = offset; + u32 diag_counter = 0; + struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, + ib_dev.dev); + + ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, + &counter_offset, &diag_counter); + if (ret) + return ret; + + return sprintf(buf, "%d\n", diag_counter); +} + +static ssize_t clear_diag_counters(struct device *device, + struct device_attribute *attr, + const char *buf, size_t length) +{ + size_t ret; + struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, + ib_dev.dev); + + ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, + NULL, NULL); + if (ret) + return ret; + + return length; +} + +DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); +DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); +DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); +DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); + +static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); + +static struct attribute *diag_rprt_attrs[] = { + &dev_attr_rq_num_lle.attr, + &dev_attr_sq_num_lle.attr, + &dev_attr_rq_num_lqpoe.attr, + &dev_attr_sq_num_lqpoe.attr, + &dev_attr_rq_num_lpe.attr, + &dev_attr_sq_num_lpe.attr, + &dev_attr_rq_num_wrfe.attr, + &dev_attr_sq_num_wrfe.attr, + &dev_attr_sq_num_mwbe.attr, + &dev_attr_sq_num_bre.attr, + &dev_attr_rq_num_lae.attr, + &dev_attr_sq_num_rire.attr, + &dev_attr_rq_num_rire.attr, + &dev_attr_sq_num_rae.attr, + &dev_attr_rq_num_rae.attr, + &dev_attr_sq_num_roe.attr, + &dev_attr_sq_num_tree.attr, + &dev_attr_sq_num_rree.attr, + &dev_attr_rq_num_rnr.attr, + &dev_attr_sq_num_rnr.attr, + &dev_attr_rq_num_oos.attr, + &dev_attr_sq_num_oos.attr, + &dev_attr_rq_num_mce.attr, + &dev_attr_rq_num_udsdprd.attr, + &dev_attr_rq_num_ucsdprd.attr, + &dev_attr_num_cqovf.attr, + &dev_attr_num_eqovf.attr, + &dev_attr_num_baddb.attr, + &dev_attr_clear_diag.attr, + NULL +}; + +static struct attribute_group diag_counters_group = { + .name = "diag_counters", + .attrs = diag_rprt_attrs +}; + +#ifdef __linux__ +static int mlx4_ib_proc_init(void) +{ + /* Creating procfs directories /proc/drivers/mlx4_ib/ && + /proc/drivers/mlx4_ib/mrs for further use by the driver. + */ + int err; + + mlx4_ib_driver_dir_entry = proc_mkdir(MLX4_IB_DRIVER_PROC_DIR_NAME, + NULL); + if (!mlx4_ib_driver_dir_entry) { + pr_err("mlx4_ib_proc_init has failed for %s\n", + MLX4_IB_DRIVER_PROC_DIR_NAME); + err = -ENODEV; + goto error; + } + + mlx4_mrs_dir_entry = proc_mkdir(MLX4_IB_MRS_PROC_DIR_NAME, + mlx4_ib_driver_dir_entry); + if (!mlx4_mrs_dir_entry) { + pr_err("mlx4_ib_proc_init has failed for %s\n", + MLX4_IB_MRS_PROC_DIR_NAME); + err = -ENODEV; + goto remove_entry; + } + + return 0; + +remove_entry: + remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, + NULL); +error: + return err; +} +#endif + +static void init_dev_assign(void) +{ + int bus, slot, fn, ib_idx; + char *p = dev_assign_str, *t; + char curr_val[32] = {0}; + int ret; + int j, i = 0; + + memset(dr, 0, sizeof dr); + + if (dev_assign_str[0] == 0) + return; + + while (strlen(p)) { + ret = sscanf(p, "%02x:%02x.%x-%x", &bus, &slot, &fn, &ib_idx); + if (ret != 4 || ib_idx < 0) + goto err; + + for (j = 0; j < i; j++) + if (dr[j].nr == ib_idx) + goto err; + + dr[i].bus = bus; + dr[i].dev = slot; + dr[i].func = fn; + dr[i].nr = ib_idx; + + t = strchr(p, ','); + sprintf(curr_val, "%02x:%02x.%x-%x", bus, slot, fn, ib_idx); + if ((!t) && strlen(p) == strlen(curr_val)) + return; + + if (!t || (t + 1) >= dev_assign_str + sizeof dev_assign_str) + goto err; + + ++i; + if (i >= MAX_DR) + goto err; + + p = t + 1; + } + + return; +err: + memset(dr, 0, sizeof dr); + printk(KERN_WARNING "mlx4_ib: The value of 'dev_assign_str' parameter " + "is incorrect. The parameter value is discarded!"); +} + +static int mlx4_ib_dev_idx(struct mlx4_dev *dev) +{ + int /*bus,*/ slot, fn; + int i; + + if (!dev) + return -1; + else if (!dev->pdev) + return -1; + //else if (!dev->pdev->bus) + // return -1; + + //bus = dev->pdev->bus->conf.pc_sel.pc_bus; + slot = PCI_SLOT(dev->pdev->devfn); + fn = PCI_FUNC(dev->pdev->devfn); + + for (i = 0; i < MAX_DR; ++i) { + if (/*dr[i].bus == bus &&*/ + dr[i].dev == slot && + dr[i].func == fn) { + return dr[i].nr; + } + } + + return -1; +} + static void *mlx4_ib_add(struct mlx4_dev *dev) { - static int mlx4_ib_version_printed; struct mlx4_ib_dev *ibdev; int num_ports = 0; - int i; + int i, j; int err; struct mlx4_ib_iboe *iboe; - int k; + int dev_idx; - if (!mlx4_ib_version_printed) { - printk(KERN_INFO "%s", mlx4_ib_version); - ++mlx4_ib_version_printed; - } + printk(KERN_INFO "%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; @@ -1296,14 +1918,22 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; - ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!ibdev->priv_uar.map) goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + dev_idx = mlx4_ib_dev_idx(dev); + if (dev_idx >= 0) + sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); + else + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; @@ -1312,7 +1942,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -1334,6 +1968,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_FLOW) | + (1ull << IB_USER_VERBS_CMD_DETACH_FLOW) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); ibdev->ib_dev.query_device = mlx4_ib_query_device; @@ -1345,7 +1984,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.modify_port = mlx4_ib_modify_port; ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; +#ifdef __linux__ ibdev->ib_dev.mmap = mlx4_ib_mmap; + ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; +#endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; @@ -1376,87 +2018,139 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; + ibdev->ib_dev.attach_flow = mlx4_ib_flow_attach; + ibdev->ib_dev.detach_flow = mlx4_ib_flow_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; - ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; - ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; - ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; - ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { - ibdev->ib_dev.create_xrc_srq = mlx4_ib_create_xrc_srq; - ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; - ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; - ibdev->ib_dev.create_xrc_rcv_qp = mlx4_ib_create_xrc_rcv_qp; - ibdev->ib_dev.modify_xrc_rcv_qp = mlx4_ib_modify_xrc_rcv_qp; - ibdev->ib_dev.query_xrc_rcv_qp = mlx4_ib_query_xrc_rcv_qp; - ibdev->ib_dev.reg_xrc_rcv_qp = mlx4_ib_reg_xrc_rcv_qp; - ibdev->ib_dev.unreg_xrc_rcv_qp = mlx4_ib_unreg_xrc_rcv_qp; - ibdev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_CREATE_XRC_SRQ) | - (1ull << IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN) | - (1ull << IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN) | - (1ull << IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP) | - (1ull << IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP) | - (1ull << IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP) | - (1ull << IB_USER_VERBS_CMD_REG_XRC_RCV_QP) | - (1ull << IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP); + if (!mlx4_is_slave(ibdev->dev)) { + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); + if (init_node_data(ibdev)) goto err_map; - for (k = 0; k < ibdev->num_ports; ++k) { - err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[k]); - if (err) - ibdev->counters[k] = -1; - else - mlx4_set_iboe_counter(dev, ibdev->counters[k], k + 1); + for (i = 0; i < ibdev->num_ports; ++i) { + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]); + if (err) + ibdev->counters[i] = -1; + } else + ibdev->counters[i] = -1; } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); - mutex_init(&ibdev->xrc_reg_mutex); - if (ib_register_device(&ibdev->ib_dev)) - goto err_counter; + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && + !mlx4_is_slave(dev)) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) { + dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + goto err_steer_qp_release; + } + + bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } + + if (ib_register_device(&ibdev->ib_dev, NULL)) + goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; + + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) - goto err_reg; + goto err_sriov; } - for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) { + + for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, - mlx4_class_attributes[i])) + mlx4_class_attributes[j])) goto err_notif; } - - if(sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) + if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notif; - ibdev->ib_active = 1; + ibdev->ib_active = true; + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); + + /* create paravirt contexts for any VFs which are active */ + if (mlx4_is_master(ibdev->dev)) { + for (j = 0; j < MLX4_MFUNC_MAX; j++) { + if (j == mlx4_master_func_num(ibdev->dev)) + continue; + if (mlx4_is_slave_active(ibdev->dev, j)) + do_slave_init(ibdev, j, 1); + } + } return ibdev; err_notif: if (unregister_netdevice_notifier(&ibdev->iboe.nb)) - printk(KERN_WARNING "failure unregistering notifier\n"); + pr_warn("failure unregistering notifier\n"); flush_workqueue(wq); +err_sriov: + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + err_reg: ib_unregister_device(&ibdev->ib_dev); +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); err_counter: - for (; k; --k) - mlx4_counter_free(ibdev->dev, ibdev->counters[k - 1]); + for (; i; --i) + if (ibdev->counters[i - 1] != -1) + mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]); err_map: iounmap(ibdev->priv_uar.map); + mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); @@ -1470,73 +2164,215 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) return NULL; } +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + struct ib_flow_spec spec = { + .type = IB_FLOW_IB_UC, + .l2_id.ib_uc.qpn = mqp->ibqp.qp_num, + }; + + return is_attach ? + __mlx4_ib_flow_attach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0) + : __mlx4_ib_flow_detach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0); +} + static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; - int p; - int k; + int p,j; + mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); - mlx4_ib_mad_cleanup(ibdev); + + for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { + device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); + } + ib_unregister_device(&ibdev->ib_dev); - for (k = 0; k < ibdev->num_ports; ++k) - mlx4_counter_free(ibdev->dev, ibdev->counters[k]); + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + } if (ibdev->iboe.nb.notifier_call) { - unregister_netdevice_notifier(&ibdev->iboe.nb); - flush_workqueue(wq); + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); - + for (p = 0; p < ibdev->num_ports; ++p) + if (ibdev->counters[p] != -1) + mlx4_counter_free(ibdev->dev, ibdev->counters[p]); mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); + mlx4_ib_free_eqs(dev, ibdev); + mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + + if (!mlx4_is_master(dev)) + return; + + dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); + if (!dm) { + pr_err("failed to allocate memory for tunneling qp update\n"); + goto out; + } + + for (i = 0; i < dev->caps.num_ports; i++) { + dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + pr_err("failed to allocate memory for tunneling qp update work struct\n"); + for (i = 0; i < dev->caps.num_ports; i++) { + if (dm[i]) + kfree(dm[i]); + } + goto out; + } + } + /* initialize or tear down tunnel QPs for the slave */ + for (i = 0; i < dev->caps.num_ports; i++) { + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = i + 1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } +out: + if (dm) + kfree(dm); + return; +} + static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, - enum mlx4_dev_event event, int port) + enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct mlx4_eqe *eqe = NULL; + struct ib_event_work *ew; + int p = 0; - if (port > ibdev->num_ports) - return; + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_eqe *)param; + else + p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: + if (p > ibdev->num_ports) + return; + if (mlx4_is_master(dev) && + rdma_port_get_link_layer(&ibdev->ib_dev, p) == + IB_LINK_LAYER_INFINIBAND) { + mlx4_ib_invalidate_all_guid_record(ibdev, p); + } + mlx4_ib_info((struct ib_device *) ibdev_ptr, + "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: + if (p > ibdev->num_ports) + return; + mlx4_ib_info((struct ib_device *) ibdev_ptr, + "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: - ibdev->ib_active = 0; + ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) { + pr_err("failed to allocate memory for events work\n"); + break; + } + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + /* need to queue only for port owner, which uses GEN_EQE */ + if (mlx4_is_master(dev)) + queue_work(wq, &ew->work); + else + handle_port_mgmt_change_event(&ew->work); + return; + + case MLX4_DEV_EVENT_SLAVE_INIT: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 1); + return; + + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 0); + return; + default: return; } ibev.device = ibdev_ptr; - ibev.element.port_num = port; + ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { - .add = mlx4_ib_add, - .remove = mlx4_ib_remove, - .event = mlx4_ib_event, - .get_prot_dev = get_ibdev, - .protocol = MLX4_PROT_IB, + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) @@ -1547,22 +2383,55 @@ static int __init mlx4_ib_init(void) if (!wq) return -ENOMEM; +#ifdef __linux__ + err = mlx4_ib_proc_init(); + if (err) + goto clean_wq; +#endif + + err = mlx4_ib_mcg_init(); + if (err) + goto clean_proc; + + init_dev_assign(); + err = mlx4_register_interface(&mlx4_ib_interface); - if (err) { - destroy_workqueue(wq); - return err; - } + if (err) + goto clean_mcg; return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_proc: +#ifdef __linux__ + remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, + mlx4_ib_driver_dir_entry); + remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); + +clean_wq: +#endif + destroy_workqueue(wq); + return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); destroy_workqueue(wq); + + /* Remove proc entries */ +#ifdef __linux__ + remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, + mlx4_ib_driver_dir_entry); + remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); +#endif + } -module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); +module_init(mlx4_ib_init); module_exit(mlx4_ib_cleanup); #undef MODULE_VERSION @@ -1572,9 +2441,12 @@ mlx4ib_evhand(module_t mod, int event, void *arg) { return (0); } + static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; + DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); +MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c b/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 000000000000..5489323df048 --- /dev/null +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1254 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) +#define mcg_error(fmt, arg...) pr_err(fmt, ##arg) +#define mcg_warn_group(group, format, arg...) \ + pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) \ + pr_err(" %16s: " format, (group)->name, ## arg) + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +}; + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do {\ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + } while (0) + +static const char *get_state_string(enum mcast_group_state state) +{ + switch (state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_ah_attr ah_attr; + + spin_lock(&dev->sm_lock); + if (!dev->sm_ah[ctx->port - 1]) { + /* port is not yet Active, sm_ah not ready */ + spin_unlock(&dev->sm_lock); + return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock(&dev->sm_lock); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, + IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); +} + +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct ib_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = ah_attr.dlid; /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + /* we rely on a mad request as arrived from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + group->state = MCAST_IDLE; + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if (group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + if (nzgroup) + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 7); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) + return MAD_STATUS_REQ_INVALID; + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) + return MAD_STATUS_REQ_INVALID; + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + + group = container_of(delay, typeof(*group), timeout_work); + + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 7) + group->rec.scope_join_state &= 0xf8; + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + queue_work(group->demux->mcg_wq, &group->work); + safe_atomic_dec(&group->refcount); + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = group->rec.scope_join_state & 7; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n", + (long long unsigned int)be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + (long long unsigned int)be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) + ++rc; + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *) + group->response_sa_mad.data)->scope_join_state & 7; + cur_join_state = group->rec.scope_join_state & 7; + + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. */ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + req_join_state = sa_data->scope_join_state & 0x7; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) + goto process_requests; + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group; + struct mcast_req *req; + struct list_head *pos; + struct list_head *n; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { + group = list_entry(pos, struct mcast_group, mgid0_list); + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + (long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix), + (long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. Silently cleaning the new one */ + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create, + gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + (long long unsigned int)be64_to_cpu(group->rec.mgid.global.subnet_prefix), + (long long unsigned int)be64_to_cpu(group->rec.mgid.global.interface_id)); + sysfs_attr_init(&group->dentry.attr); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + queue_work(group->demux->mcg_wq, &group->work); + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch (mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) + return 1; + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + queue_work(ctx->mcg_wq, &group->work); + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + case IB_SA_METHOD_DELETE: + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + (long long unsigned int)be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + sprintf(pending_str, "Yes(TID=0x%llx)", + (long long unsigned int)be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = create_singlethread_workqueue(name); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp + ; + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + msleep(1); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + mcg_warn("failed allocating work for cleanup\n"); + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) { + mcg_warn_group(group, "failed allocation - may leave stall groups\n"); + return -ENOMEM; + } + + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = create_singlethread_workqueue("mlx4_ib_mcg"); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + destroy_workqueue(clean_wq); +} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h index b8f6996c278a..ffd293676077 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -37,38 +37,50 @@ #include #include #include +#include #include #include +#include +#include #include #include - +#include #define MLX4_IB_DRV_NAME "mlx4_ib" -#ifdef CONFIG_MLX4_DEBUG -extern int mlx4_ib_debug_level; +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ -#define mlx4_ib_dbg(format, arg...) \ - do { \ - if (mlx4_ib_debug_level) \ - printk(KERN_DEBUG "<" MLX4_IB_DRV_NAME "> %s: " format "\n",\ - __func__, ## arg); \ - } while (0) +#define mlx4_ib_warn(ibdev, format, arg...) \ + dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) -#else /* CONFIG_MLX4_DEBUG */ - -#define mlx4_ib_dbg(format, arg...) do {} while (0) - -#endif /* CONFIG_MLX4_DEBUG */ +#define mlx4_ib_info(ibdev, format, arg...) \ + dev_info((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) enum { - MLX4_IB_SQ_MIN_WQE_SHIFT = 6 + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 }; -#define MLX4_IB_SQ_HEADROOM(shift) ((2048 >> (shift)) + 1) -#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + +/*module param to indicate if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; +#ifdef __linux__ +extern struct proc_dir_entry *mlx4_mrs_dir_entry; +#endif + +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS (256 * 1024) + + +#define MLX4_IB_MMAP_CMD_MASK 0xFF +#define MLX4_IB_MMAP_CMD_BITS 8 struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; @@ -83,15 +95,16 @@ struct mlx4_ib_pd { }; struct mlx4_ib_xrcd { - struct ib_xrcd ibxrcd; - u32 xrcdn; - struct ib_pd *pd; - struct ib_cq *cq; + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; }; struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; + int entry_size; }; struct mlx4_ib_cq_resize { @@ -99,6 +112,11 @@ struct mlx4_ib_cq_resize { int cqe; }; +struct mlx4_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + struct mlx4_ib_cq { struct ib_cq ibcq; struct mlx4_cq mcq; @@ -115,6 +133,7 @@ struct mlx4_ib_mr { struct ib_mr ibmr; struct mlx4_mr mmr; struct ib_umem *umem; + struct mlx4_shared_mr_info *smr_info; }; struct mlx4_ib_fast_reg_page_list { @@ -141,18 +160,127 @@ struct mlx4_ib_wq { }; enum mlx4_ib_qp_flags { - MLX4_IB_QP_LSO = 1 << 0, - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1, - MLX4_IB_XRC_RCV = 1 << 2, + MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, }; -struct gid_entry { +struct mlx4_ib_gid_entry { struct list_head list; union ib_gid gid; int added; u8 port; }; +enum mlx4_ib_mmap_cmd { + MLX4_IB_MMAP_UAR_PAGE = 0, + MLX4_IB_MMAP_BLUE_FLAME_PAGE = 1, + MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES = 2, +}; + +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. + */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, + MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, + MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, + MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, + + MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, + MLX4_IB_QPT_PROXY_SMI = 1 << 17, + MLX4_IB_QPT_PROXY_GSI = 1 << 18, + MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, + MLX4_IB_QPT_TUN_SMI = 1 << 20, + MLX4_IB_QPT_TUN_GSI = 1 << 21, +}; + +#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ + MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) + +enum mlx4_ib_mad_ifc_flags { + MLX4_MAD_IFC_IGNORE_MKEY = 1, + MLX4_MAD_IFC_IGNORE_BKEY = 2, + MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | + MLX4_MAD_IFC_IGNORE_BKEY), + MLX4_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + __be32 remote_qpn; + __be32 qkey; + __be16 vlan; + u8 mac[6]; + __be16 pkey_index; + u8 reserved[6]; +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +struct mlx4_rcv_tunnel_hdr { + __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: + * 0x0 - no vlan was in the packet + * 0x01 - C-VLAN was in the packet */ + u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ + u8 reserved; + __be16 pkey_index; + __be16 sl_vid; + __be16 slid_mac_47_32; + __be32 mac_31_0; +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __packed; + +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + +struct mlx4_ib_qpg_data { + unsigned long *tss_bitmap; + unsigned long *rss_bitmap; + struct mlx4_ib_qp *qpg_parent; + int tss_qpn_base; + int rss_qpn_base; + u32 tss_child_count; + u32 rss_child_count; + u32 qpg_tss_mask_sz; +}; + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -168,14 +296,13 @@ struct mlx4_ib_qp { int sq_spare_wqes; struct mlx4_ib_wq sq; + enum mlx4_ib_qp_type mlx4_ib_qp_type; struct ib_umem *umem; struct mlx4_mtt mtt; int buf_size; struct mutex mutex; - u32 flags; - struct list_head xrc_reg_list; - spinlock_t xrc_reg_list_lock; u16 xrcdn; + u32 flags; u8 port; u8 alt_port; u8 atomic_rd_en; @@ -183,9 +310,16 @@ struct mlx4_ib_qp { u8 sq_no_prefetch; u8 state; int mlx_type; + enum ib_qpg_type qpg_type; + struct mlx4_ib_qpg_data *qpg_data; struct list_head gid_list; - int max_inline_data; - struct mlx4_bf bf; + struct list_head steering_rules; + struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + struct list_head rules_list; + int max_inline_data; + struct mlx4_bf bf; }; struct mlx4_ib_srq { @@ -208,6 +342,138 @@ struct mlx4_ib_ah { union mlx4_ext_av av; }; +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 +#define MLX4_NOT_SET_GUID (0x00LL) +#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) + +enum mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, + MLX4_GUID_INFO_STATUS_PENDING, +}; + +enum mlx4_guid_alias_rec_ownership { + MLX4_GUID_DRIVER_ASSIGN, + MLX4_GUID_SYSADMIN_ASSIGN, + MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ +}; + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ + u8 method; /*set or delete*/ + enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/ +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client *sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_mr *mr; + struct work_struct work; + struct workqueue_struct *wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + __be64 subnet_prefix; + __be64 guid_cache[128]; + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + atomic_t tid; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /* when using this spinlock you should use "irq" because + * it may be called from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; + + struct mlx4_sriov_alias_guid alias_guid; + + /* CM paravirtualization fields */ + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; +}; + struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; @@ -215,6 +481,42 @@ struct mlx4_ib_iboe { union ib_gid gid_table[MLX4_MAX_PORTS][128]; }; +struct pkey_mgt { + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject *device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; +}; + struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; @@ -226,12 +528,35 @@ struct mlx4_ib_dev { struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; struct ib_ah *sm_ah[MLX4_MAX_PORTS]; spinlock_t sm_lock; + struct mlx4_ib_sriov sriov; struct mutex cap_mask_mutex; - struct mutex xrc_reg_mutex; - int ib_active; + bool ib_active; struct mlx4_ib_iboe iboe; int counters[MLX4_MAX_PORTS]; + int *eq_table; + int eq_added; + struct kobject *iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_eqe ib_eqe; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) @@ -303,6 +628,9 @@ static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) return container_of(ibah, struct mlx4_ib_ah, ibah); } +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); @@ -310,9 +638,12 @@ void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db) struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem); +int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, + u64 start_va, + int *num_of_mtts); struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, - struct ib_udata *udata); + struct ib_udata *udata, int mr_id); int mlx4_ib_dereg_mr(struct ib_mr *mr); struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); @@ -322,6 +653,7 @@ void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, struct ib_ucontext *context, struct ib_udata *udata); @@ -338,11 +670,6 @@ int mlx4_ib_destroy_ah(struct ib_ah *ah); struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); -struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, - struct ib_cq *xrc_cq, - struct ib_xrcd *xrcd, - struct ib_srq_init_attr *init_attr, - struct ib_udata *udata); int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); @@ -364,7 +691,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); -int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad); int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -379,20 +706,20 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova); int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); -int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr, - u32 *qp_num); -int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num, - struct ib_qp_attr *attr, int attr_mask); -int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num, - struct ib_qp_attr *attr, int attr_mask, - struct ib_qp_init_attr *init_attr); -int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num); -int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num); +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view); +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view); +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view); int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast, u8 port); +int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, + union mlx4_counter *counter, u8 clear); + static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; @@ -403,7 +730,73 @@ static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) return !!(ah->av.ib.g_slid & 0x80); } +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); + int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid); +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type); + +void mlx4_ib_tunnels_update_work(struct work_struct *work); + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad, int is_eth); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); + +/* alias guid support */ +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); + +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, + u8 port_num, u8 *p_data); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data); + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +__be64 mlx4_ib_gen_node_guid(void); + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); + #endif /* MLX4_IB_H */ diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c index c49b46022278..24d95200e9b7 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c @@ -31,6 +31,15 @@ * SOFTWARE. */ +#include +#include +#include + +#ifdef __linux__ +#include +#include +#endif + #include "mlx4_ib.h" static u32 convert_access(int acc) @@ -41,13 +50,67 @@ static u32 convert_access(int acc) (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | MLX4_PERM_LOCAL_READ; } +#ifdef __linux__ +static ssize_t shared_mr_proc_read(struct file *file, + char __user *buffer, + size_t len, + loff_t *offset) +{ + return -ENOSYS; + +} + +static ssize_t shared_mr_proc_write(struct file *file, + const char __user *buffer, + size_t len, + loff_t *offset) +{ + + return -ENOSYS; +} + +static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma) +{ + + struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode); + struct mlx4_shared_mr_info *smr_info = + (struct mlx4_shared_mr_info *)pde->data; + + /* Prevent any mapping not on start of area */ + if (vma->vm_pgoff != 0) + return -EINVAL; + + return ib_umem_map_to_vma(smr_info->umem, + vma); + +} + +static const struct file_operations shared_mr_proc_ops = { + .owner = THIS_MODULE, + .read = shared_mr_proc_read, + .write = shared_mr_proc_write, + .mmap = shared_mr_mmap +}; + +static mode_t convert_shared_access(int acc) +{ + + return (acc & IB_ACCESS_SHARED_MR_USER_READ ? S_IRUSR : 0) | + (acc & IB_ACCESS_SHARED_MR_USER_WRITE ? S_IWUSR : 0) | + (acc & IB_ACCESS_SHARED_MR_GROUP_READ ? S_IRGRP : 0) | + (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE ? S_IWGRP : 0) | + (acc & IB_ACCESS_SHARED_MR_OTHER_READ ? S_IROTH : 0) | + (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE ? S_IWOTH : 0); + +} +#endif struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx4_ib_mr *mr; int err; - mr = kmalloc(sizeof *mr, GFP_KERNEL); + mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -74,118 +137,350 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) return ERR_PTR(err); } +static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, + struct mlx4_mtt *mtt, + u64 mtt_size, + u64 mtt_shift, + u64 len, + u64 cur_start_addr, + u64 *pages, + int *start_index, + int *npages) +{ + int k; + int err = 0; + u64 mtt_entries; + u64 cur_end_addr = cur_start_addr + len; + u64 cur_end_addr_aligned = 0; + + len += (cur_start_addr & (mtt_size-1ULL)); + cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); + len += (cur_end_addr_aligned - cur_end_addr); + if (len & (mtt_size-1ULL)) { + WARN(1 , + "write_block: len %llx is not aligned to mtt_size %llx\n", + len, mtt_size); + return -EINVAL; + } + + + mtt_entries = (len >> mtt_shift); + + /* Align the MTT start address to + the mtt_size. + Required to handle cases when the MR + starts in the middle of an MTT record. + Was not required in old code since + the physical addresses provided by + the dma subsystem were page aligned, + which was also the MTT size. + */ + cur_start_addr = round_down(cur_start_addr, mtt_size); + /* A new block is started ...*/ + for (k = 0; k < mtt_entries; ++k) { + pages[*npages] = cur_start_addr + (mtt_size * k); + (*npages)++; + /* + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. + */ + if (*npages == PAGE_SIZE / sizeof(u64)) { + err = mlx4_write_mtt(dev->dev, + mtt, *start_index, + *npages, pages); + if (err) + return err; + + (*start_index) += *npages; + *npages = 0; + } + } + + return 0; +} + int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { u64 *pages; struct ib_umem_chunk *chunk; - int i, j, k; - int n; - int len; + int j; + u64 len = 0; int err = 0; + u64 mtt_size; + u64 cur_start_addr = 0; + u64 mtt_shift; + int start_index = 0; + int npages = 0; pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) return -ENOMEM; - i = n = 0; + mtt_shift = mtt->page_shift; + mtt_size = 1ULL << mtt_shift; list_for_each_entry(chunk, &umem->chunk_list, list) for (j = 0; j < chunk->nmap; ++j) { - len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(&chunk->page_list[j]) + - umem->page_size * k; - /* - * Be friendly to mlx4_write_mtt() and - * pass it chunks of appropriate size. - */ - if (i == PAGE_SIZE / sizeof (u64)) { - err = mlx4_write_mtt(dev->dev, mtt, n, - i, pages); - if (err) - goto out; - n += i; - i = 0; - } + if (cur_start_addr + len == + sg_dma_address(&chunk->page_list[j])) { + /* still the same block */ + len += sg_dma_len(&chunk->page_list[j]); + continue; } + /* A new block is started ...*/ + /* If len is malaligned, write an extra mtt entry to + cover the misaligned area (round up the division) + */ + err = mlx4_ib_umem_write_mtt_block(dev, + mtt, mtt_size, mtt_shift, + len, cur_start_addr, + pages, + &start_index, + &npages); + if (err) + goto out; + + cur_start_addr = + sg_dma_address(&chunk->page_list[j]); + len = sg_dma_len(&chunk->page_list[j]); } - if (i) - err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); + /* Handle the last block */ + if (len > 0) { + /* If len is malaligned, write an extra mtt entry to cover + the misaligned area (round up the division) + */ + err = mlx4_ib_umem_write_mtt_block(dev, + mtt, mtt_size, mtt_shift, + len, cur_start_addr, + pages, + &start_index, + &npages); + if (err) + goto out; + } + + + if (npages) + err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); out: free_page((unsigned long) pages); return err; } -static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr, - u64 start, u64 virt_addr, int access_flags) +static inline u64 alignment_of(u64 ptr) +{ + return ilog2(ptr & (~(ptr-1))); +} + +static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, + u64 current_block_end, + u64 block_shift) +{ + /* Check whether the alignment of the new block + is aligned as well as the previous block. + Block address must start with zeros till size of entity_size. + */ + if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) + /* It is not as well aligned as the + previous block-reduce the mtt size + accordingly. + Here we take the last right bit + which is 1. + */ + block_shift = alignment_of(next_block_start); + + /* Check whether the alignment of the + end of previous block - is it aligned + as well as the start of the block + */ + if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) + /* It is not as well aligned as + the start of the block - reduce the + mtt size accordingly. + */ + block_shift = alignment_of(current_block_end); + + return block_shift; +} + +/* Calculate optimal mtt size based on contiguous pages. +* Function will return also the number of pages that are not aligned to the + calculated mtt_size to be added to total number + of pages. For that we should check the first chunk length & last chunk + length and if not aligned to mtt_size we should increment + the non_aligned_pages number. + All chunks in the middle already handled as part of mtt shift calculation + for both their start & end addresses. +*/ +int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, + u64 start_va, + int *num_of_mtts) { -#if defined(CONFIG_HUGETLB_PAGE) && !defined(__powerpc__) && !defined(__ia64__) - struct mlx4_ib_dev *dev = to_mdev(pd->device); struct ib_umem_chunk *chunk; - unsigned dsize; - dma_addr_t daddr; - unsigned cur_size = 0; - dma_addr_t uninitialized_var(cur_addr); - int n; - struct ib_umem *umem = mr->umem; - u64 *arr; - int err = 0; - int i; - int j = 0; - int off = start & (HPAGE_SIZE - 1); + int j; + u64 block_shift = MLX4_MAX_MTT_SHIFT; + u64 current_block_len = 0; + u64 current_block_start = 0; + u64 misalignment_bits; + u64 first_block_start = 0; + u64 last_block_end = 0; + u64 total_len = 0; + u64 last_block_aligned_end = 0; + u64 min_shift = ilog2(umem->page_size); - n = DIV_ROUND_UP(off + umem->length, HPAGE_SIZE); - arr = kmalloc(n * sizeof *arr, GFP_KERNEL); - if (!arr) - return -ENOMEM; - - list_for_each_entry(chunk, &umem->chunk_list, list) - for (i = 0; i < chunk->nmap; ++i) { - daddr = sg_dma_address(&chunk->page_list[i]); - dsize = sg_dma_len(&chunk->page_list[i]); - if (!cur_size) { - cur_addr = daddr; - cur_size = dsize; - } else if (cur_addr + cur_size != daddr) { - err = -EINVAL; - goto out; - } else - cur_size += dsize; - - if (cur_size > HPAGE_SIZE) { - err = -EINVAL; - goto out; - } else if (cur_size == HPAGE_SIZE) { - cur_size = 0; - arr[j++] = cur_addr; - } + list_for_each_entry(chunk, &umem->chunk_list, list) { + /* Initialization - save the first chunk start as + the current_block_start - block means contiguous pages. + */ + if (current_block_len == 0 && current_block_start == 0) { + first_block_start = current_block_start = + sg_dma_address(&chunk->page_list[0]); + /* Find the bits that are different between + the physical address and the virtual + address for the start of the MR. + */ + /* umem_get aligned the start_va to a page + boundry. Therefore, we need to align the + start va to the same boundry */ + /* misalignment_bits is needed to handle the + case of a single memory region. In this + case, the rest of the logic will not reduce + the block size. If we use a block size + which is bigger than the alignment of the + misalignment bits, we might use the virtual + page number instead of the physical page + number, resulting in access to the wrong + data. */ + misalignment_bits = + (start_va & (~(((u64)(umem->page_size))-1ULL))) + ^ current_block_start; + block_shift = min(alignment_of(misalignment_bits) + , block_shift); } - if (cur_size) { - arr[j++] = cur_addr; + /* Go over the scatter entries in the current chunk, check + if they continue the previous scatter entry. + */ + for (j = 0; j < chunk->nmap; ++j) { + u64 next_block_start = + sg_dma_address(&chunk->page_list[j]); + u64 current_block_end = current_block_start + + current_block_len; + /* If we have a split (non-contig.) between two block*/ + if (current_block_end != next_block_start) { + block_shift = mlx4_ib_umem_calc_block_mtt( + next_block_start, + current_block_end, + block_shift); + + /* If we reached the minimum shift for 4k + page we stop the loop. + */ + if (block_shift <= min_shift) + goto end; + + /* If not saved yet we are in first block - + we save the length of first block to + calculate the non_aligned_pages number at + * the end. + */ + total_len += current_block_len; + + /* Start a new block */ + current_block_start = next_block_start; + current_block_len = + sg_dma_len(&chunk->page_list[j]); + continue; + } + /* The scatter entry is another part of + the current block, increase the block size + * An entry in the scatter can be larger than + 4k (page) as of dma mapping + which merge some blocks together. + */ + current_block_len += + sg_dma_len(&chunk->page_list[j]); + } } - err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, umem->length, - convert_access(access_flags), n, HPAGE_SHIFT, &mr->mmr); - if (err) - goto out; + /* Account for the last block in the total len */ + total_len += current_block_len; + /* Add to the first block the misalignment that it suffers from.*/ + total_len += (first_block_start & ((1ULL<dev, &mr->mmr.mtt, 0, n, arr); + WARN((total_len & ((1ULL<> block_shift; +end: + if (block_shift < min_shift) { + /* If shift is less than the min we set a WARN and + return the min shift. + */ + WARN(1, + "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n", + block_shift); + + block_shift = min_shift; + } + return block_shift; } +#ifdef __linux__ +static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id) +{ + struct proc_dir_entry *mr_proc_entry; + mode_t mode = S_IFREG; + char name_buff[16]; + + mode |= convert_shared_access(access_flags); + sprintf(name_buff, "%X", mr_id); + mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL); + mr->smr_info->mr_id = mr_id; + mr->smr_info->umem = mr->umem; + + mr_proc_entry = proc_create_data(name_buff, mode, + mlx4_mrs_dir_entry, + &shared_mr_proc_ops, + mr->smr_info); + + if (!mr_proc_entry) { + pr_err("prepare_shared_mr failed via proc\n"); + kfree(mr->smr_info); + return -ENODEV; + } + + current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid)); + mr_proc_entry->size = mr->umem->length; + return 0; + +} +static int is_shared_mr(int access_flags) +{ + /* We should check whether IB_ACCESS_SHARED_MR_USER_READ or + other shared bits were turned on. + */ + return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ | + IB_ACCESS_SHARED_MR_USER_WRITE | + IB_ACCESS_SHARED_MR_GROUP_READ | + IB_ACCESS_SHARED_MR_GROUP_WRITE | + IB_ACCESS_SHARED_MR_OTHER_READ | + IB_ACCESS_SHARED_MR_OTHER_WRITE)); + +} +#endif + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, - struct ib_udata *udata) + struct ib_udata *udata, + int mr_id) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mr *mr; @@ -193,38 +488,49 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err; int n; - mr = kmalloc(sizeof *mr, GFP_KERNEL); + mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags, 0); + access_flags, 0); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; } - if (!mr->umem->hugetlb || - handle_hugetlb_user_mr(pd, mr, start, virt_addr, access_flags)) { - n = ib_umem_page_count(mr->umem); - shift = ilog2(mr->umem->page_size); + n = ib_umem_page_count(mr->umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, + &n); + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; - err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, - convert_access(access_flags), n, shift, &mr->mmr); - if (err) - goto err_umem; - - err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); - if (err) - goto err_mr; - } + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; err = mlx4_mr_enable(dev->dev, &mr->mmr); if (err) goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; +#ifdef __linux__ + /* Check whether MR should be shared */ + if (is_shared_mr(access_flags)) { + /* start address and length must be aligned to page size in order + to map a full page and preventing leakage of data */ + if (mr->umem->offset || (length & ~PAGE_MASK)) { + err = -EINVAL; + goto err_mr; + } + err = prepare_shared_mr(mr, access_flags, mr_id); + if (err) + goto err_mr; + } +#endif return &mr->ibmr; err_mr: @@ -239,13 +545,36 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(err); } + int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (mr->smr_info) { + /* When master/parent shared mr is dereged there is + no ability to share this mr any more - its mr_id will be + returned to the kernel as part of ib_uverbs_dereg_mr + and may be allocated again as part of other reg_mr. + */ + char name_buff[16]; + + sprintf(name_buff, "%X", mr->smr_info->mr_id); + /* Remove proc entry is checking internally that no operation + was strated on that proc fs file and if in the middle + current process will wait till end of operation. + That's why no sync mechanism is needed when we release + below the shared umem. + */ +#ifdef __linux__ + remove_proc_entry(name_buff, mlx4_mrs_dir_entry); + kfree(mr->smr_info); +#endif + } + if (mr->umem) ib_umem_release(mr->umem); + kfree(mr); return 0; @@ -258,7 +587,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr; int err; - mr = kmalloc(sizeof *mr, GFP_KERNEL); + mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -291,7 +620,7 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device struct mlx4_ib_fast_reg_page_list *mfrpl; int size = page_list_len * sizeof (u64); - if (page_list_len > MAX_FAST_REG_PAGES) + if (page_list_len > MLX4_MAX_FAST_REG_PAGES) return ERR_PTR(-EINVAL); mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); @@ -403,7 +732,7 @@ int mlx4_ib_unmap_fmr(struct list_head *fmr_list) err = mlx4_SYNC_TPT(mdev); if (err) - printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when " + pr_warn("SYNC_TPT error %d when " "unmapping FMRs\n", err); return 0; diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c index 8958c1ed12b3..980d121b56bb 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c @@ -32,15 +32,24 @@ */ #include +#include #include +#include +#include #include #include #include +#include #include +#include #include +#ifndef __linux__ +#define asm __asm +#endif + #include "mlx4_ib.h" #include "user.h" @@ -52,25 +61,22 @@ enum { MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, MLX4_IB_LINK_TYPE_IB = 0, - MLX4_IB_LINK_TYPE_ETH = 1, + MLX4_IB_LINK_TYPE_ETH = 1 }; enum { /* - * Largest possible UD header: send with GRH and immediate data. - * 4 bytes added to accommodate for eth header instead of lrh + * Largest possible UD header: send with GRH and immediate + * data plus 18 bytes for an Ethernet header with VLAN/802.1Q + * tag. (LRH would only use 8 bytes, so Ethernet is the + * biggest case) */ - MLX4_IB_UD_HEADER_SIZE = 76, - MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12 + MLX4_IB_UD_HEADER_SIZE = 82, + MLX4_IB_LSO_HEADER_SPARE = 128, }; enum { - MLX4_IBOE_ETHERTYPE = 0x8915 -}; - -struct mlx4_ib_xrc_reg_entry { - struct list_head list; - void *context; + MLX4_IB_IBOE_ETHERTYPE = 0x8915 }; struct mlx4_ib_sqp { @@ -83,7 +89,13 @@ struct mlx4_ib_sqp { }; enum { - MLX4_IB_MIN_SQ_STRIDE = 6 + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX4_RAW_QP_MTU = 7, + MLX4_RAW_QP_MSGMAX = 31, }; static const __be32 mlx4_ib_opcode[] = { @@ -104,32 +116,77 @@ static const __be32 mlx4_ib_opcode[] = { #ifndef wc_wmb #if defined(__i386__) - #define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory") + #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") #elif defined(__x86_64__) - #define wc_wmb() __asm volatile("sfence" ::: "memory") + #define wc_wmb() asm volatile("sfence" ::: "memory") #elif defined(__ia64__) - #define wc_wmb() __asm volatile("fwb" ::: "memory") + #define wc_wmb() asm volatile("fwb" ::: "memory") #else #define wc_wmb() wmb() #endif #endif - static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); } -static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - return qp->mqp.qpn >= dev->dev->caps.sqp_start && - qp->mqp.qpn <= dev->dev->caps.sqp_start + 3; + if (!mlx4_is_master(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && + qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + + 8 * MLX4_MFUNC_MAX; } +static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_sqp = 0; + int real_sqp = 0; + int i; + /* PPF or Native -- real SQP */ + real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); + if (real_sqp) + return 1; + /* VF or PF -- proxy SQP */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || + qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { + proxy_sqp = 1; + break; + } + } + } + return proxy_sqp; +} + +/* used for INIT/CLOSE port logic */ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - return qp->mqp.qpn >= dev->dev->caps.sqp_start && - qp->mqp.qpn <= dev->dev->caps.sqp_start + 1; + int proxy_qp0 = 0; + int real_qp0 = 0; + int i; + /* PPF or Native -- real QP0 */ + real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); + if (real_qp0) + return 1; + /* VF or PF -- proxy QP0 */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { + proxy_qp0 = 1; + break; + } + } + } + return proxy_qp0; } static void *get_wqe(struct mlx4_ib_qp *qp, int offset) @@ -237,16 +294,14 @@ static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) { struct ib_event event; - struct mlx4_ib_qp *mqp = to_mibqp(qp); - struct ib_qp *ibqp = &mqp->ibqp; - struct mlx4_ib_xrc_reg_entry *ctx_entry; - unsigned long flags; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; if (type == MLX4_EVENT_TYPE_PATH_MIG) to_mibqp(qp)->port = to_mibqp(qp)->alt_port; if (ibqp->event_handler) { event.device = ibqp->device; + event.element.qp = ibqp; switch (type) { case MLX4_EVENT_TYPE_PATH_MIG: event.event = IB_EVENT_PATH_MIG; @@ -273,27 +328,16 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) event.event = IB_EVENT_QP_ACCESS_ERR; break; default: - printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + pr_warn("Unexpected event type %d " "on QP %06x\n", type, qp->qpn); return; } - if (unlikely(ibqp->qp_type == IB_QPT_XRC && - mqp->flags & MLX4_IB_XRC_RCV)) { - event.event |= IB_XRC_QP_EVENT_FLAG; - event.element.xrc_qp_num = ibqp->qp_num; - spin_lock_irqsave(&mqp->xrc_reg_list_lock, flags); - list_for_each_entry(ctx_entry, &mqp->xrc_reg_list, list) - ibqp->event_handler(&event, ctx_entry->context); - spin_unlock_irqrestore(&mqp->xrc_reg_list_lock, flags); - return; - } - event.element.qp = ibqp; ibqp->event_handler(&event, ibqp->qp_context); } } -static int send_wqe_overhead(enum ib_qp_type type, u32 flags) +static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* * UD WQEs must have a datagram segment. @@ -302,20 +346,29 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) * header and space for the ICRC). */ switch (type) { - case IB_QPT_UD: + case MLX4_IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags & MLX4_IB_QP_LSO) ? 128 : 0); - case IB_QPT_UC: + ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0); + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI_OWNER: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); - case IB_QPT_XRC: - case IB_QPT_RC: + case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); - case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + ALIGN(MLX4_IB_UD_HEADER_SIZE + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, @@ -325,44 +378,28 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) ALIGN(4 + sizeof (struct mlx4_wqe_inline_seg), sizeof (struct mlx4_wqe_data_seg)); - case IB_QPT_RAW_ETY: - return sizeof(struct mlx4_wqe_ctrl_seg) + - ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE + - sizeof(struct mlx4_wqe_inline_seg), - sizeof(struct mlx4_wqe_data_seg)); - default: return sizeof (struct mlx4_wqe_ctrl_seg); } } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - int is_user, int has_srq_or_is_xrc, struct mlx4_ib_qp *qp) + int is_user, int has_rq, struct mlx4_ib_qp *qp) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || - cap->max_recv_sge > - min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) { - mlx4_ib_dbg("Requested RQ size (sge or wr) too large"); + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) return -EINVAL; - } - if (has_srq_or_is_xrc) { - /* QPs attached to an SRQ should have no RQ */ - if (cap->max_recv_wr) { - mlx4_ib_dbg("non-zero RQ size for QP using SRQ"); + if (!has_rq) { + if (cap->max_recv_wr) return -EINVAL; - } qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { /* HW requires >= 1 RQ entry with >= 1 gather entry */ - if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) { - mlx4_ib_dbg("user QP RQ has 0 wr's or 0 sge's " - "(wr: 0x%x, sge: 0x%x)", cap->max_recv_wr, - cap->max_recv_sge); + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) return -EINVAL; - } qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); @@ -378,44 +415,32 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); cap->max_recv_sge = min(qp->rq.max_gs, min(dev->dev->caps.max_sq_sg, - dev->dev->caps.max_rq_sg)); + dev->dev->caps.max_rq_sg)); } - /* We don't support inline sends for kernel QPs (yet) */ - return 0; } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - enum ib_qp_type type, struct mlx4_ib_qp *qp) + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) { int s; /* Sanity check SQ size before proceeding */ - if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || - cap->max_send_sge > - min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || cap->max_inline_data + send_wqe_overhead(type, qp->flags) + - sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) { - mlx4_ib_dbg("Requested SQ resources exceed device maxima"); + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) return -EINVAL; - } /* * For MLX transport we need 2 extra S/G entries: * one for the header and one for the checksum at the end */ - if ((type == IB_QPT_SMI || type == IB_QPT_GSI) && - cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) { - mlx4_ib_dbg("No space for SQP hdr/csum sge's"); + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) return -EINVAL; - } - - if (type == IB_QPT_RAW_ETY && - cap->max_send_sge + 1 > dev->dev->caps.max_sq_sg) { - mlx4_ib_dbg("No space for RAW ETY hdr"); - return -EINVAL; - } s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + @@ -434,7 +459,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, * anymore, so we do this only if selective signaling is off. * * Further, on 32-bit platforms, we can't use vmap() to make - * the QP buffer virtually contigious. Thus we have to use + * the QP buffer virtually contiguous. Thus we have to use * constant-sized WRs to make sure a WR is always fully within * a single page-sized chunk. * @@ -457,7 +482,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, */ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits && BITS_PER_LONG == 64 && - type != IB_QPT_SMI && type != IB_QPT_GSI && type != IB_QPT_RAW_ETY) + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) qp->sq.wqe_shift = ilog2(64); else qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); @@ -516,10 +543,8 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev, if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || ucmd->log_sq_stride > ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || - ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) { - mlx4_ib_dbg("Requested max wqes or wqe stride exceeds max"); + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) return -EINVAL; - } qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; qp->sq.wqe_shift = ucmd->log_sq_stride; @@ -530,30 +555,398 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev, return 0; } +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = + kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + +static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp, + struct ib_qp_init_attr *attr, int *qpn) +{ + struct mlx4_ib_qpg_data *qpg_data; + int tss_num, rss_num; + int tss_align_num, rss_align_num; + int tss_base, rss_base = 0; + int err; + + /* Parent is part of the TSS range (in SW TSS ARP is sent via parent) */ + tss_num = 1 + attr->parent_attrib.tss_child_count; + tss_align_num = roundup_pow_of_two(tss_num); + rss_num = attr->parent_attrib.rss_child_count; + rss_align_num = roundup_pow_of_two(rss_num); + + if (rss_num > 1) { + /* RSS is requested */ + if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS)) + return -ENOSYS; + if (rss_align_num > dev->dev->caps.max_rss_tbl_sz) + return -EINVAL; + /* We must work with power of two */ + attr->parent_attrib.rss_child_count = rss_align_num; + } + + qpg_data = kzalloc(sizeof *qpg_data, GFP_KERNEL); + if (!qpg_data) + return -ENOMEM; + + if(pqp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base); + else + err = mlx4_qp_reserve_range(dev->dev, tss_align_num, + tss_align_num, &tss_base, 1); + if (err) + goto err1; + + if (tss_num > 1) { + u32 alloc = BITS_TO_LONGS(tss_align_num) * sizeof(long); + qpg_data->tss_bitmap = kzalloc(alloc, GFP_KERNEL); + if (qpg_data->tss_bitmap == NULL) { + err = -ENOMEM; + goto err2; + } + bitmap_fill(qpg_data->tss_bitmap, tss_num); + /* Note parent takes first index */ + clear_bit(0, qpg_data->tss_bitmap); + } + + if (rss_num > 1) { + u32 alloc = BITS_TO_LONGS(rss_align_num) * sizeof(long); + err = mlx4_qp_reserve_range(dev->dev, rss_align_num, + 1, &rss_base, 0); + if (err) + goto err3; + qpg_data->rss_bitmap = kzalloc(alloc, GFP_KERNEL); + if (qpg_data->rss_bitmap == NULL) { + err = -ENOMEM; + goto err4; + } + bitmap_fill(qpg_data->rss_bitmap, rss_align_num); + } + + qpg_data->tss_child_count = attr->parent_attrib.tss_child_count; + qpg_data->rss_child_count = attr->parent_attrib.rss_child_count; + qpg_data->qpg_parent = pqp; + qpg_data->qpg_tss_mask_sz = ilog2(tss_align_num); + qpg_data->tss_qpn_base = tss_base; + qpg_data->rss_qpn_base = rss_base; + + pqp->qpg_data = qpg_data; + *qpn = tss_base; + + return 0; + +err4: + mlx4_qp_release_range(dev->dev, rss_base, rss_align_num); + +err3: + if (tss_num > 1) + kfree(qpg_data->tss_bitmap); + +err2: + if(pqp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, tss_base, tss_align_num); + else + mlx4_qp_release_range(dev->dev, tss_base, tss_align_num); + +err1: + kfree(qpg_data); + return err; +} + +static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp) +{ + struct mlx4_ib_qpg_data *qpg_data = pqp->qpg_data; + int align_num; + + if (qpg_data->tss_child_count > 1) + kfree(qpg_data->tss_bitmap); + + align_num = roundup_pow_of_two(1 + qpg_data->tss_child_count); + if(pqp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpg_data->tss_qpn_base, align_num); + else + mlx4_qp_release_range(dev->dev, qpg_data->tss_qpn_base, align_num); + + if (qpg_data->rss_child_count > 1) { + kfree(qpg_data->rss_bitmap); + align_num = roundup_pow_of_two(qpg_data->rss_child_count); + mlx4_qp_release_range(dev->dev, qpg_data->rss_qpn_base, + align_num); + } + + kfree(qpg_data); +} + +static int alloc_qpg_qpn(struct ib_qp_init_attr *init_attr, + struct mlx4_ib_qp *pqp, int *qpn) +{ + struct mlx4_ib_qp *mqp = to_mqp(init_attr->qpg_parent); + struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data; + u32 idx, old; + + switch (init_attr->qpg_type) { + case IB_QPG_CHILD_TX: + if (qpg_data->tss_child_count == 0) + return -EINVAL; + do { + /* Parent took index 0 */ + idx = find_first_bit(qpg_data->tss_bitmap, + qpg_data->tss_child_count + 1); + if (idx >= qpg_data->tss_child_count + 1) + return -ENOMEM; + old = test_and_clear_bit(idx, qpg_data->tss_bitmap); + } while (old == 0); + idx += qpg_data->tss_qpn_base; + break; + case IB_QPG_CHILD_RX: + if (qpg_data->rss_child_count == 0) + return -EINVAL; + do { + idx = find_first_bit(qpg_data->rss_bitmap, + qpg_data->rss_child_count); + if (idx >= qpg_data->rss_child_count) + return -ENOMEM; + old = test_and_clear_bit(idx, qpg_data->rss_bitmap); + } while (old == 0); + idx += qpg_data->rss_qpn_base; + break; + default: + return -EINVAL; + } + + pqp->qpg_data = qpg_data; + *qpn = idx; + + return 0; +} + +static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn) +{ + struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data; + + switch (mqp->qpg_type) { + case IB_QPG_CHILD_TX: + /* Do range check */ + qpn -= qpg_data->tss_qpn_base; + set_bit(qpn, qpg_data->tss_bitmap); + break; + case IB_QPG_CHILD_RX: + qpn -= qpg_data->rss_qpn_base; + set_bit(qpn, qpg_data->rss_bitmap); + break; + default: + /* error */ + pr_warn("wrong qpg type (%d)\n", mqp->qpg_type); + break; + } +} + +static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + struct ib_qp_init_attr *attr, int *qpn) +{ + int err = 0; + + switch (attr->qpg_type) { + case IB_QPG_NONE: + /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE + * BlueFlame setup flow wrongly causes VLAN insertion. */ + if (attr->qp_type == IB_QPT_RAW_PACKET) { + err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 1); + } else { + if(qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 0); + } + break; + case IB_QPG_PARENT: + err = init_qpg_parent(dev, qp, attr, qpn); + break; + case IB_QPG_CHILD_TX: + case IB_QPG_CHILD_RX: + err = alloc_qpg_qpn(attr, qp, qpn); + break; + default: + qp->qpg_type = IB_QPG_NONE; + err = -EINVAL; + break; + } + if (err) + return err; + qp->qpg_type = attr->qpg_type; + return 0; +} + +static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + enum ib_qpg_type qpg_type, int qpn) +{ + switch (qpg_type) { + case IB_QPG_NONE: + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + break; + case IB_QPG_PARENT: + free_qpg_parent(dev, qp); + break; + case IB_QPG_CHILD_TX: + case IB_QPG_CHILD_RX: + free_qpg_qpn(qp, qpn); + break; + default: + break; + } +} + +/* Revert allocation on create_qp_common */ +static void unalloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + struct ib_qp_init_attr *attr, int qpn) +{ + free_qpn_common(dev, qp, attr->qpg_type, qpn); +} + +static void release_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + free_qpn_common(dev, qp, qp->qpg_type, qp->mqp.qpn); +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) { int qpn; int err; + struct mlx4_ib_sqp *sqp; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { + if (init_attr->qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_PROXY_GSI; + else if (mlx4_is_master(dev->dev)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if ((tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) || + !mlx4_is_master(dev->dev)) + return -EINVAL; + if (tnl_init->proxy_qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_TUN_GSI; + else if (tnl_init->slave == mlx4_master_func_num(dev->dev)) + qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_TUN_SMI; + /* we are definitely in the PPF here, since we are creating + * tunnel QPs. base_tunnel_sqpn is therefore valid. */ + qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || + (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + qp->pri.vid = qp->alt.vid = 0xFFFF; + } else { + qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); + if (!qp) + return -ENOMEM; + qp->pri.vid = qp->alt.vid = 0xFFFF; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); - spin_lock_init(&qp->xrc_reg_list_lock); INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + INIT_LIST_HEAD(&qp->rules_list); qp->state = IB_QPS_RESET; if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, - !!init_attr->srq || !!init_attr->xrc_domain , qp); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); if (err) goto err; if (pd->uobject) { struct mlx4_ib_create_qp ucmd; + int shift; + int n; if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { err = -EFAULT; @@ -570,30 +963,25 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); - mlx4_ib_dbg("ib_umem_get error (%d)", err); goto err; } - err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem), - ilog2(qp->umem->page_size), &qp->mtt); - if (err) { - mlx4_ib_dbg("mlx4_mtt_init error (%d)", err); + n = ib_umem_page_count(qp->umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + + if (err) goto err_buf; - } err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); - if (err) { - mlx4_ib_dbg("mlx4_ib_umem_write_mtt error (%d)", err); + if (err) goto err_mtt; - } - if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) { + if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &qp->db); - if (err) { - mlx4_ib_dbg("mlx4_ib_db_map_user error (%d)", err); + if (err) goto err_mtt; - } } } else { qp->sq_no_prefetch = 0; @@ -604,11 +992,17 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; - err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp); + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP && + dev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED && + !mlx4_is_mfunc(dev->dev)) + qp->flags |= MLX4_IB_QP_NETIF; + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; - if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) { + if (qp_has_rq(init_attr)) { err = mlx4_db_alloc(dev->dev, &qp->db, 0); if (err) goto err; @@ -617,9 +1011,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (qp->max_inline_data) { - err = mlx4_bf_alloc(dev->dev, &qp->bf); + err = mlx4_bf_alloc(dev->dev, &qp->bf, 0); if (err) { - mlx4_ib_dbg("failed to allocate blue flame register (%d)", err); + pr_debug("failed to allocate blue flame" + " register (%d)", err); qp->bf.uar = &dev->priv_uar; } } else @@ -632,16 +1027,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, &qp->mtt); - if (err) { - mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err); + if (err) goto err_buf; - } err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); - if (err) { - mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err); + if (err) goto err_mtt; - } qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); @@ -653,18 +1044,24 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (sqpn) { - qpn = sqpn; + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } } else { - err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn); + err = alloc_qpn_common(dev, qp, init_attr, &qpn); if (err) - goto err_wrid; + goto err_proxy; } err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); if (err) goto err_qpn; - if (init_attr->qp_type == IB_QPT_XRC) + if (init_attr->qp_type == IB_QPT_XRC_TGT) qp->mqp.qpn |= (1 << 23); /* @@ -675,18 +1072,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); qp->mqp.event = mlx4_ib_qp_event; - + if (!*caller_qp) + *caller_qp = qp; return 0; err_qpn: - if (!sqpn) - mlx4_qp_release_range(dev->dev, qpn, 1); + unalloc_qpn_common(dev, qp, init_attr, qpn); +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); err_wrid: if (pd->uobject) { - if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) - mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), - &qp->db); + if (qp_has_rq(init_attr)) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); @@ -702,13 +1101,15 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); err_db: - if (!pd->uobject && !init_attr->srq && init_attr->qp_type != IB_QPT_XRC) + if (!pd->uobject && qp_has_rq(init_attr)) mlx4_db_free(dev->dev, &qp->db); if (qp->max_inline_data) mlx4_bf_free(dev->dev, &qp->bf); err: + if (!*caller_qp) + kfree(qp); return err; } @@ -727,10 +1128,12 @@ static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) } static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); - else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { @@ -740,10 +1143,12 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv } static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { + __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); - else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { @@ -754,7 +1159,7 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re static void del_gid_entries(struct mlx4_ib_qp *qp) { - struct gid_entry *ge, *tmp; + struct mlx4_ib_gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { list_del(&ge->list); @@ -762,19 +1167,66 @@ static void del_gid_entries(struct mlx4_ib_qp *qp) } } +static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) + return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); + else + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx4_ib_qp *qp, + struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); + *recv_cq = *send_cq; + break; + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = *send_cq; + break; + default: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + } +} + static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) - printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n", + pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } - send_cq = to_mcq(qp->ibqp.send_cq); - recv_cq = to_mcq(qp->ibqp.recv_cq); + get_cqs(qp, &send_cq, &recv_cq); mlx4_ib_lock_cqs(send_cq, recv_cq); @@ -791,106 +1243,201 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_free(dev->dev, &qp->mqp); - if (!is_sqp(dev, qp)) - mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) + release_qpn_common(dev, qp); mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { - if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC) + if (qp->rq.wqe_cnt) mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), &qp->db); ib_umem_release(qp->umem); } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + free_proxy_bufs(&dev->ib_dev, qp); mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); if (qp->max_inline_data) mlx4_bf_free(dev->dev, &qp->bf); - if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC) + + if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); } del_gid_entries(qp); } +static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + /* Native or PPF */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_master(dev->dev) && + attr->create_flags & MLX4_IB_SRIOV_SQP)) { + return dev->dev->phys_caps.base_sqpn + + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + + attr->port_num - 1; + } + /* PF or VF -- creating proxies */ + if (attr->qp_type == IB_QPT_SMI) + return dev->dev->caps.qp0_proxy[attr->port_num - 1]; + else + return dev->dev->caps.qp1_proxy[attr->port_num - 1]; +} + +static int check_qpg_attr(struct mlx4_ib_dev *dev, + struct ib_qp_init_attr *attr) +{ + if (attr->qpg_type == IB_QPG_NONE) + return 0; + + if (attr->qp_type != IB_QPT_UD) + return -EINVAL; + + if (attr->qpg_type == IB_QPG_PARENT) { + if (attr->parent_attrib.tss_child_count == 1) + return -EINVAL; /* Doesn't make sense */ + if (attr->parent_attrib.rss_child_count == 1) + return -EINVAL; /* Doesn't make sense */ + if ((attr->parent_attrib.tss_child_count == 0) && + (attr->parent_attrib.rss_child_count == 0)) + /* Should be called with IP_QPG_NONE */ + return -EINVAL; + if (attr->parent_attrib.rss_child_count > 1) { + int rss_align_num; + if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS)) + return -ENOSYS; + rss_align_num = roundup_pow_of_two( + attr->parent_attrib.rss_child_count); + if (rss_align_num > dev->dev->caps.max_rss_tbl_sz) + return -EINVAL; + } + } else { + struct mlx4_ib_qpg_data *qpg_data; + if (attr->qpg_parent == NULL) + return -EINVAL; + if (IS_ERR(attr->qpg_parent)) + return -EINVAL; + qpg_data = to_mqp(attr->qpg_parent)->qpg_data; + if (qpg_data == NULL) + return -EINVAL; + if (attr->qpg_type == IB_QPG_CHILD_TX && + !qpg_data->tss_child_count) + return -EINVAL; + if (attr->qpg_type == IB_QPG_CHILD_RX && + !qpg_data->rss_child_count) + return -EINVAL; + } + return 0; +} + +#define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END) \ + & ~(IB_QP_CREATE_RESERVED_START - 1)) + +static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(enum ib_qp_create_flags ib_qp_flags) +{ + enum mlx4_ib_qp_flags mlx4_ib_qp_flags = 0; + + if (ib_qp_flags & IB_QP_CREATE_IPOIB_UD_LSO) + mlx4_ib_qp_flags |= MLX4_IB_QP_LSO; + + if (ib_qp_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + mlx4_ib_qp_flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (ib_qp_flags & IB_QP_CREATE_NETIF_QP) + mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF; + + /* reserved flags */ + mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK); + + return mlx4_ib_qp_flags; +} + struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { - struct mlx4_ib_dev *dev = to_mdev(pd->device); - struct mlx4_ib_sqp *sqp; - struct mlx4_ib_qp *qp; + struct mlx4_ib_qp *qp = NULL; int err; + u16 xrcdn = 0; + enum mlx4_ib_qp_flags mlx4_qp_flags = to_mlx4_ib_qp_flags(init_attr->create_flags); + struct ib_device *device; + /* see ib_core::ib_create_qp same handling */ + device = pd ? pd->device : init_attr->xrcd->device; /* - * We only support LSO and multicast loopback blocking, and - * only for kernel UD QPs. + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. */ - if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO | - IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF)) return ERR_PTR(-EINVAL); + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + if (init_attr->create_flags && - (pd->uobject || init_attr->qp_type != IB_QPT_UD)) + (udata || + ((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) && + init_attr->qp_type != IB_QPT_UD) || + ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) && + init_attr->qp_type > IB_QPT_GSI))) return ERR_PTR(-EINVAL); + err = check_qpg_attr(to_mdev(device), init_attr); + if (err) + return ERR_PTR(err); + switch (init_attr->qp_type) { - case IB_QPT_XRC: - if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + case IB_QPT_XRC_TGT: + pd = to_mxrcd(init_attr->xrcd)->pd; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; + /* fall through */ + case IB_QPT_XRC_INI: + if (!(to_mdev(device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); + init_attr->recv_cq = init_attr->send_cq; + /* fall through */ case IB_QPT_RC: case IB_QPT_UC: - case IB_QPT_UD: - case IB_QPT_RAW_ETH: - { + case IB_QPT_RAW_PACKET: qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); - - err = create_qp_common(dev, pd, init_attr, udata, 0, qp); + qp->pri.vid = qp->alt.vid = 0xFFFF; + /* fall through */ + case IB_QPT_UD: + { + err = create_qp_common(to_mdev(device), pd, init_attr, udata, 0, &qp); if (err) { kfree(qp); return ERR_PTR(err); } - if (init_attr->qp_type == IB_QPT_XRC) - qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn; - else - qp->xrcdn = 0; - qp->ibqp.qp_num = qp->mqp.qpn; + qp->xrcdn = xrcdn; break; } - case IB_QPT_RAW_ETY: - if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY)) - return ERR_PTR(-ENOSYS); case IB_QPT_SMI: case IB_QPT_GSI: { /* Userspace is not allowed to create special QPs: */ - if (pd->uobject) { - mlx4_ib_dbg("Userspace is not allowed to create special QPs"); + if (udata) return ERR_PTR(-EINVAL); - } - sqp = kzalloc(sizeof *sqp, GFP_KERNEL); - if (!sqp) - return ERR_PTR(-ENOMEM); - - qp = &sqp->qp; - - err = create_qp_common(dev, pd, init_attr, udata, - dev->dev->caps.sqp_start + - (init_attr->qp_type == IB_QPT_RAW_ETY ? 4 : - (init_attr->qp_type == IB_QPT_SMI ? 0 : 2)) + - init_attr->port_num - 1, - qp); - if (err) { - kfree(sqp); + err = create_qp_common(to_mdev(device), pd, init_attr, udata, + get_sqp_num(to_mdev(device), init_attr), + &qp); + if (err) return ERR_PTR(err); - } qp->port = init_attr->port_num; qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; @@ -898,8 +1445,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, break; } default: - mlx4_ib_dbg("Invalid QP type requested for create_qp (%d)", - init_attr->qp_type); + /* Don't support raw QPs */ return ERR_PTR(-EINVAL); } @@ -910,11 +1456,13 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); + struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); - destroy_qp_common(dev, mqp, !!qp->pd->uobject); + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -924,18 +1472,27 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) return 0; } -static int to_mlx4_st(enum ib_qp_type type) +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { - case IB_QPT_RC: return MLX4_QP_ST_RC; - case IB_QPT_UC: return MLX4_QP_ST_UC; - case IB_QPT_UD: return MLX4_QP_ST_UD; - case IB_QPT_XRC: return MLX4_QP_ST_XRC; - case IB_QPT_RAW_ETY: - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_RAW_ETH: return MLX4_QP_ST_MLX; - default: return -1; + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC_INI: + case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_UD : -1); + default: return -1; } } @@ -986,8 +1543,10 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) } static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, - struct mlx4_qp_path *path, u8 port) + struct mlx4_ib_qp *qp, struct mlx4_qp_path *path, + u8 port, int is_primary) { + struct net_device *ndev; int err; int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; @@ -995,6 +1554,10 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, int is_mcast; u16 vlan_tag; int vidx; + int smac_index; + u64 u64_mac; + u8 *smac; + struct mlx4_roce_smac_vlan_info *smac_info; path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); @@ -1008,7 +1571,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { - printk(KERN_ERR "sgid_index (%u) too large. max is %d\n", + pr_err("sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1); return -1; } @@ -1023,29 +1586,96 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } if (is_eth) { - path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 0x7) << 3) | ((ah->sl & 8) >> 1); - if (!(ah->ah_flags & IB_AH_GRH)) return -1; + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); + + if (is_primary) + smac_info = &qp->pri; + else + smac_info = &qp->alt; + + vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); + if (vlan_tag < 0x1000) { + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + path->fl = 1 << 6; + } else { + path->vlan_index = smac_info->vlan_index; + path->fl = 1 << 6; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + path->fl = 1 << 6; + } + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } + } + err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port); if (err) return err; + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + spin_lock(&dev->iboe.lock); + ndev = dev->iboe.netdevs[port - 1]; + if (ndev) { +#ifdef __linux__ + smac = ndev->dev_addr; /* fixme: cache this value */ +#else + smac = IF_LLADDR(ndev); /* fixme: cache this value */ +#endif + + u64_mac = mlx4_mac_to_u64(smac); + } else + u64_mac = dev->dev->caps.def_mac[port]; + spin_unlock(&dev->iboe.lock); + + if (!smac_info->smac || smac_info->smac != u64_mac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, u64_mac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = u64_mac; + smac_info->candidate_smac_port = port; + } else + return -EINVAL; + } else + smac_index = smac_info->smac_index; + memcpy(path->dmac, mac, 6); path->ackto = MLX4_IB_LINK_TYPE_ETH; - /* use index 0 into MAC table for IBoE */ - path->grh_mylmc &= 0x80; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80 ; - vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); - if (vlan_tag < 0x1000) { - if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) - return -ENOENT; - - path->vlan_index = vidx; - path->fl = 1 << 6; - } } else path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); @@ -1055,7 +1685,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - struct gid_entry *ge, *tmp; + struct mlx4_ib_gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { @@ -1065,23 +1695,59 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + struct mlx4_qp_context *context) +{ + struct net_device *ndev; + u64 u64_mac; + u8 *smac; + int smac_index; + + ndev = dev->iboe.netdevs[qp->port - 1]; + if (ndev) { +#ifdef __linux__ + smac = ndev->dev_addr; /* fixme: cache this value */ +#else + smac = IF_LLADDR(ndev); /* fixme: cache this value */ +#endif + u64_mac = mlx4_mac_to_u64(smac); + } else + u64_mac = dev->dev->caps.def_mac[qp->port]; + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else + return -ENOENT; + } + return 0; +} static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_ib_pd *pd; + struct mlx4_ib_cq *send_cq, *recv_cq; struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; + int steer_qp = 0; int err = -EINVAL; + int is_eth = -1; context = kzalloc(sizeof *context, GFP_KERNEL); if (!context) return -ENOMEM; context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | - (to_mlx4_st(ibqp->qp_type) << 16)); + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); @@ -1099,11 +1765,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, break; } } - if (ibqp->qp_type == IB_QPT_RAW_ETH) - context->mtu_msgmax = 0xff; - else if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || - ibqp->qp_type == IB_QPT_RAW_ETY) + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; + else if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; else if (ibqp->qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) context->mtu_msgmax = (IB_MTU_4096 << 5) | @@ -1112,7 +1778,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { - printk(KERN_ERR "path MTU (%u) is invalid\n", + pr_err("path MTU (%u) is invalid\n", attr->path_mtu); goto out; } @@ -1130,8 +1796,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; - if (ibqp->qp_type == IB_QPT_XRC) - context->xrcd = cpu_to_be32((u32) qp->xrcdn); + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + context->param3 |= cpu_to_be32(1 << 30); } if (qp->ibqp.uobject) @@ -1150,63 +1816,67 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && - dev->counters[qp->port - 1] != -1) { - context->pri_path.counter_index = dev->counters[qp->port - 1]; - optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + if (dev->counters[qp->port - 1] != -1) { + context->pri_path.counter_index = + dev->counters[qp->port - 1]; + optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + } else + context->pri_path.counter_index = 0xff; + + if (qp->flags & MLX4_IB_QP_NETIF && + (qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } } if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.disable_pkey_check = 0x40; context->pri_path.pkey_index = attr->pkey_index; optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; } if (attr_mask & IB_QP_AV) { - if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, - attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) { - mlx4_ib_dbg("qpn 0x%x: could not set pri path params", - ibqp->qp_num); + if (mlx4_set_path(dev, &attr->ah_attr, qp, &context->pri_path, + attr_mask & IB_QP_PORT ? + attr->port_num : qp->port, 1)) goto out; - } optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE); } if (attr_mask & IB_QP_TIMEOUT) { - context->pri_path.ackto |= (attr->timeout << 3); + context->pri_path.ackto |= attr->timeout << 3; optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; } if (attr_mask & IB_QP_ALT_PATH) { if (attr->alt_port_num == 0 || - attr->alt_port_num > dev->num_ports) { - mlx4_ib_dbg("qpn 0x%x: invalid alternate port num (%d)", - ibqp->qp_num, attr->alt_port_num); + attr->alt_port_num > dev->dev->caps.num_ports) goto out; - } if (attr->alt_pkey_index >= - dev->dev->caps.pkey_table_len[attr->alt_port_num]) { - mlx4_ib_dbg("qpn 0x%x: invalid alt pkey index (0x%x)", - ibqp->qp_num, attr->alt_pkey_index); + dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; - } - if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, - attr->alt_port_num)) { - mlx4_ib_dbg("qpn 0x%x: could not set alt path params", - ibqp->qp_num); + if (mlx4_set_path(dev, &attr->alt_ah_attr, qp, &context->alt_path, + attr->alt_port_num, 0)) goto out; - } context->alt_path.pkey_index = attr->alt_pkey_index; context->alt_path.ackto = attr->alt_timeout << 3; optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn); - context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + context->pd = cpu_to_be32(pd->pdn); + context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); + context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ if (!qp->ibqp.uobject) @@ -1232,8 +1902,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_SQ_PSN) context->next_send_psn = cpu_to_be32(attr->sq_psn); - context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn); - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) context->params2 |= @@ -1246,6 +1914,18 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } + if (attr_mask & IB_M_EXT_CLASS_1) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER); + + /* for now we enable also sqe on send */ + if (attr_mask & IB_M_EXT_CLASS_2) { + context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_SQ); + context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER); + } + + if (attr_mask & IB_M_EXT_CLASS_3) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ); + if (ibqp->srq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); @@ -1256,30 +1936,65 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); - context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn); - + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ if (attr_mask & IB_QP_QKEY) { - context->qkey = cpu_to_be32(attr->qkey); + if (qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + pr_err("Cannot use reserved QKEY" + " 0x%x (range 0xffff0000..0xffffffff" + " is reserved)\n", attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } optpar |= MLX4_QP_OPTPAR_Q_KEY; } if (ibqp->srq) context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); - if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC && - cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || - ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_ETY || - ibqp->qp_type == IB_QPT_RAW_ETH)) { + ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; - if (is_qp0(dev, qp)) + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; - else + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, context); + if (err) + return -EINVAL; + } + } } if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && @@ -1291,6 +2006,43 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->rlkey |= (1 << 4); + if ((attr_mask & IB_QP_GROUP_RSS) && + (qp->qpg_data->rss_child_count > 1)) { + struct mlx4_ib_qpg_data *qpg_data = qp->qpg_data; + void *rss_context_base = &context->pri_path; + struct mlx4_rss_context *rss_context = + (struct mlx4_rss_context *) (rss_context_base + + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH); + + context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); + + /* This should be tbl_sz_base_qpn */ + rss_context->base_qpn = cpu_to_be32(qpg_data->rss_qpn_base | + (ilog2(qpg_data->rss_child_count) << 24)); + rss_context->default_qpn = cpu_to_be32(qpg_data->rss_qpn_base); + /* This should be flags_hash_fn */ + rss_context->flags = MLX4_RSS_TCP_IPV6 | + MLX4_RSS_TCP_IPV4; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS) { + rss_context->base_qpn_udp = rss_context->default_qpn; + rss_context->flags |= MLX4_RSS_IPV6 | + MLX4_RSS_IPV4 | + MLX4_RSS_UDP_IPV6 | + MLX4_RSS_UDP_IPV4; + } + if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) { + static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, + 0x1983A2FC, 0x943E1ADB, 0xD9389E6B, 0xD1039C2C, + 0xA74499AD, 0x593D56D9, 0xF3253C06, 0x2ADC1FFC}; + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + memcpy(rss_context->rss_key, rsskey, + sizeof(rss_context->rss_key)); + } else { + rss_context->hash_fn = MLX4_RSS_HASH_XOR; + memset(rss_context->rss_key, 0, + sizeof(rss_context->rss_key)); + } + } /* * Before passing a kernel QP to the HW, make sure that the * ownership bits of the send queue are set and the SQ @@ -1333,6 +2085,29 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (is_sqp(dev, qp)) store_sqp_attrs(to_msqp(qp), attr, attr_mask); + /* Set 'ignore_cq_overrun' bits for collectives offload */ + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + if (attr_mask & (IB_M_EXT_CLASS_2 | IB_M_EXT_CLASS_3)) { + err = mlx4_ib_ignore_overrun_cq(ibqp->send_cq); + if (err) { + pr_err("Failed to set ignore CQ " + "overrun for QP 0x%x's send CQ\n", + ibqp->qp_num); + goto out; + } + + if (ibqp->recv_cq != ibqp->send_cq) { + err = mlx4_ib_ignore_overrun_cq(ibqp->recv_cq); + if (err) { + pr_err("Failed to set ignore " + "CQ overrun for QP 0x%x's recv " + "CQ\n", ibqp->qp_num); + goto out; + } + } + } + } + /* * If we moved QP0 to RTR, bring the IB link up; if we moved * QP0 to RESET or ERROR, bring the link back down. @@ -1340,7 +2115,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (is_qp0(dev, qp)) { if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) if (mlx4_INIT_PORT(dev->dev, qp->port)) - printk(KERN_WARNING "INIT_PORT failed for port %d\n", + pr_warn("INIT_PORT failed for port %d\n", qp->port); if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && @@ -1352,23 +2127,120 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ - if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq): NULL); - if (ibqp->send_cq != ibqp->recv_cq) - mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL); + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); - qp->rq.head = 0; - qp->rq.tail = 0; - qp->sq.head = 0; - qp->sq.tail = 0; - qp->sq_next_wqe = 0; - if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC) - *qp->db.db = 0; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF && + (qp->qpg_type == IB_QPG_NONE || + qp->qpg_type == IB_QPG_PARENT)) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } } out: + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); + if (qp->pri.candidate_smac) { + if (err) + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + else { + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + } + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->pri.candidate_smac); + else { + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + } + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + return err; } @@ -1385,59 +2257,62 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { - mlx4_ib_dbg("qpn 0x%x: invalid attribute mask specified " - "for transition %d to %d. qp_type %d, attr_mask 0x%x", - ibqp->qp_num, cur_state, new_state, - ibqp->qp_type, attr_mask); + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask & ~IB_M_QP_MOD_VEND_MASK)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); goto out; } - if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type != IB_QPT_RAW_ETH) && + if ((attr_mask & IB_M_QP_MOD_VEND_MASK) && !dev->dev->caps.sync_qp) { + pr_err("extended verbs are not supported by %s\n", + dev->ib_dev.name); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->num_ports)) { - mlx4_ib_dbg("qpn 0x%x: invalid port number (%d) specified " - "for transition %d to %d. qp_type %d", - ibqp->qp_num, attr->port_num, cur_state, - new_state, ibqp->qp_type); + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); goto out; } - if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_ETH) && - (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) - != IB_LINK_LAYER_ETHERNET)) { - mlx4_ib_dbg("qpn 0x%x: invalid port (%d) specified (not RDMAoE)" - "for transition %d to %d. qp_type %d", - ibqp->qp_num, attr->port_num, cur_state, - new_state, ibqp->qp_type); + if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && + (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != + IB_LINK_LAYER_ETHERNET)) goto out; - } if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { - mlx4_ib_dbg("qpn 0x%x: invalid pkey index (%d) specified " - "for transition %d to %d. qp_type %d", - ibqp->qp_num, attr->pkey_index, cur_state, - new_state, ibqp->qp_type); + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); goto out; } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { - mlx4_ib_dbg("qpn 0x%x: max_rd_atomic (%d) too large. " - "Transition %d to %d. qp_type %d", - ibqp->qp_num, attr->max_rd_atomic, cur_state, - new_state, ibqp->qp_type); + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { - mlx4_ib_dbg("qpn 0x%x: max_dest_rd_atomic (%d) too large. " - "Transition %d to %d. qp_type %d", - ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, - new_state, ibqp->qp_type); + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } @@ -1453,181 +2328,73 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, return err; } -static int build_raw_ety_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, - void *wqe, unsigned *mlx_seg_len) +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, + struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) { - int payload = 0; - int header_size, packet_length; - struct mlx4_wqe_mlx_seg *mlx = wqe; - struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; - u32 *lrh = wqe + sizeof *mlx + sizeof *inl; - int i; - - /* Only IB_WR_SEND is supported */ - if (wr->opcode != IB_WR_SEND) - return -EINVAL; - - for (i = 0; i < wr->num_sge; ++i) - payload += wr->sg_list[i].length; - - header_size = IB_LRH_BYTES + 4; /* LRH + RAW_HEADER (32 bits) */ - - /* headers + payload and round up */ - packet_length = (header_size + payload + 3) / 4; - - mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - - mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_ICRC | - (wr->wr.raw_ety.lrh->service_level << 8)); - - mlx->rlid = wr->wr.raw_ety.lrh->destination_lid; - - wr->wr.raw_ety.lrh->packet_length = cpu_to_be16(packet_length); - - ib_lrh_header_pack(wr->wr.raw_ety.lrh, lrh); - lrh += IB_LRH_BYTES / 4; /* LRH size is a dword multiple */ - *lrh = cpu_to_be32(wr->wr.raw_ety.eth_type); - - inl->byte_count = cpu_to_be32(1 << 31 | header_size); - - *mlx_seg_len = - ALIGN(sizeof(struct mlx4_wqe_inline_seg) + header_size, 16); - - return 0; -} - -static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, - void *wqe, unsigned *mlx_seg_len) -{ - struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev; + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); u16 pkey; + u32 qkey; int send_size; int header_size; int spc; int i; - union ib_gid sgid; - int is_eth; - int is_grh; - int is_vlan = 0; - int err; - u16 vlan; - vlan = 0; + if (wr->opcode != IB_WR_SEND) + return -EINVAL; + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; - is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; - is_grh = mlx4_ib_ah_grh_present(ah); - err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, &sgid); - if (err) - return err; - if (is_eth) { - is_vlan = rdma_get_vlan_id(&sgid) < 0x1000; - vlan = rdma_get_vlan_id(&sgid); - } + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) + send_size += sizeof (struct mlx4_ib_tunnel_header); - ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); - if (!is_eth) { + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; - sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; - sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); - } - - if (is_grh) { - sqp->ud_header.grh.traffic_class = - (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; - sqp->ud_header.grh.flow_label = - ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); - sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; - ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); - memcpy(sqp->ud_header.grh.destination_gid.raw, - ah->av.ib.dgid, 16); + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); } mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - if (!is_eth) { - mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | - (sqp->ud_header.lrh.destination_lid == - IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | - (sqp->ud_header.lrh.service_level << 8)); - mlx->rlid = sqp->ud_header.lrh.destination_lid; - } + /* force loopback */ + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); + mlx->rlid = sqp->ud_header.lrh.destination_lid; - switch (wr->opcode) { - case IB_WR_SEND: - sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; - sqp->ud_header.immediate_present = 0; - break; - case IB_WR_SEND_WITH_IMM: - sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; - sqp->ud_header.immediate_present = 1; - sqp->ud_header.immediate_data = wr->ex.imm_data; - break; - default: - return -EINVAL; - } - - if (is_eth) { - u8 *smac; - - memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); -#ifdef __linux__ - smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */ -#else - smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); /* fixme: cache this value */ -#endif - memcpy(sqp->ud_header.eth.smac_h, smac, 6); - if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) - mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); - if (!is_vlan) - sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); - else { - u16 pcp; - - sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); - pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13; - sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); - } - } else { - sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; - if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) - sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; - } + sqp->ud_header.lrh.virtual_lane = 0; sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); - if (!sqp->qp.ibqp.qp_num) - ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); - else - ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); - sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); - sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? - sqp->qkey : wr->wr.ud.remote_qkey); - sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); - if (0) { - printk(KERN_ERR "built UD header of size %d:\n", header_size); - for (i = 0; i < header_size / 4; ++i) { - if (i % 8 == 0) - printk(" [%02x] ", i * 4); - printk(" %08x", - be32_to_cpu(((__be32 *) sqp->header_buf)[i])); - if ((i + 1) % 8 == 0) - printk("\n"); - } - printk("\n"); - } - /* * Inline data segments may not cross a 64 byte boundary. If * our UD header is bigger than the space available up to the @@ -1669,6 +2436,213 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, return 0; } +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = sqp->qp.ibqp.device; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + union ib_gid sgid; + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + int is_eth; + int is_vlan = 0; + int is_grh; + u16 vlan = 0; + int err = 0; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + if (is_eth) { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; + } else { + err = ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + } + + vlan = rdma_get_vlan_id(&sgid); + is_vlan = vlan < 0x1000; + } + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + sqp->ud_header.grh.source_gid.global.subnet_prefix = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + subnet_prefix; + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else + ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, + &sqp->ud_header.grh.source_gid); + } + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + u8 smac[6]; + struct in6_addr in6; + + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + + mlx->sched_prio = cpu_to_be16(pcp); + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + /* FIXME: cache smac value? */ + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + rdma_get_ll_mac(&in6, smac); + memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) { + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + } else { + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + pr_err("built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + pr_err(" [%02x] ", i * 4); + pr_cont(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + pr_cont("\n"); + } + pr_err("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) { unsigned cur; @@ -1757,14 +2731,70 @@ static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr, __be16 *vlan) + struct ib_send_wr *wr) { memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); - *vlan = dseg->vlan; +} + +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, enum ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8 *) &av->ib.port_pd) & 0x3; + + /* force loopback */ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & + cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + /* This function used only for sending on QP1 proxies */ + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + /* Use QKEY from the QP context, which is set by master */ + dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); +} + +static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header hdr; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); + hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof (hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof (hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16); } static void set_mlx_icrc_seg(void *dseg) @@ -1814,11 +2844,12 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, - __be32 *lso_hdr_sz, int *blh) + __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); - *blh = unlikely(halign > 64) ? 1 : 0; + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4))) @@ -1847,6 +2878,13 @@ static __be32 send_ieth(struct ib_send_wr *wr) } } +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} + static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, void *wqe, int *sz) { @@ -1923,7 +2961,8 @@ static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, * implementations may use move-string-buffer assembler instructions, * which do not guarantee order of copying. */ -static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, + unsigned bytecnt) { __iowrite64_copy(dst, src, bytecnt / 8); } @@ -1933,7 +2972,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, { struct mlx4_ib_qp *qp = to_mqp(ibqp); void *wqe; - struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl); struct mlx4_wqe_data_seg *dseg; unsigned long flags; int nreq; @@ -1945,29 +2984,24 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 dummy; __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; int i; - int blh = 0; - __be16 vlan = 0; int inl = 0; - - ctrl = NULL; spin_lock_irqsave(&qp->sq.lock, flags); ind = qp->sq_next_wqe; for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; + blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { - mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->sq.max_gs)) { - mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", - ibqp->qp_num, wr->num_sge); err = -EINVAL; *bad_wr = wr; goto out; @@ -1992,13 +3026,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wqe += sizeof *ctrl; size = sizeof *ctrl / 16; - switch (ibqp->qp_type) { - case IB_QPT_XRC: - ctrl->srcrb_flags |= - cpu_to_be32(wr->xrc_remote_srq_num << 8); - /* fall thru */ - case IB_QPT_RC: - case IB_QPT_UC: + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: @@ -2059,8 +3089,26 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_UD: - set_datagram_seg(wqe, wr, &vlan); + case MLX4_IB_QPT_TUN_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. */ + set_datagram_seg(wqe, wr); + /* set the forced-loopback bit in the data seg av */ + *(__be32 *) wqe |= cpu_to_be32(0x80000000); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: + set_datagram_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; @@ -2076,20 +3124,48 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_SMI: - case IB_QPT_GSI: - err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); + case MLX4_IB_QPT_PROXY_SMI_OWNER: + if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) { + err = -ENOSYS; + *bad_wr = wr; + goto out; + } + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; } wqe += seglen; size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_SMI: + /* don't allow QP0 sends on guests */ + err = -ENOSYS; + *bad_wr = wr; + goto out; + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. + * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; break; - case IB_QPT_RAW_ETY: - err = build_raw_ety_header(to_msqp(qp), wr, ctrl, - &seglen); + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; goto out; @@ -2108,13 +3184,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * cacheline. This avoids issues with WQE * prefetching. */ - dseg = wqe; dseg += wr->num_sge - 1; /* Add one more inline data segment for ICRC for MLX sends */ - if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI)) { + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { set_mlx_icrc_seg(dseg + 1); size += sizeof (struct mlx4_wqe_data_seg) / 16; } @@ -2127,7 +3204,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sz; } } else { - size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); + size += wr->num_sge * + (sizeof(struct mlx4_wqe_data_seg) / 16); for (i = wr->num_sge - 1; i >= 0; --i, --dseg) set_data_seg(dseg, wr->sg_list + i); } @@ -2139,15 +3217,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, */ wmb(); *lso_wqe = lso_hdr_sz; - ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size; - if (vlan) { - ctrl->ins_vlan = 1 << 6; - ctrl->vlan_tag = vlan; - } - /* * Make sure descriptor is fully written before * setting ownership bit (because HW can start @@ -2155,14 +3227,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, */ wmb(); - if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + if (wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + *bad_wr = wr; err = -EINVAL; goto out; } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | - (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | - (blh ? cpu_to_be32(1 << 6) : 0); + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); @@ -2185,6 +3257,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, out: if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) { ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8); + /* We set above doorbell_qpn bits to 0 as part of vlan + * tag initialization, so |= should be correct. + */ *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn; /* * Make sure that descriptor is written to memory @@ -2239,23 +3314,22 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int err = 0; int nreq; int ind; + int max_gs; int i; + max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { - mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_gs)) { - mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", - ibqp->qp_num, wr->num_sge); err = -EINVAL; *bad_wr = wr; goto out; @@ -2263,10 +3337,25 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, scat = get_recv_wqe(qp, ind); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); - if (i < qp->rq.max_gs) { + if (i < max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); scat[i].addr = 0; @@ -2334,10 +3423,10 @@ static int to_ib_qp_access_flags(int mlx4_flags) return ib_flags; } -static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr, - struct mlx4_qp_path *path) +static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) { - struct mlx4_dev *dev = ib_dev->dev; + struct mlx4_dev *dev = ibdev->dev; int is_eth; memset(ib_ah_attr, 0, sizeof *ib_ah_attr); @@ -2346,7 +3435,7 @@ static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_a if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) return; - is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) == + is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == IB_LINK_LAYER_ETHERNET; if (is_eth) ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | @@ -2355,7 +3444,6 @@ static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_a ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); - ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; @@ -2407,8 +3495,7 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context.params2)); - if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || - qp->ibqp.qp_type == IB_QPT_XRC) { + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; @@ -2463,308 +3550,21 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + + qp_init_attr->sq_sig_type = + qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + + qp_init_attr->qpg_type = ibqp->qpg_type; + if (ibqp->qpg_type == IB_QPG_PARENT) + qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz; + else + qp_init_attr->cap.qpg_tss_mask_sz = 0; + out: mutex_unlock(&qp->mutex); return err; } -int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr, - u32 *qp_num) -{ - struct mlx4_ib_dev *dev = to_mdev(init_attr->xrc_domain->device); - struct mlx4_ib_xrcd *xrcd = to_mxrcd(init_attr->xrc_domain); - struct mlx4_ib_qp *qp; - struct ib_qp *ibqp; - struct mlx4_ib_xrc_reg_entry *ctx_entry; - unsigned long flags; - int err; - - if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) - return -ENOSYS; - - if (init_attr->qp_type != IB_QPT_XRC) - return -EINVAL; - - ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL); - if (!ctx_entry) - return -ENOMEM; - - qp = kzalloc(sizeof *qp, GFP_KERNEL); - if (!qp) { - kfree(ctx_entry); - return -ENOMEM; - } - mutex_lock(&dev->xrc_reg_mutex); - qp->flags = MLX4_IB_XRC_RCV; - qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn; - INIT_LIST_HEAD(&qp->xrc_reg_list); - err = create_qp_common(dev, xrcd->pd, init_attr, NULL, 0, qp); - if (err) { - mutex_unlock(&dev->xrc_reg_mutex); - kfree(ctx_entry); - kfree(qp); - return err; - } - - ibqp = &qp->ibqp; - /* set the ibpq attributes which will be used by the mlx4 module */ - ibqp->qp_num = qp->mqp.qpn; - ibqp->device = init_attr->xrc_domain->device; - ibqp->pd = xrcd->pd; - ibqp->send_cq = ibqp->recv_cq = xrcd->cq; - ibqp->event_handler = init_attr->event_handler; - ibqp->qp_context = init_attr->qp_context; - ibqp->qp_type = init_attr->qp_type; - ibqp->xrcd = init_attr->xrc_domain; - - mutex_lock(&qp->mutex); - ctx_entry->context = init_attr->qp_context; - spin_lock_irqsave(&qp->xrc_reg_list_lock, flags); - list_add_tail(&ctx_entry->list, &qp->xrc_reg_list); - spin_unlock_irqrestore(&qp->xrc_reg_list_lock, flags); - mutex_unlock(&qp->mutex); - mutex_unlock(&dev->xrc_reg_mutex); - *qp_num = qp->mqp.qpn; - return 0; -} - -int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num, - struct ib_qp_attr *attr, int attr_mask) -{ - struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device); - struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd); - struct mlx4_qp *mqp; - struct mlx4_ib_qp *mibqp; - int err = -EINVAL; - - if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) - return -ENOSYS; - - mutex_lock(&dev->xrc_reg_mutex); - mqp = mlx4_qp_lookup_lock(dev->dev, qp_num); - if (unlikely(!mqp)) { - printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: " - "unknown QPN %06x\n", qp_num); - goto err_out; - } - - mibqp = to_mibqp(mqp); - - if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !mibqp->ibqp.xrcd || - xrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn) - goto err_out; - - err = mlx4_ib_modify_qp(&mibqp->ibqp, attr, attr_mask, NULL); - mutex_unlock(&dev->xrc_reg_mutex); - return err; - -err_out: - mutex_unlock(&dev->xrc_reg_mutex); - return err; -} - -int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num, - struct ib_qp_attr *qp_attr, int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) -{ - struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device); - struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd); - struct mlx4_ib_qp *qp; - struct mlx4_qp *mqp; - struct mlx4_qp_context context; - int mlx4_state; - int err = -EINVAL; - - if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) - return -ENOSYS; - - mutex_lock(&dev->xrc_reg_mutex); - mqp = mlx4_qp_lookup_lock(dev->dev, qp_num); - if (unlikely(!mqp)) { - printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: " - "unknown QPN %06x\n", qp_num); - goto err_out; - } - - qp = to_mibqp(mqp); - if (!(qp->flags & MLX4_IB_XRC_RCV) || !(qp->ibqp.xrcd) || - xrcd->xrcdn != to_mxrcd(qp->ibqp.xrcd)->xrcdn) - goto err_out; - - if (qp->state == IB_QPS_RESET) { - qp_attr->qp_state = IB_QPS_RESET; - goto done; - } - - err = mlx4_qp_query(dev->dev, mqp, &context); - if (err) - goto err_out; - - mlx4_state = be32_to_cpu(context.flags) >> 28; - - qp_attr->qp_state = to_ib_qp_state(mlx4_state); - qp_attr->path_mtu = context.mtu_msgmax >> 5; - qp_attr->path_mig_state = - to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); - qp_attr->qkey = be32_to_cpu(context.qkey); - qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; - qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; - qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; - qp_attr->qp_access_flags = - to_ib_qp_access_flags(be32_to_cpu(context.params2)); - - if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || - qp->ibqp.qp_type == IB_QPT_XRC) { - to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); - to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, - &context.alt_path); - qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; - qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; - } - - qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; - if (qp_attr->qp_state == IB_QPS_INIT) - qp_attr->port_num = qp->port; - else - qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; - - /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ - qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; - - qp_attr->max_rd_atomic = - 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); - - qp_attr->max_dest_rd_atomic = - 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); - qp_attr->min_rnr_timer = - (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; - qp_attr->timeout = context.pri_path.ackto >> 3; - qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; - qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; - qp_attr->alt_timeout = context.alt_path.ackto >> 3; - -done: - qp_attr->cur_qp_state = qp_attr->qp_state; - qp_attr->cap.max_recv_wr = 0; - qp_attr->cap.max_recv_sge = 0; - qp_attr->cap.max_send_wr = 0; - qp_attr->cap.max_send_sge = 0; - qp_attr->cap.max_inline_data = 0; - qp_init_attr->cap = qp_attr->cap; - - mutex_unlock(&dev->xrc_reg_mutex); - return 0; - -err_out: - mutex_unlock(&dev->xrc_reg_mutex); - return err; -} - -int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num) -{ - - struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd); - - struct mlx4_qp *mqp; - struct mlx4_ib_qp *mibqp; - struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp; - unsigned long flags; - int err = -EINVAL; - - mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex); - mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num); - if (unlikely(!mqp)) { - printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: " - "unknown QPN %06x\n", qp_num); - goto err_out; - } - - mibqp = to_mibqp(mqp); - - if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !(mibqp->ibqp.xrcd) || - mxrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn) - goto err_out; - - ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL); - if (!ctx_entry) { - err = -ENOMEM; - goto err_out; - } - - mutex_lock(&mibqp->mutex); - list_for_each_entry(tmp, &mibqp->xrc_reg_list, list) - if (tmp->context == context) { - mutex_unlock(&mibqp->mutex); - kfree(ctx_entry); - mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); - return 0; - } - - ctx_entry->context = context; - spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags); - list_add_tail(&ctx_entry->list, &mibqp->xrc_reg_list); - spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags); - mutex_unlock(&mibqp->mutex); - mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); - return 0; - -err_out: - mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); - return err; -} - -int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num) -{ - - struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd); - - struct mlx4_qp *mqp; - struct mlx4_ib_qp *mibqp; - struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp; - unsigned long flags; - int found = 0; - int err = -EINVAL; - - mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex); - mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num); - if (unlikely(!mqp)) { - printk(KERN_WARNING "mlx4_ib_unreg_xrc_rcv_qp: " - "unknown QPN %06x\n", qp_num); - goto err_out; - } - - mibqp = to_mibqp(mqp); - - if (!(mibqp->flags & MLX4_IB_XRC_RCV) || - mxrcd->xrcdn != (mibqp->xrcdn & 0xffff)) - goto err_out; - - mutex_lock(&mibqp->mutex); - spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags); - list_for_each_entry_safe(ctx_entry, tmp, &mibqp->xrc_reg_list, list) - if (ctx_entry->context == context) { - found = 1; - list_del(&ctx_entry->list); - spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags); - kfree(ctx_entry); - break; - } - - if (!found) - spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags); - mutex_unlock(&mibqp->mutex); - if (!found) - goto err_out; - - /* destroy the QP if the registration list is empty */ - if (list_empty(&mibqp->xrc_reg_list)) - mlx4_ib_destroy_qp(&mibqp->ibqp); - - mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); - return 0; - -err_out: - mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); - return err; -} - diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/srq.c b/sys/ofed/drivers/infiniband/hw/mlx4/srq.c index 90918c778d63..60c5fb025fc7 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/srq.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/srq.c @@ -33,6 +33,7 @@ #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -58,7 +59,7 @@ static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) event.event = IB_EVENT_SRQ_ERR; break; default: - printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + pr_warn("Unexpected event type %d " "on SRQ %06x\n", type, srq->srqn); return; } @@ -67,17 +68,16 @@ static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) } } -struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, - struct ib_cq *xrc_cq, - struct ib_xrcd *xrcd, - struct ib_srq_init_attr *init_attr, - struct ib_udata *udata) +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_srq *srq; struct mlx4_wqe_srq_next_seg *next; - u32 cqn; - u16 xrcdn; + struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; int desc_size; int buf_size; int err; @@ -85,14 +85,10 @@ struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, /* Sanity check SRQ size before proceeding */ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || - init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) { - mlx4_ib_dbg("a size param is out of range. " - "max_wr = 0x%x, max_sge = 0x%x", - init_attr->attr.max_wr, init_attr->attr.max_sge); + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) return ERR_PTR(-EINVAL); - } - srq = kzalloc(sizeof *srq, GFP_KERNEL); + srq = kmalloc(sizeof *srq, GFP_KERNEL); if (!srq) return ERR_PTR(-ENOMEM); @@ -138,8 +134,6 @@ struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - struct mlx4_wqe_data_seg *scatter; - err = mlx4_db_alloc(dev->dev, &srq->db, 0); if (err) goto err_srq; @@ -182,24 +176,24 @@ struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, } } - cqn = xrc_cq ? (u32) (to_mcq(xrc_cq)->mcq.cqn) : 0; - xrcdn = xrcd ? (u16) (to_mxrcd(xrcd)->xrcdn) : + cqn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : (u16) dev->dev->caps.reserved_xrcds; - err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, srq->db.dma, &srq->msrq); if (err) goto err_wrid; srq->msrq.event = mlx4_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; - if (pd->uobject) { + if (pd->uobject) if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { err = -EFAULT; goto err_wrid; } - } else - srq->ibsrq.xrc_srq_num = srq->msrq.srqn; init_attr->attr.max_wr = srq->msrq.max - 1; @@ -238,16 +232,12 @@ int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, int ret; /* We don't support resizing SRQs (yet?) */ - if (attr_mask & IB_SRQ_MAX_WR) { - mlx4_ib_dbg("resize not yet supported"); + if (attr_mask & IB_SRQ_MAX_WR) return -EINVAL; - } if (attr_mask & IB_SRQ_LIMIT) { - if (attr->srq_limit >= srq->msrq.max){ - mlx4_ib_dbg("limit (0x%x) too high", attr->srq_limit); + if (attr->srq_limit >= srq->msrq.max) return -EINVAL; - } mutex_lock(&srq->mutex); ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); @@ -260,13 +250,6 @@ int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, return 0; } -struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *init_attr, - struct ib_udata *udata) -{ - return mlx4_ib_create_xrc_srq(pd, NULL, NULL, init_attr, udata); -} - int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) { struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); @@ -289,18 +272,6 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq) { struct mlx4_ib_dev *dev = to_mdev(srq->device); struct mlx4_ib_srq *msrq = to_msrq(srq); - struct mlx4_ib_cq *cq; - - mlx4_srq_invalidate(dev->dev, &msrq->msrq); - - if (srq->xrc_cq && !srq->uobject) { - cq = to_mcq(srq->xrc_cq); - spin_lock_irq(&cq->lock); - __mlx4_ib_cq_clean(cq, -1, msrq); - mlx4_srq_remove(dev->dev, &msrq->msrq); - spin_unlock_irq(&cq->lock); - } else - mlx4_srq_remove(dev->dev, &msrq->msrq); mlx4_srq_free(dev->dev, &msrq->msrq); mlx4_mtt_cleanup(dev->dev, &msrq->mtt); @@ -349,16 +320,12 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { - mlx4_ib_dbg("srq num 0x%x: num s/g entries too large (%d)", - srq->msrq.srqn, wr->num_sge); err = -EINVAL; *bad_wr = wr; break; } if (unlikely(srq->head == srq->tail)) { - mlx4_ib_dbg("srq num 0x%x: No entries available to post.", - srq->msrq.srqn); err = -ENOMEM; *bad_wr = wr; break; diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c b/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 000000000000..f19525e8a448 --- /dev/null +++ b/sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,800 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include +#include + +#include +/*show_admin_alias_guid returns the administratively assigned value of that GUID. + * Values returned in buf parameter string: + * 0 - requests opensm to assign a value. + * ffffffffffffffff - delete this entry. + * other - value assigned by administrator. + */ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + + record_num = mlx4_ib_iov_dentry->entry_num / 8 ; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; + + return sprintf(buf, "%llx\n", (long long) + be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. + ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[8 * guid_index_in_rec])); +} + +/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. + * Values in buf parameter string: + * 0 - requests opensm to assign a value. + * 0xffffffffffffffff - delete this entry. + * other - guid value assigned by the administrator. + */ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + pr_err("GUID 0 block 0 is RO\n"); + return count; + } + sscanf(buf, "%llx", &sysadmin_ag_val); + *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * guid_index_in_rec] = + cpu_to_be64(sysadmin_ag_val); + + /* Change the state to be pending for update */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_SET; + + switch (sysadmin_ag_val) { + case MLX4_GUID_FOR_DELETE_VAL: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_DELETE; + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + /* The sysadmin requests the SM to re-assign */ + case MLX4_NOT_SET_GUID: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_DRIVER_ASSIGN; + break; + /* The sysadmin requests a specific value.*/ + default: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + } + + /* set the record index */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + + mlx4_ib_init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &gid, 1); + if (ret) + return ret; + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey, 1); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + sysfs_attr_init(&vdentry->dentry.attr); + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = _kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUSR; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + pr_err("failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + pr_err("failed to create %s\n", attr->name); + + return ret; +} + +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[10]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + + /* get the physical gid and pkey table sizes.*/ + ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; + /* Directory structure: + * iov - + * port num - + * admin_guids + * gids (operational) + * mcg_table + */ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /* admin GUIDs */ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids; + } + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_alias_parent; + } + + /* gids subdirectory (operational gids) */ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids_parent; + } + + /* physical port pkey table */ + port->pkeys_parent = + kobject_create_and_add("pkeys", kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys_parent; + } + + /* MCGs table */ + port->mcgs_parent = + kobject_create_and_add("mcgs", kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs; + } + return 0; + +err_mcgs: + kobject_put(port->cur_port); + +err_pkeys_parent: + kobject_put(port->pkeys_parent); + +err_pkeys: + kobject_put(port->cur_port); + +err_gids_parent: + kobject_put(port->gids_parent); + +err_gids: + kobject_put(port->cur_port); + +err_admin_alias_parent: + kobject_put(port->admin_alias_parent); + +err_admin_guids: + kobject_put(port->cur_port); + kobject_put(port->cur_port); /* once more for create_and_add buff */ + +kobj_create_err: + kobject_put(device->ports_parent); + kfree(port->dentr_ar); + +err: + pr_err("add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + char base_name[9]; + + /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ + strlcpy(name, pci_name(dev->dev->pdev), max); + strncpy(base_name, name, 8); /*till xxxx:yy:*/ + base_name[8] = '\0'; + /* with no ARI only 3 last bits are used so when the fn is higher than 8 + * need to add it to the dev num, so count in the last number will be + * modulo 8 */ + sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8)); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + u8 port_num; + int slave; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + kfree(p->pkey_group.attrs); + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + kfree(p->gid_group.attrs); + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, size); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == mlx4_master_func_num(p->dev->dev)) + return -EINVAL; + + if (!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1] + [tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, + tab_attr->index, idx); + err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + pr_err("mlx4_gen_pkey_eqe failed for slave %d," + " port %d, index %d\n", p->slave, p->port_num, idx); + return err; + } + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, + struct port_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", p->slave); +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof (struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + if (snprintf(element->name, sizeof (element->name), + "%d", i) >= sizeof (element->name)) { + kfree(element); + goto err; + } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + tab_attr[i] = &element->attr.attr; + } + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) == + IB_LINK_LAYER_ETHERNET; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + if (is_eth) + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, NULL, + dev->dev->caps.pkey_table_len[port_num]); + else + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) + goto err_alloc; + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); + if (!p->gid_group.attrs) + goto err_free_pkey; + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + kfree(p->gid_group.attrs[0]); + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + kfree(p->pkey_group.attrs); + +err_alloc: + kobject_put(dev->dev_ports_parent[slave]); + kfree(p); + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) +{ + char name[32]; + int err; + int port; + struct kobject *p, *t; + struct mlx4_port *mport; + + get_name(dev, name, slave, sizeof name); + + dev->pkeys.device_parent[slave] = + kobject_create_and_add(name, kobject_get(dev->iov_parent)); + + if (!dev->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]); + + dev->dev_ports_parent[slave] = + kobject_create_and_add("ports", + kobject_get(dev->pkeys.device_parent[slave])); + + if (!dev->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + err = add_port(dev, port, slave); + if (err) + goto err_add; + } + return 0; + +err_add: + list_for_each_entry_safe(p, t, + &dev->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + mport = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &mport->pkey_group); + sysfs_remove_group(p, &mport->gid_group); + kobject_put(p); + } + kobject_put(dev->dev_ports_parent[slave]); + +err_ports: + kobject_put(dev->pkeys.device_parent[slave]); + /* extra put for the device_parent create_and_add */ + kobject_put(dev->pkeys.device_parent[slave]); + +fail_dev: + kobject_put(dev->iov_parent); + return err; +} + +static int register_pkey_tree(struct mlx4_ib_dev *device) +{ + int i; + + if (!mlx4_is_master(device->dev)) + return 0; + + for (i = 0; i <= device->dev->num_vfs; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!mlx4_is_master(device->dev)) + return; + + for (slave = device->dev->num_vfs; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, + &device->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) +{ + int i; + int ret = 0; + + if (!mlx4_is_master(dev->dev)) + return 0; + + dev->iov_parent = + kobject_create_and_add("iov", + kobject_get(dev->ib_dev.ports_parent->parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err; + } + dev->ports_parent = + kobject_create_and_add("ports", + kobject_get(dev->iov_parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err_ports; + } + + for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(dev, i); + if (ret) + goto err_add_entries; + } + + ret = register_pkey_tree(dev); + if (ret) + goto err_add_entries; + return 0; + +err_add_entries: + kobject_put(dev->ports_parent); + +err_ports: + kobject_put(dev->iov_parent); +err: + kobject_put(dev->ib_dev.ports_parent->parent); + pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); + return ret; +} + +static void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!mlx4_is_master(device->dev)) + return; + + for (i = 0; i < device->dev->caps.num_ports; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); + unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/user.h b/sys/ofed/drivers/infiniband/hw/mlx4/user.h index 13beedeeef9f..07e6769ef43b 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/user.h +++ b/sys/ofed/drivers/infiniband/hw/mlx4/user.h @@ -40,7 +40,9 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. */ -#define MLX4_IB_UVERBS_ABI_VERSION 3 + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 /* * Make sure that all structs defined in this file remain laid out so @@ -50,12 +52,20 @@ * instead. */ -struct mlx4_ib_alloc_ucontext_resp { +struct mlx4_ib_alloc_ucontext_resp_v3 { __u32 qp_tab_size; __u16 bf_reg_size; __u16 bf_regs_per_page; }; +struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + struct mlx4_ib_alloc_pd_resp { __u32 pdn; __u32 reserved; diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/wc.c b/sys/ofed/drivers/infiniband/hw/mlx4/wc.c index 827de14a068c..c73a61c2ece3 100644 --- a/sys/ofed/drivers/infiniband/hw/mlx4/wc.c +++ b/sys/ofed/drivers/infiniband/hw/mlx4/wc.c @@ -71,4 +71,3 @@ int mlx4_wc_enabled(void) } #endif - diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c index 81e28387fac4..f9d187288476 100644 --- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1808,7 +1808,7 @@ int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn, case IB_QPT_RAW_IPV6: op_mod = 2; break; - case IB_QPT_RAW_ETY: + case IB_QPT_RAW_ETHERTYPE: op_mod = 3; break; default: diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c index 5401364dd2f5..10f7fd3d9b14 100644 --- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c +++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c @@ -1325,7 +1325,7 @@ static void __init mthca_validate_profile(void) if (log_mtts_per_seg == 0) log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { - printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %ld\n", + printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n", log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); } diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c index 783da4bec5d0..ab139bf63219 100644 --- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -448,6 +448,7 @@ static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int pag page * MTHCA_ICM_PAGE_SIZE; } + #include #include #include diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c index e54773976d6e..eaec3e6b928e 100644 --- a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1006,7 +1006,7 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, } static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *udata) + u64 virt, int acc, struct ib_udata *udata, int mr_id) { struct mthca_dev *dev = to_mdev(pd->device); struct ib_umem_chunk *chunk; @@ -1402,7 +1402,7 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); - ret = ib_register_device(&dev->ib_dev); + ret = ib_register_device(&dev->ib_dev, NULL); if (ret) return ret; diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h index 5740eb0677ee..768833de6dd8 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h @@ -109,7 +109,8 @@ enum { IPOIB_ENCAP_LEN = 4, IPOIB_HEADER_LEN = IPOIB_ENCAP_LEN + INFINIBAND_ALEN, IPOIB_UD_MAX_MTU = 4 * 1024, - IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE), +// IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE), + IPOIB_UD_RX_SG = 2, IPOIB_UD_TX_SG = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_MAX_MTU = (64 * 1024), IPOIB_CM_TX_SG = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2, diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9081e135042b..bae1740ae7bd 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1539,3 +1539,20 @@ ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); + +#undef MODULE_VERSION +#include +static int +ipoib_evhand(module_t mod, int event, void *arg) +{ + return (0); +} + +static moduledata_t ipoib_mod = { + .name = "ipoib", + .evhand = ipoib_evhand, +}; + +DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_SMP, SI_ORDER_ANY); +MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); + diff --git a/sys/ofed/drivers/net/mlx4/Makefile b/sys/ofed/drivers/net/mlx4/Makefile index b9d2e7eaa68c..bac8eb31fd14 100644 --- a/sys/ofed/drivers/net/mlx4/Makefile +++ b/sys/ofed/drivers/net/mlx4/Makefile @@ -1,9 +1,34 @@ -obj-$(CONFIG_MLX4_CORE) += mlx4_core.o +# $FreeBSD$ +#.PATH: ${.CURDIR}/../../ofed/drivers/net/mlx4:${.CURDIR}/../../ofed/include/linux +.PATH: ${.CURDIR}/../../../../../include/linux -mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \ - mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o xrcd.o +.include -obj-$(CONFIG_MLX4_EN) += mlx4_en.o -mlx4_en-y := en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \ - en_resources.o en_netdev.o en_frag.o en_selftest.o +KMOD = mlx4 +SRCS = device_if.h bus_if.h pci_if.h vnode_if.h +SRCS+= alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c linux_compat.c linux_radix.c +SRCS+= pd.c port.c profile.c qp.c reset.c sense.c srq.c resource_tracker.c sys_tune.c +SRCS+= opt_inet.h opt_inet6.h + + +#CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4 +#CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -I${.CURDIR}/../../../../../include + +.if !defined(KERNBUILDDIR) +.if ${MK_INET_SUPPORT} != "no" +opt_inet.h: + @echo "#define INET 1" > ${.TARGET} +.endif + +.if ${MK_INET6_SUPPORT} != "no" +opt_inet6.h: + @echo "#define INET6 1" > ${.TARGET} +.endif +.endif + +.include + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions + diff --git a/sys/ofed/drivers/net/mlx4/alloc.c b/sys/ofed/drivers/net/mlx4/alloc.c index c22791a4d981..38f3cafaf321 100644 --- a/sys/ofed/drivers/net/mlx4/alloc.c +++ b/sys/ofed/drivers/net/mlx4/alloc.c @@ -34,6 +34,7 @@ #include #include #include +//#include /* XXX SK probabaly not needed in freeBSD XXX */ #include #include #include @@ -77,14 +78,15 @@ void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj) static unsigned long find_aligned_range(unsigned long *bitmap, u32 start, u32 nbits, - int len, int align) + int len, int align, u32 skip_mask) { unsigned long end, i; again: start = ALIGN(start, align); - while ((start < nbits) && test_bit(start, bitmap)) + while ((start < nbits) && (test_bit(start, bitmap) || + (start & skip_mask))) start += align; if (start >= nbits) @@ -95,7 +97,7 @@ static unsigned long find_aligned_range(unsigned long *bitmap, return -1; for (i = start + 1; i < end; i++) { - if (test_bit(i, bitmap)) { + if (test_bit(i, bitmap) || ((u32)i & skip_mask)) { start = i + 1; goto again; } @@ -104,27 +106,27 @@ static unsigned long find_aligned_range(unsigned long *bitmap, return start; } -u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align) +u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, + int align, u32 skip_mask) { - u32 obj, i; + u32 obj; - if (likely(cnt == 1 && align == 1)) + if (likely(cnt == 1 && align == 1 && !skip_mask)) return mlx4_bitmap_alloc(bitmap); spin_lock(&bitmap->lock); obj = find_aligned_range(bitmap->table, bitmap->last, - bitmap->max, cnt, align); + bitmap->max, cnt, align, skip_mask); if (obj >= bitmap->max) { bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) & bitmap->mask; obj = find_aligned_range(bitmap->table, 0, bitmap->max, - cnt, align); + cnt, align, skip_mask); } if (obj < bitmap->max) { - for (i = 0; i < cnt; i++) - set_bit(obj + i, bitmap->table); + bitmap_set(bitmap->table, obj, cnt); if (obj == bitmap->last) { bitmap->last = (obj + cnt); if (bitmap->last >= bitmap->max) @@ -149,16 +151,10 @@ u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap) void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt) { - u32 i; - obj &= bitmap->max + bitmap->reserved_top - 1; spin_lock(&bitmap->lock); - for (i = 0; i < cnt; i++) - clear_bit(obj + i, bitmap->table); - bitmap->last = min(bitmap->last, obj); - bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) - & bitmap->mask; + bitmap_clear(bitmap->table, obj, cnt); bitmap->avail += cnt; spin_unlock(&bitmap->lock); } @@ -166,12 +162,17 @@ void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt) int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved_bot, u32 reserved_top) { - int i; + /* sanity check */ + if (num <= (u64)reserved_top + reserved_bot) + return -EINVAL; /* num must be a power of 2 */ if (num != roundup_pow_of_two(num)) return -EINVAL; + if (reserved_bot + reserved_top >= num) + return -EINVAL; + bitmap->last = 0; bitmap->top = 0; bitmap->max = num - reserved_top; @@ -184,8 +185,7 @@ int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, if (!bitmap->table) return -ENOMEM; - for (i = 0; i < reserved_bot; ++i) - set_bit(i, bitmap->table); + bitmap_set(bitmap->table, 0, reserved_bot); return 0; } @@ -207,7 +207,6 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, { dma_addr_t t; - buf->direct.buf = NULL; if (size <= max_direct) { buf->nbufs = 1; buf->npages = 1; @@ -229,11 +228,10 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, int i; buf->direct.buf = NULL; - buf->direct.map = 0; buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; - buf->page_list = kzalloc(buf->nbufs * sizeof *buf->page_list, + buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), GFP_KERNEL); if (!buf->page_list) return -ENOMEM; @@ -291,7 +289,6 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) buf->page_list[i].map); kfree(buf->page_list); } - buf->direct.buf = NULL; } EXPORT_SYMBOL_GPL(mlx4_buf_free); diff --git a/sys/ofed/drivers/net/mlx4/catas.c b/sys/ofed/drivers/net/mlx4/catas.c index 334aad927501..185129a4b1d7 100644 --- a/sys/ofed/drivers/net/mlx4/catas.c +++ b/sys/ofed/drivers/net/mlx4/catas.c @@ -32,10 +32,12 @@ */ #include +#include #include "mlx4.h" -#define MLX4_CATAS_POLL_INTERVAL (5 * HZ) +#define MLX4_CATAS_POLL_INTERVAL (5 * HZ) + static DEFINE_SPINLOCK(catas_lock); @@ -45,7 +47,8 @@ static struct work_struct catas_work; static int internal_err_reset = 1; module_param(internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, - "Reset device on internal errors if non-zero (default 1)"); + "Reset device on internal errors if non-zero" + " (default 1, in SRIOV mode default is 0)"); static void dump_err_buf(struct mlx4_dev *dev) { @@ -65,16 +68,21 @@ static void poll_catas(unsigned long dev_ptr) struct mlx4_priv *priv = mlx4_priv(dev); if (readl(priv->catas_err.map)) { - dump_err_buf(dev); + /* If the device is off-line, we cannot try to recover it */ + if (pci_channel_offline(dev->pdev)) + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + else { + dump_err_buf(dev); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + if (internal_err_reset) { + spin_lock(&catas_lock); + list_add(&priv->catas_err.list, &catas_list); + spin_unlock(&catas_lock); - if (internal_err_reset) { - spin_lock(&catas_lock); - list_add(&priv->catas_err.list, &catas_list); - spin_unlock(&catas_lock); - - queue_work(mlx4_wq, &catas_work); + queue_work(mlx4_wq, &catas_work); + } } } else mod_timer(&priv->catas_err.timer, @@ -89,9 +97,6 @@ static void catas_reset(struct work_struct *work) LIST_HEAD(tlist); int ret; - if (!mutex_trylock(&drv_mutex)) - return; - spin_lock_irq(&catas_lock); list_splice_init(&catas_list, &tlist); spin_unlock_irq(&catas_lock); @@ -99,23 +104,30 @@ static void catas_reset(struct work_struct *work) list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { struct pci_dev *pdev = priv->dev.pdev; + /* If the device is off-line, we cannot reset it */ + if (pci_channel_offline(pdev)) + continue; + ret = mlx4_restart_one(priv->dev.pdev); /* 'priv' now is not valid */ if (ret) - printk(KERN_ERR "mlx4 %s: Reset failed (%d)\n", - pci_name(pdev), ret); + pr_err("mlx4 %s: Reset failed (%d)\n", + pci_name(pdev), ret); else { dev = pci_get_drvdata(pdev); mlx4_dbg(dev, "Reset succeeded\n"); } } - mutex_unlock(&drv_mutex); } void mlx4_start_catas_poll(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - unsigned long addr; + phys_addr_t addr; + + /*If we are in SRIOV the default of the module param must be 0*/ + if (mlx4_is_mfunc(dev)) + internal_err_reset = 0; INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); @@ -126,8 +138,8 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); if (!priv->catas_err.map) { - mlx4_warn(dev, "Failed to map internal error buffer at 0x%lx\n", - addr); + mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", + (unsigned long long) addr); return; } diff --git a/sys/ofed/drivers/net/mlx4/cmd.c b/sys/ofed/drivers/net/mlx4/cmd.c index bc4a618cb00f..5c78cdc9a292 100644 --- a/sys/ofed/drivers/net/mlx4/cmd.c +++ b/sys/ofed/drivers/net/mlx4/cmd.c @@ -33,16 +33,24 @@ */ #include +#include #include #include #include +#include +#include #include #include "mlx4.h" +#include "fw.h" #define CMD_POLL_TOKEN 0xffff +#define INBOX_MASK 0xffffffffffffff00ULL + +#define CMD_CHAN_VER 1 +#define CMD_CHAN_IF_REV 1 enum { /* command completed successfully: */ @@ -111,6 +119,9 @@ struct mlx4_cmd_context { u8 fw_status; }; +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr_cmd *in_vhcr); + static int mlx4_status_to_errno(u8 status) { static const int trans_table[] = { @@ -141,9 +152,157 @@ static int mlx4_status_to_errno(u8 status) return trans_table[status]; } +static u8 mlx4_errno_to_status(int errno) +{ + switch (errno) { + case -EPERM: + return CMD_STAT_BAD_OP; + case -EINVAL: + return CMD_STAT_BAD_PARAM; + case -ENXIO: + return CMD_STAT_BAD_SYS_STATE; + case -EBUSY: + return CMD_STAT_RESOURCE_BUSY; + case -ENOMEM: + return CMD_STAT_EXCEED_LIM; + case -ENFILE: + return CMD_STAT_ICM_ERROR; + default: + return CMD_STAT_INTERNAL_ERR; + } +} + +static int comm_pending(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 status = readl(&priv->mfunc.comm->slave_read); + + return (swab32(status) >> 31) != priv->cmd.comm_toggle; +} + +static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 val; + + priv->cmd.comm_toggle ^= 1; + val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31); + __raw_writel((__force u32) cpu_to_be32(val), + &priv->mfunc.comm->slave_write); + mmiowb(); +} + +static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long end; + int err = 0; + int ret_from_pending = 0; + + /* First, verify that the master reports correct status */ + if (comm_pending(dev)) { + mlx4_warn(dev, "Communication channel is not idle." + "my toggle is %d (cmd:0x%x)\n", + priv->cmd.comm_toggle, cmd); + return -EAGAIN; + } + + /* Write command */ + down(&priv->cmd.poll_sem); + mlx4_comm_cmd_post(dev, cmd, param); + + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + ret_from_pending = comm_pending(dev); + if (ret_from_pending) { + /* check if the slave is trying to boot in the middle of + * FLR process. The only non-zero result in the RESET command + * is MLX4_DELAY_RESET_SLAVE*/ + if ((MLX4_COMM_CMD_RESET == cmd)) { + mlx4_warn(dev, "Got slave FLRed from Communication" + " channel (ret:0x%x)\n", ret_from_pending); + err = MLX4_DELAY_RESET_SLAVE; + } else { + mlx4_warn(dev, "Communication channel timed out\n"); + err = -ETIMEDOUT; + } + } + + up(&priv->cmd.poll_sem); + return err; +} + +static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, + u16 param, unsigned long timeout) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_context *context; + unsigned long end; + int err = 0; + + down(&cmd->event_sem); + + spin_lock(&cmd->context_lock); + BUG_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + init_completion(&context->done); + + mlx4_comm_cmd_post(dev, op, param); + + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "communication channel command 0x%x timed out\n", op); + err = -EBUSY; + goto out; + } + + err = context->result; + if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + goto out; + } + +out: + /* wait for comm channel ready + * this is necessary for prevention the race + * when switching between event to polling mode + */ + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + up(&cmd->event_sem); + return err; +} + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout) +{ + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_comm_cmd_wait(dev, cmd, param, timeout); + return mlx4_comm_cmd_poll(dev, cmd, param, timeout); +} + static int cmd_pending(struct mlx4_dev *dev) { - u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); + u32 status; + + if (pci_channel_offline(dev->pdev)) + return -EIO; + + status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); return (status & swab32(1 << HCR_GO_BIT)) || (mlx4_priv(dev)->cmd.toggle == @@ -161,13 +320,33 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, mutex_lock(&cmd->hcr_mutex); + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + ret = -EIO; + goto out; + } + end = jiffies; if (event) end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { - if (time_after_eq(jiffies, end)) + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + ret = -EIO; goto out; + } + + if (time_after_eq(jiffies, end)) { + mlx4_err(dev, "%s:cmd_pending failed\n", __func__); + goto out; + } cond_resched(); } @@ -191,7 +370,7 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, (cmd->toggle << HCR_T_BIT) | (event ? (1 << HCR_E_BIT) : 0) | (op_modifier << HCR_OPMOD_SHIFT) | - op), hcr + 6); + op), hcr + 6); /* * Make sure that our HCR writes don't get mixed in with @@ -208,6 +387,65 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, return ret; } +static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr; + int ret; + + mutex_lock(&priv->cmd.slave_cmd_mutex); + + vhcr->in_param = cpu_to_be64(in_param); + vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0; + vhcr->in_modifier = cpu_to_be32(in_modifier); + vhcr->opcode = cpu_to_be16((((u16) op_modifier) << 12) | (op & 0xfff)); + vhcr->token = cpu_to_be16(CMD_POLL_TOKEN); + vhcr->status = 0; + vhcr->flags = !!(priv->cmd.use_events) << 6; + + if (mlx4_is_master(dev)) { + ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = + be64_to_cpu(vhcr->out_param); + else { + mlx4_err(dev, "response expected while" + "output mailbox is NULL for " + "command 0x%x\n", op); + vhcr->status = CMD_STAT_BAD_PARAM; + } + } + ret = mlx4_status_to_errno(vhcr->status); + } + } else { + ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, + MLX4_COMM_TIME + timeout); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = + be64_to_cpu(vhcr->out_param); + else { + mlx4_err(dev, "response expected while" + "output mailbox is NULL for " + "command 0x%x\n", op); + vhcr->status = CMD_STAT_BAD_PARAM; + } + } + ret = mlx4_status_to_errno(vhcr->status); + } else + mlx4_err(dev, "failed execution of VHCR_POST command" + "opcode 0x%x\n", op); + } + + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return ret; +} + static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout) @@ -220,16 +458,36 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = -EIO; + goto out; + } + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); if (err) goto out; end = msecs_to_jiffies(timeout) + jiffies; - while (cmd_pending(dev) && time_before(jiffies, end)) + while (cmd_pending(dev) && time_before(jiffies, end)) { + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = -EIO; + goto out; + } + cond_resched(); + } if (cmd_pending(dev)) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", op); err = -ETIMEDOUT; goto out; } @@ -240,13 +498,12 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 | (u64) be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4)); - stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; + stat = be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; err = mlx4_status_to_errno(stat); - if (err) { - if (op != MLX4_CMD_SET_NODE || stat != CMD_STAT_BAD_OP) - mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", - op, stat); - } + if (err) + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, stat); out: up(&priv->cmd.poll_sem); @@ -270,6 +527,19 @@ void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param) complete(&context->done); } +static int get_status(struct mlx4_dev *dev, u32 *status, int *go_bit, + int *t_bit) +{ + if (pci_channel_offline(dev->pdev)) + return -EIO; + + *status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); + *t_bit = !!(*status & swab32(1 << HCR_T_BIT)); + *go_bit = !!(*status & swab32(1 << HCR_GO_BIT)); + + return 0; +} + static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout) @@ -277,6 +547,8 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; struct mlx4_cmd_context *context; int err = 0; + int go_bit = 0, t_bit = 0, stat_err; + u32 status = 0; down(&cmd->event_sem); @@ -289,19 +561,29 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, init_completion(&context->done); - mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, - in_modifier, op_modifier, op, context->token, 1); + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, context->token, 1); + if (err) { + mlx4_warn(dev, "command 0x%x could not be posted (%d)\n", + op, err); + goto out; + } - if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + stat_err = get_status(dev, &status, &go_bit, &t_bit); + mlx4_warn(dev, "command 0x%x timed out: " + "get_status err=%d, status=0x%x, go_bit=%d, " + "t_bit=%d, toggle=0x%x\n", op, stat_err, status, + go_bit, t_bit, mlx4_priv(dev)->cmd.toggle); err = -EBUSY; goto out; } err = context->result; if (err) { - if (op != MLX4_CMD_SET_NODE || context->fw_status != CMD_STAT_BAD_OP) - mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", - op, context->fw_status); + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); goto out; } @@ -320,42 +602,1397 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, - u16 op, unsigned long timeout) + u16 op, unsigned long timeout, int native) { - if (mlx4_priv(dev)->cmd.use_events && !cold) - return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, - in_modifier, op_modifier, op, timeout); - else - return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm, - in_modifier, op_modifier, op, timeout); + if (pci_channel_offline(dev->pdev)) + return -EIO; + + if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_cmd_wait(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + else + return mlx4_cmd_poll(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + } + return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm, + in_modifier, op_modifier, op, timeout); } EXPORT_SYMBOL_GPL(__mlx4_cmd); + +static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, + int slave, u64 slave_addr, + int size, int is_read) +{ + u64 in_param; + u64 out_param; + + if ((slave_addr & 0xfff) | (master_addr & 0xfff) | + (slave & ~0x7f) | (size & 0xff)) { + mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx " + "master_addr:0x%llx slave_id:%d size:%d\n", + slave_addr, master_addr, slave, size); + return -EINVAL; + } + + if (is_read) { + in_param = (u64) slave | slave_addr; + out_param = (u64) dev->caps.function | master_addr; + } else { + in_param = (u64) dev->caps.function | master_addr; + out_param = (u64) slave | slave_addr; + } + + return mlx4_cmd_imm(dev, in_param, &out_param, size, 0, + MLX4_CMD_ACCESS_MEM, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + struct ib_smp *in_mad = (struct ib_smp *)(inbox->buf); + struct ib_smp *out_mad = (struct ib_smp *)(outbox->buf); + int err; + int i; + + if (index & 0x1f) + return -EINVAL; + + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + + for (i = 0; i < 32; ++i) + pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]); + + return err; +} + +static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + int i; + int err; + + for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) { + err = query_pkey_block(dev, port, i, table + i, inbox, outbox); + if (err) + return err; + } + + return 0; +} +#define PORT_CAPABILITY_LOCATION_IN_SMP 20 +#define PORT_STATE_OFFSET 32 + +static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf) +{ + if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP) + return IB_PORT_ACTIVE; + else + return IB_PORT_DOWN; +} + +static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct ib_smp *smp = inbox->buf; + u32 index; + u8 port; + u16 *table; + int err; + int vidx, pidx; + struct mlx4_priv *priv = mlx4_priv(dev); + struct ib_smp *outsmp = outbox->buf; + __be16 *outtab = (__be16 *)(outsmp->data); + __be32 slave_cap_mask; + __be64 slave_node_guid; + port = vhcr->in_modifier; + + if (smp->base_version == 1 && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->class_version == 1) { + if (smp->method == IB_MGMT_METHOD_GET) { + if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { + index = be32_to_cpu(smp->attr_mod); + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + table = kcalloc(dev->caps.pkey_table_len[port], sizeof *table, GFP_KERNEL); + if (!table) + return -ENOMEM; + /* need to get the full pkey table because the paravirtualized + * pkeys may be scattered among several pkey blocks. + */ + err = get_full_pkey_table(dev, port, table, inbox, outbox); + if (!err) { + for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) { + pidx = priv->virt2phys_pkey[slave][port - 1][vidx]; + outtab[vidx % 32] = cpu_to_be16(table[pidx]); + } + } + kfree(table); + return err; + } + if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) { + /*get the slave specific caps:*/ + /*do the command */ + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + /* modify the response for slaves */ + if (!err && slave != mlx4_master_func_num(dev)) { + u8 *state = outsmp->data + PORT_STATE_OFFSET; + + *state = (*state & 0xf0) | vf_port_state(dev, port, slave); + slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4); + } + return err; + } + if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) { + /* compute slave's gid block */ + smp->attr_mod = cpu_to_be32(slave / 8); + /* execute cmd */ + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (!err) { + /* if needed, move slave gid to index 0 */ + if (slave % 8) + memcpy(outsmp->data, + outsmp->data + (slave % 8) * 8, 8); + /* delete all other gids */ + memset(outsmp->data + 8, 0, 56); + } + return err; + } + if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (!err) { + slave_node_guid = mlx4_get_slave_node_guid(dev, slave); + memcpy(outsmp->data + 12, &slave_node_guid, 8); + } + return err; + } + } + } + if (slave != mlx4_master_func_num(dev) && + ((smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) || + (smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->method == IB_MGMT_METHOD_SET))) { + mlx4_err(dev, "slave %d is trying to execute a Subnet MGMT MAD, " + "class 0x%x, method 0x%x for attr 0x%x. Rejecting\n", + slave, smp->method, smp->mgmt_class, + be16_to_cpu(smp->attr_id)); + return -EPERM; + } + /*default:*/ + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); +} + +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 in_param; + u64 out_param; + int err; + + in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param; + out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param; + if (cmd->encode_slave_id) { + in_param &= 0xffffffffffffff00ll; + in_param |= slave; + } + + err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm, + vhcr->in_modifier, vhcr->op_modifier, vhcr->op, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + if (cmd->out_is_imm) + vhcr->out_param = out_param; + + return err; +} + +static struct mlx4_cmd_info cmd_info[] = { + { + .opcode = MLX4_CMD_QUERY_FW, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_FW_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_HCA, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_QUERY_DEV_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_DEV_CAP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_FUNC_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_FUNC_CAP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_ADAPTER, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_INIT_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT_PORT_wrapper + }, + { + .opcode = MLX4_CMD_CLOSE_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CLOSE_PORT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_PORT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_PORT_wrapper + }, + { + .opcode = MLX4_CMD_SET_PORT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_PORT_wrapper + }, + { + .opcode = MLX4_CMD_MAP_EQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAP_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_EQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_HW_HEALTH_CHECK, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_NOP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_ALLOC_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ALLOC_RES_wrapper + }, + { + .opcode = MLX4_CMD_FREE_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_FREE_RES_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_MPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_MPT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_MPT_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_MPT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_READ_MTT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_WRITE_MTT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_WRITE_MTT_wrapper + }, + { + .opcode = MLX4_CMD_SYNC_TPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_HW2SW_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_HW2SW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_QUERY_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_CQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_CQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_MODIFY_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MODIFY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_SRQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_SRQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_ARM_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ARM_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_RST2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_RST2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2RTR_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2RTR_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTS2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQERR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQERR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2ERR_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2SQD_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2SQD_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2SQD_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2RST_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_2RST_QP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_QP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_SUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_UNSUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_CONF_SPECIAL_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, /* XXX verify: only demux can do this */ + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_MAD_IFC, + .has_inbox = true, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAD_IFC_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_IF_STAT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_IF_STAT_wrapper + }, + /* Native multicast commands are not available for guests */ + { + .opcode = MLX4_CMD_QP_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_ATTACH_wrapper + }, + { + .opcode = MLX4_CMD_PROMISC, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_PROMISC_wrapper + }, + /* Ethernet specific commands */ + { + .opcode = MLX4_CMD_SET_VLAN_FLTR, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_VLAN_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_SET_MCAST_FLTR, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_MCAST_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_DUMP_ETH_STATS, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_DUMP_ETH_STATS_wrapper + }, + { + .opcode = MLX4_CMD_INFORM_FLR_DONE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + /* flow steering commands */ + { + .opcode = MLX4_QP_FLOW_STEERING_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper + }, + { + .opcode = MLX4_QP_FLOW_STEERING_DETACH, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper + }, +}; + +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr_cmd *in_vhcr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_info *cmd = NULL; + struct mlx4_vhcr_cmd *vhcr_cmd = in_vhcr ? in_vhcr : priv->mfunc.vhcr; + struct mlx4_vhcr *vhcr; + struct mlx4_cmd_mailbox *inbox = NULL; + struct mlx4_cmd_mailbox *outbox = NULL; + u64 in_param; + u64 out_param; + int ret = 0; + int i; + int err = 0; + + /* Create sw representation of Virtual HCR */ + vhcr = kzalloc(sizeof(struct mlx4_vhcr), GFP_KERNEL); + if (!vhcr) + return -ENOMEM; + + /* DMA in the vHCR */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr_cmd), + MLX4_ACCESS_MEM_ALIGN), 1); + if (ret) { + mlx4_err(dev, "%s:Failed reading vhcr" + "ret: 0x%x\n", __func__, ret); + kfree(vhcr); + return ret; + } + } + + /* Fill SW VHCR fields */ + vhcr->in_param = be64_to_cpu(vhcr_cmd->in_param); + vhcr->out_param = be64_to_cpu(vhcr_cmd->out_param); + vhcr->in_modifier = be32_to_cpu(vhcr_cmd->in_modifier); + vhcr->token = be16_to_cpu(vhcr_cmd->token); + vhcr->op = be16_to_cpu(vhcr_cmd->opcode) & 0xfff; + vhcr->op_modifier = (u8) (be16_to_cpu(vhcr_cmd->opcode) >> 12); + vhcr->e_bit = vhcr_cmd->flags & (1 << 6); + + /* Lookup command */ + for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) { + if (vhcr->op == cmd_info[i].opcode) { + cmd = &cmd_info[i]; + break; + } + } + if (!cmd) { + mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n", + vhcr->op, slave); + vhcr_cmd->status = CMD_STAT_BAD_PARAM; + goto out_status; + } + + /* Read inbox */ + if (cmd->has_inbox) { + vhcr->in_param &= INBOX_MASK; + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) { + vhcr_cmd->status = CMD_STAT_BAD_SIZE; + inbox = NULL; + goto out_status; + } + + if (mlx4_ACCESS_MEM(dev, inbox->dma, slave, + vhcr->in_param, + MLX4_MAILBOX_SIZE, 1)) { + mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n", + __func__, cmd->opcode); + vhcr_cmd->status = CMD_STAT_INTERNAL_ERR; + goto out_status; + } + } + + /* Apply permission and bound checks if applicable */ + if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) { + mlx4_warn(dev, "Command:0x%x from slave: %d failed protection " + "checks for resource_id:%d\n", vhcr->op, slave, + vhcr->in_modifier); + vhcr_cmd->status = CMD_STAT_BAD_OP; + goto out_status; + } + + /* Allocate outbox */ + if (cmd->has_outbox) { + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + vhcr_cmd->status = CMD_STAT_BAD_SIZE; + outbox = NULL; + goto out_status; + } + } + + /* Execute the command! */ + if (cmd->wrapper) { + err = cmd->wrapper(dev, slave, vhcr, inbox, outbox, + cmd); + if (cmd->out_is_imm) + vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param); + } else { + in_param = cmd->has_inbox ? (u64) inbox->dma : + vhcr->in_param; + out_param = cmd->has_outbox ? (u64) outbox->dma : + vhcr->out_param; + err = __mlx4_cmd(dev, in_param, &out_param, + cmd->out_is_imm, vhcr->in_modifier, + vhcr->op_modifier, vhcr->op, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (cmd->out_is_imm) { + vhcr->out_param = out_param; + vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param); + } + } + + if (err) { + mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with" + " error:%d, status %d\n", + vhcr->op, slave, vhcr->errno, err); + vhcr_cmd->status = mlx4_errno_to_status(err); + goto out_status; + } + + + /* Write outbox if command completed successfully */ + if (cmd->has_outbox && !vhcr_cmd->status) { + ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave, + vhcr->out_param, + MLX4_MAILBOX_SIZE, MLX4_CMD_WRAPPED); + if (ret) { + /* If we failed to write back the outbox after the + *command was successfully executed, we must fail this + * slave, as it is now in undefined state */ + mlx4_err(dev, "%s:Failed writing outbox\n", __func__); + goto out; + } + } + +out_status: + /* DMA back vhcr result */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr), + MLX4_ACCESS_MEM_ALIGN), + MLX4_CMD_WRAPPED); + if (ret) + mlx4_err(dev, "%s:Failed writing vhcr result\n", + __func__); + else if (vhcr->e_bit && + mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe)) + mlx4_warn(dev, "Failed to generate command completion " + "eqe for slave %d\n", slave); + } + +out: + kfree(vhcr); + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return ret; +} + +static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) +{ + int port, err; + struct mlx4_vport_state *vp_admin; + struct mlx4_vport_oper_state *vp_oper; + + for (port = 1; port <= MLX4_MAX_PORTS; port++) { + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + vp_oper->state = *vp_admin; + if (MLX4_VGT != vp_admin->default_vlan) { + err = mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, &(vp_oper->vlan_idx)); + if (err) { + vp_oper->vlan_idx = NO_INDX; + mlx4_warn((&priv->dev), + "No vlan resorces slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg((&(priv->dev)), "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + if (vp_admin->spoofchk) { + vp_oper->mac_idx = __mlx4_register_mac(&priv->dev, + port, + vp_admin->mac); + if (0 > vp_oper->mac_idx) { + err = vp_oper->mac_idx; + vp_oper->mac_idx = NO_INDX; + mlx4_warn((&priv->dev), + "No mac resorces slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg((&(priv->dev)), "alloc mac %llx idx %d slave %d port %d\n", + vp_oper->state.mac, vp_oper->mac_idx, slave, port); + } + } + return 0; +} + +static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave) +{ + int port; + struct mlx4_vport_oper_state *vp_oper; + + for (port = 1; port <= MLX4_MAX_PORTS; port++) { + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + if (NO_INDX != vp_oper->vlan_idx) { + __mlx4_unregister_vlan(&priv->dev, + port, vp_oper->state.default_vlan); + vp_oper->vlan_idx = NO_INDX; + } + if (NO_INDX != vp_oper->mac_idx) { + __mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac); + vp_oper->mac_idx = NO_INDX; + } + } + return; +} + +static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, + u16 param, u8 toggle) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + u32 reply; + u8 is_going_down = 0; + int i; + unsigned long flags; + + slave_state[slave].comm_toggle ^= 1; + reply = (u32) slave_state[slave].comm_toggle << 31; + if (toggle != slave_state[slave].comm_toggle) { + mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER" + "STATE COMPROMISIED ***\n", toggle, slave); + goto reset_slave; + } + if (cmd == MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Received reset from slave:%d\n", slave); + slave_state[slave].active = false; + mlx4_master_deactivate_admin_state(priv, slave); + for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) { + slave_state[slave].event_eq[i].eqn = -1; + slave_state[slave].event_eq[i].token = 0; + } + /*check if we are in the middle of FLR process, + if so return "retry" status to the slave*/ + if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) + goto inform_slave_state; + + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, slave); + + /* write the version in the event field */ + reply |= mlx4_comm_get_version(); + + goto reset_slave; + } + /*command from slave in the middle of FLR*/ + if (cmd != MLX4_COMM_CMD_RESET && + MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) { + mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) " + "in the middle of FLR\n", slave, cmd); + return; + } + + switch (cmd) { + case MLX4_COMM_CMD_VHCR0: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET) + goto reset_slave; + slave_state[slave].vhcr_dma = ((u64) param) << 48; + priv->mfunc.master.slave_state[slave].cookie = 0; + mutex_init(&priv->mfunc.master.gen_eqe_mutex[slave]); + break; + case MLX4_COMM_CMD_VHCR1: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 32; + break; + case MLX4_COMM_CMD_VHCR2: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 16; + break; + case MLX4_COMM_CMD_VHCR_EN: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2) + goto reset_slave; + slave_state[slave].vhcr_dma |= param; + if (mlx4_master_activate_admin_state(priv, slave)) + goto reset_slave; + slave_state[slave].active = true; + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave); + break; + case MLX4_COMM_CMD_VHCR_POST: + if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) + goto reset_slave; + + mutex_lock(&priv->cmd.slave_cmd_mutex); + if (mlx4_master_process_vhcr(dev, slave, NULL)) { + mlx4_err(dev, "Failed processing vhcr for slave:%d," + " resetting slave.\n", slave); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + goto reset_slave; + } + mutex_unlock(&priv->cmd.slave_cmd_mutex); + break; + default: + mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave); + goto reset_slave; + } + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (!slave_state[slave].is_slave_going_down) + slave_state[slave].last_cmd = cmd; + else + is_going_down = 1; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + if (is_going_down) { + mlx4_warn(dev, "Slave is going down aborting command(%d)" + " executing from slave:%d\n", + cmd, slave); + return; + } + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + mmiowb(); + + return; + +reset_slave: + /* cleanup any slave resources */ + mlx4_delete_all_resources_for_slave(dev, slave); + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (!slave_state[slave].is_slave_going_down) + slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + /*with slave in the middle of flr, no need to clean resources again.*/ +inform_slave_state: + memset(&slave_state[slave].event_eq, 0, + sizeof(struct mlx4_slave_event_eq_info)); + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + wmb(); +} + +/* master command processing */ +void mlx4_master_comm_channel(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, + struct mlx4_mfunc_master_ctx, + comm_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = + container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + __be32 *bit_vec; + u32 comm_cmd; + u32 vec; + int i, j, slave; + int toggle; + int served = 0; + int reported = 0; + u32 slt; + + bit_vec = master->comm_arm_bit_vector; + for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) { + vec = be32_to_cpu(bit_vec[i]); + for (j = 0; j < 32; j++) { + if (!(vec & (1 << j))) + continue; + ++reported; + slave = (i * 32) + j; + comm_cmd = swab32(readl( + &mfunc->comm[slave].slave_write)); + slt = swab32(readl(&mfunc->comm[slave].slave_read)) + >> 31; + toggle = comm_cmd >> 31; + if (toggle != slt) { + if (master->slave_state[slave].comm_toggle + != slt) { + mlx4_info(dev, "slave %d out of sync." + " read toggle %d, state toggle %d. " + "Resynching.\n", slave, slt, + master->slave_state[slave].comm_toggle); + master->slave_state[slave].comm_toggle = + slt; + } + mlx4_master_do_cmd(dev, slave, + comm_cmd >> 16 & 0xff, + comm_cmd & 0xffff, toggle); + ++served; + } + } + } + + if (reported && reported != served) + mlx4_warn(dev, "Got command event with bitmask from %d slaves" + " but %d were served\n", + reported, served); + + if (mlx4_ARM_COMM_CHANNEL(dev)) + mlx4_warn(dev, "Failed to arm comm channel events\n"); +} + +static int sync_toggles(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int wr_toggle; + int rd_toggle; + unsigned long end; + + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31; + end = jiffies + msecs_to_jiffies(5000); + + while (time_before(jiffies, end)) { + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31; + if (rd_toggle == wr_toggle) { + priv->cmd.comm_toggle = rd_toggle; + return 0; + } + + cond_resched(); + } + + /* + * we could reach here if for example the previous VM using this + * function misbehaved and left the channel with unsynced state. We + * should fix this here and give this VM a chance to use a properly + * synced channel + */ + mlx4_warn(dev, "recovering from previously mis-behaved VM\n"); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write); + priv->cmd.comm_toggle = 0; + + return 0; +} + +int mlx4_multi_func_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i, j, err, port; + + if (mlx4_is_master(dev)) + priv->mfunc.comm = + ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) + + priv->fw.comm_base, MLX4_COMM_PAGESIZE); + else + priv->mfunc.comm = + ioremap(pci_resource_start(dev->pdev, 2) + + MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE); + if (!priv->mfunc.comm) { + mlx4_err(dev, "Couldn't map communication vector.\n"); + goto err_vhcr; + } + + if (mlx4_is_master(dev)) { + priv->mfunc.master.slave_state = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_slave_state), GFP_KERNEL); + if (!priv->mfunc.master.slave_state) + goto err_comm; + + priv->mfunc.master.vf_admin = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_vf_admin_state), GFP_KERNEL); + if (!priv->mfunc.master.vf_admin) + goto err_comm_admin; + + priv->mfunc.master.vf_oper = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_vf_oper_state), GFP_KERNEL); + if (!priv->mfunc.master.vf_oper) + goto err_comm_oper; + + for (i = 0; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + s_state->last_cmd = MLX4_COMM_CMD_RESET; + for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j) + s_state->event_eq[j].eqn = -1; + __raw_writel((__force u32) 0, + &priv->mfunc.comm[i].slave_write); + __raw_writel((__force u32) 0, + &priv->mfunc.comm[i].slave_read); + mmiowb(); + for (port = 1; port <= MLX4_MAX_PORTS; port++) { + s_state->vlan_filter[port] = + kzalloc(sizeof(struct mlx4_vlan_fltr), + GFP_KERNEL); + if (!s_state->vlan_filter[port]) { + if (--port) + kfree(s_state->vlan_filter[port]); + goto err_slaves; + } + INIT_LIST_HEAD(&s_state->mcast_filters[port]); + priv->mfunc.master.vf_admin[i].vport[port].default_vlan = MLX4_VGT; + priv->mfunc.master.vf_oper[i].vport[port].state.default_vlan = MLX4_VGT; + priv->mfunc.master.vf_oper[i].vport[port].vlan_idx = NO_INDX; + priv->mfunc.master.vf_oper[i].vport[port].mac_idx = NO_INDX; + } + spin_lock_init(&s_state->lock); + } + + memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size); + priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD; + INIT_WORK(&priv->mfunc.master.comm_work, + mlx4_master_comm_channel); + INIT_WORK(&priv->mfunc.master.slave_event_work, + mlx4_gen_slave_eqe); + INIT_WORK(&priv->mfunc.master.slave_flr_event_work, + mlx4_master_handle_slave_flr); + spin_lock_init(&priv->mfunc.master.slave_state_lock); + spin_lock_init(&priv->mfunc.master.slave_eq.event_lock); + priv->mfunc.master.comm_wq = + create_singlethread_workqueue("mlx4_comm"); + if (!priv->mfunc.master.comm_wq) + goto err_slaves; + + if (mlx4_init_resource_tracker(dev)) + goto err_thread; + + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " Failed to arm comm channel eq: %x\n", + err); + goto err_resource; + } + + } else { + err = sync_toggles(dev); + if (err) { + mlx4_err(dev, "Couldn't sync toggles\n"); + goto err_comm; + } + } + return 0; + +err_resource: + mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL); +err_thread: + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); +err_slaves: + while (--i) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.vf_oper); +err_comm_oper: + kfree(priv->mfunc.master.vf_admin); +err_comm_admin: + kfree(priv->mfunc.master.slave_state); +err_comm: + iounmap(priv->mfunc.comm); +err_vhcr: + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + return -ENOMEM; +} + int mlx4_cmd_init(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); mutex_init(&priv->cmd.hcr_mutex); + mutex_init(&priv->cmd.slave_cmd_mutex); sema_init(&priv->cmd.poll_sem, 1); priv->cmd.use_events = 0; priv->cmd.toggle = 1; - priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE, - MLX4_HCR_SIZE); - if (!priv->cmd.hcr) { - mlx4_err(dev, "Couldn't map command register."); - return -ENOMEM; + priv->cmd.hcr = NULL; + priv->mfunc.vhcr = NULL; + + if (!mlx4_is_slave(dev)) { + priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + + MLX4_HCR_BASE, MLX4_HCR_SIZE); + if (!priv->cmd.hcr) { + mlx4_err(dev, "Couldn't map command register.\n"); + return -ENOMEM; + } + } + + if (mlx4_is_mfunc(dev)) { + priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE, + &priv->mfunc.vhcr_dma, + GFP_KERNEL); + if (!priv->mfunc.vhcr) { + mlx4_err(dev, "Couldn't allocate VHCR.\n"); + goto err_hcr; + } } priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev, MLX4_MAILBOX_SIZE, MLX4_MAILBOX_SIZE, 0); - if (!priv->cmd.pool) { - iounmap(priv->cmd.hcr); - return -ENOMEM; - } + if (!priv->cmd.pool) + goto err_vhcr; return 0; + +err_vhcr: + if (mlx4_is_mfunc(dev)) + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + +err_hcr: + if (!mlx4_is_slave(dev)) + iounmap(priv->cmd.hcr); + return -ENOMEM; +} + +void mlx4_multi_func_cleanup(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, port; + + if (mlx4_is_master(dev)) { + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); + for (i = 0; i < dev->num_slaves; i++) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.slave_state); + kfree(priv->mfunc.master.vf_admin); + kfree(priv->mfunc.master.vf_oper); + } + + iounmap(priv->mfunc.comm); } void mlx4_cmd_cleanup(struct mlx4_dev *dev) @@ -363,7 +2000,13 @@ void mlx4_cmd_cleanup(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); pci_pool_destroy(priv->cmd.pool); - iounmap(priv->cmd.hcr); + + if (!mlx4_is_slave(dev)) + iounmap(priv->cmd.hcr); + if (mlx4_is_mfunc(dev)) + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; } /* @@ -374,6 +2017,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int i; + int err = 0; priv->cmd.context = kmalloc(priv->cmd.max_cmds * sizeof (struct mlx4_cmd_context), @@ -398,11 +2042,10 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) ; /* nothing */ --priv->cmd.token_mask; + down(&priv->cmd.poll_sem); priv->cmd.use_events = 1; - down(&priv->cmd.poll_sem); - - return 0; + return err; } /* @@ -442,7 +2085,8 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev) } EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox); -void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox) +void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) { if (!mailbox) return; @@ -451,3 +2095,70 @@ void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbo kfree(mailbox); } EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox); + +u32 mlx4_comm_get_version(void) +{ + return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER; +} + +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + + if (!mlx4_is_master(dev)) + return -EPROTONOSUPPORT; + + if ((vf <= 0) || (vf > dev->num_vfs)) { + mlx4_err(dev, "Bad vf number:%d (max vf activated: %d)\n", vf, dev->num_vfs); + return -EINVAL; + } + + s_info = &priv->mfunc.master.vf_admin[vf].vport[port]; + s_info->mac = mlx4_mac_to_u64(mac); + mlx4_info(dev, "default mac on vf %d port %d to %llX will take afect only after vf restart\n", + vf, port, s_info->mac); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_mac); + +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + + if ((!mlx4_is_master(dev)) || + !(dev->caps.flags & MLX4_DEV_CAP_FLAG_ESWITCH_SUPPORT)) + return -EPROTONOSUPPORT; + + if ((vf <= 0) || (vf > dev->num_vfs) || (vlan > 4095) || (qos > 7)) + return -EINVAL; + + s_info = &priv->mfunc.master.vf_admin[vf].vport[port]; + if ((0 == vlan) && (0 == qos)) + s_info->default_vlan = MLX4_VGT; + else + s_info->default_vlan = vlan; + s_info->default_qos = qos; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan); + +int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + + if ((!mlx4_is_master(dev)) || + !(dev->caps.flags & MLX4_DEV_CAP_FLAG_ESWITCH_SUPPORT)) + return -EPROTONOSUPPORT; + + if ((vf <= 0) || (vf > dev->num_vfs)) + return -EINVAL; + + s_info = &priv->mfunc.master.vf_admin[vf].vport[port]; + s_info->spoofchk = setting; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk); diff --git a/sys/ofed/drivers/net/mlx4/cq.c b/sys/ofed/drivers/net/mlx4/cq.c index 076c602acb00..c5a36e0b4d63 100644 --- a/sys/ofed/drivers/net/mlx4/cq.c +++ b/sys/ofed/drivers/net/mlx4/cq.c @@ -43,27 +43,6 @@ #include "mlx4.h" #include "icm.h" -struct mlx4_cq_context { - __be32 flags; - u16 reserved1[3]; - __be16 page_offset; - __be32 logsize_usrpage; - __be16 cq_period; - __be16 cq_max_count; - u8 reserved2[3]; - u8 comp_eqn; - u8 log_page_size; - u8 reserved3[2]; - u8 mtt_base_addr_h; - __be32 mtt_base_addr_l; - __be32 last_notified_index; - __be32 solicit_producer_index; - __be32 consumer_index; - __be32 producer_index; - u32 reserved4[2]; - __be64 db_rec_addr; -}; - #define MLX4_CQ_STATUS_OK ( 0 << 28) #define MLX4_CQ_STATUS_OVERFLOW ( 9 << 28) #define MLX4_CQ_STATUS_WRITE_FAIL (10 << 28) @@ -75,10 +54,16 @@ struct mlx4_cq_context { void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) { + struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; struct mlx4_cq *cq; + spin_lock(&cq_table->lock); cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree, cqn & (dev->caps.num_cqs - 1)); + if (cq) + atomic_inc(&cq->refcount); + spin_unlock(&cq_table->lock); + if (!cq) { mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn); return; @@ -87,6 +72,9 @@ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) ++cq->arm_sn; cq->comp(cq); + + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); } void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type) @@ -116,23 +104,24 @@ void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type) static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int cq_num) { - return mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma, cq_num, 0, + MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); } static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int cq_num, u32 opmod) { return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int cq_num) { - return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, cq_num, - mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, + cq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, @@ -187,25 +176,121 @@ int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, } EXPORT_SYMBOL_GPL(mlx4_cq_resize); -static int mlx4_find_least_loaded_vector(struct mlx4_priv *priv) +int mlx4_cq_ignore_overrun(struct mlx4_dev *dev, struct mlx4_cq *cq) { - int i; - int index = 0; - int min = priv->eq_table.eq[0].load; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + int err; - for (i = 1; i < priv->dev.caps.num_comp_vectors; i++) { - if (priv->eq_table.eq[i].load < min) { - index = i; - min = priv->eq_table.eq[i].load; - } - } + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); - return index; + cq_context = mailbox->buf; + memset(cq_context, 0, sizeof *cq_context); + + cq_context->flags |= cpu_to_be32(MLX4_CQ_FLAG_OI); + + err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 3); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_ignore_overrun); + +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + int err; + + *cqn = mlx4_bitmap_alloc(&cq_table->bitmap); + if (*cqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &cq_table->table, *cqn); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &cq_table->table, *cqn); + +err_out: + mlx4_bitmap_free(&cq_table->bitmap, *cqn); + return err; } -int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, - struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, - unsigned vector, int collapsed) +static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + else { + *cqn = get_param_l(&out_param); + return 0; + } + } + return __mlx4_cq_alloc_icm(dev, cqn); +} + +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + + mlx4_table_put(dev, &cq_table->cmpt_table, cqn); + mlx4_table_put(dev, &cq_table->table, cqn); + mlx4_bitmap_free(&cq_table->bitmap, cqn); +} + +static void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, cqn); + err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed freeing cq:%d\n", cqn); + } else + __mlx4_cq_free_icm(dev, cqn); +} + +static int mlx4_find_least_loaded_vector(struct mlx4_priv *priv) +{ + int i; + int index = 0; + int min = priv->eq_table.eq[0].load; + + for (i = 1; i < priv->dev.caps.num_comp_vectors; i++) { + if (priv->eq_table.eq[i].load < min) { + index = i; + min = priv->eq_table.eq[i].load; + } + } + + return index; +} + + +int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, + struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, + struct mlx4_cq *cq, unsigned vector, int collapsed, + int timestamp_en) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cq_table *cq_table = &priv->cq_table; @@ -214,29 +299,24 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, u64 mtt_addr; int err; - cq->vector = (vector == MLX4_LEAST_ATTACHED_VECTOR) ? - mlx4_find_least_loaded_vector(priv) : vector; + cq->vector = (vector == MLX4_LEAST_ATTACHED_VECTOR) ? + mlx4_find_least_loaded_vector(priv) : vector; - if (cq->vector >= dev->caps.num_comp_vectors) + if (cq->vector > dev->caps.num_comp_vectors + dev->caps.comp_pool) { return -EINVAL; + } - cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap); - if (cq->cqn == -1) - return -ENOMEM; - - err = mlx4_table_get(dev, &cq_table->table, cq->cqn); - if (err) - goto err_out; - - err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn); - if (err) - goto err_put; + err = mlx4_cq_alloc_icm(dev, &cq->cqn); + if (err) { + return err; + } spin_lock_irq(&cq_table->lock); err = radix_tree_insert(&cq_table->tree, cq->cqn, cq); spin_unlock_irq(&cq_table->lock); - if (err) - goto err_cmpt_put; + if (err){ + goto err_icm; + } mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { @@ -248,6 +328,9 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, memset(cq_context, 0, sizeof *cq_context); cq_context->flags = cpu_to_be32(!!collapsed << 18); + if (timestamp_en) + cq_context->flags |= cpu_to_be32(1 << 19); + cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index); cq_context->comp_eqn = priv->eq_table.eq[cq->vector].eqn; cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; @@ -262,13 +345,16 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, if (err) goto err_radix; - priv->eq_table.eq[cq->vector].load++; + priv->eq_table.eq[cq->vector].load++; cq->cons_index = 0; cq->arm_sn = 1; cq->uar = uar; atomic_set(&cq->refcount, 1); init_completion(&cq->free); + cq->eqn = priv->eq_table.eq[cq->vector].eqn; + cq->irq = priv->eq_table.eq[cq->vector].irq; + return 0; err_radix: @@ -276,14 +362,8 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, radix_tree_delete(&cq_table->tree, cq->cqn); spin_unlock_irq(&cq_table->lock); -err_cmpt_put: - mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn); - -err_put: - mlx4_table_put(dev, &cq_table->table, cq->cqn); - -err_out: - mlx4_bitmap_free(&cq_table->bitmap, cq->cqn); +err_icm: + mlx4_cq_free_icm(dev, cq->cqn); return err; } @@ -299,8 +379,9 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) if (err) mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn); + + priv->eq_table.eq[cq->vector].load--; synchronize_irq(priv->eq_table.eq[cq->vector].irq); - priv->eq_table.eq[cq->vector].load--; spin_lock_irq(&cq_table->lock); radix_tree_delete(&cq_table->tree, cq->cqn); @@ -310,8 +391,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) complete(&cq->free); wait_for_completion(&cq->free); - mlx4_table_put(dev, &cq_table->table, cq->cqn); - mlx4_bitmap_free(&cq_table->bitmap, cq->cqn); + mlx4_cq_free_icm(dev, cq->cqn); } EXPORT_SYMBOL_GPL(mlx4_cq_free); @@ -322,6 +402,8 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) spin_lock_init(&cq_table->lock); INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs, dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0); @@ -333,6 +415,8 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) void mlx4_cleanup_cq_table(struct mlx4_dev *dev) { + if (mlx4_is_slave(dev)) + return; /* Nothing to do to clean up radix_tree */ mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap); } diff --git a/sys/ofed/drivers/net/mlx4/en_cq.c b/sys/ofed/drivers/net/mlx4/en_cq.c index 57f00d4c41fb..9783e233774f 100644 --- a/sys/ofed/drivers/net/mlx4/en_cq.c +++ b/sys/ofed/drivers/net/mlx4/en_cq.c @@ -101,10 +101,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) if (!cq->is_tx) cq->size = priv->rx_ring[cq->ring].actual_size; + err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, - cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx); - if (err) + cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx, 0); + if (err) { return err; + } cq->mcq.comp = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq; cq->mcq.event = mlx4_en_cq_event; diff --git a/sys/ofed/drivers/net/mlx4/en_main.c b/sys/ofed/drivers/net/mlx4/en_main.c index 107eee364f14..b56766bea6a7 100644 --- a/sys/ofed/drivers/net/mlx4/en_main.c +++ b/sys/ofed/drivers/net/mlx4/en_main.c @@ -88,7 +88,8 @@ static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) params->tcp_rss = tcp_rss; params->udp_rss = udp_rss; - if (params->udp_rss && !mdev->dev->caps.udp_rss) { + if (params->udp_rss && !(mdev->dev->caps.flags + & MLX4_DEV_CAP_FLAG_UDP_RSS)) { mlx4_warn(mdev, "UDP RSS is not supported on this device.\n"); params->udp_rss = 0; } @@ -116,18 +117,17 @@ static void *get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) } static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, - enum mlx4_dev_event event, int port) + enum mlx4_dev_event event, unsigned long port) { struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; struct mlx4_en_priv *priv; - if (!mdev->pndev[port]) - return; - - priv = netdev_priv(mdev->pndev[port]); switch (event) { case MLX4_DEV_EVENT_PORT_UP: case MLX4_DEV_EVENT_PORT_DOWN: + if (!mdev->pndev[port]) + return; + priv = netdev_priv(mdev->pndev[port]); /* To prevent races, we poll the link state in a separate task rather than changing it here */ priv->link_state = event; @@ -139,7 +139,11 @@ static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, break; default: - mlx4_warn(mdev, "Unhandled event: %d\n", event); + if (port < 1 || port > dev->caps.num_ports || + !mdev->pndev[port]) + return; + mlx4_warn(mdev, "Unhandled event %d for port %d\n", event, + (int) port); } } @@ -351,8 +355,8 @@ static struct mlx4_interface mlx4_en_interface = { .remove = mlx4_en_remove, .event = mlx4_en_event, .query = mlx4_en_query, - .get_prot_dev = get_netdev, - .protocol = MLX4_PROT_EN, + .get_dev = get_netdev, + .protocol = MLX4_PROT_ETH, }; static int __init mlx4_en_init(void) diff --git a/sys/ofed/drivers/net/mlx4/en_netdev.c b/sys/ofed/drivers/net/mlx4/en_netdev.c index 0a59ab44383f..f7167d9cc519 100644 --- a/sys/ofed/drivers/net/mlx4/en_netdev.c +++ b/sys/ofed/drivers/net/mlx4/en_netdev.c @@ -632,8 +632,7 @@ int mlx4_en_start_port(struct net_device *dev) /* Set port mac number */ en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port); err = mlx4_register_mac(mdev->dev, priv->port, - mlx4_en_mac_to_u64(IF_LLADDR(dev)), - &priv->mac_index); + mlx4_en_mac_to_u64(IF_LLADDR(dev))); if (err) { en_err(priv, "Failed setting port mac\n"); goto tx_err; @@ -697,7 +696,7 @@ int mlx4_en_start_port(struct net_device *dev) mlx4_CLOSE_PORT(mdev->dev, priv->port); mac_err: - mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index); + mlx4_unregister_mac(mdev->dev, priv->port, priv->mac); tx_err: while (tx_index--) { mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]); @@ -730,7 +729,7 @@ void mlx4_en_stop_port(struct net_device *dev) priv->port_up = false; /* Unregister Mac address for the port */ - mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index); + mlx4_unregister_mac(mdev->dev, priv->port, priv->mac); mdev->mac_removed[priv->port] = 1; /* Free TX Rings */ @@ -946,6 +945,7 @@ void mlx4_en_destroy_netdev(struct net_device *dev) mutex_unlock(&mdev->state_lock); mlx4_en_free_resources(priv); + mtx_destroy(&priv->stats_lock.m); mtx_destroy(&priv->vlan_lock.m); kfree(priv); @@ -1587,6 +1587,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, /* * Setup wake-on-lan. */ +#if 0 if (priv->mdev->dev->caps.wol) { u64 config; if (mlx4_wol_read(priv->mdev->dev, &config, priv->port) == 0) { @@ -1596,6 +1597,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->if_capenable |= IFCAP_WOL_MAGIC; } } +#endif /* Register for VLAN events */ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, diff --git a/sys/ofed/drivers/net/mlx4/en_port.c b/sys/ofed/drivers/net/mlx4/en_port.c index d8a2d7f88306..303bb2bef1f9 100644 --- a/sys/ofed/drivers/net/mlx4/en_port.c +++ b/sys/ofed/drivers/net/mlx4/en_port.c @@ -39,13 +39,14 @@ #include #include - +#if 0 // moved to port.c int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode) { return mlx4_cmd(dev, (mac | (clear << 63)), port, mode, - MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } +#endif int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans) { @@ -65,12 +66,13 @@ int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans) i++, j--) filter->entry[j] = cpu_to_be32(vlans[i]); err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } +#if 0 //moved to port.c - shahark int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) { @@ -94,15 +96,19 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, in_mod = MLX4_SET_PORT_GENERAL << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } - int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, u8 promisc) { + + printf("%s %s:%d\n", __func__, __FILE__, __LINE__); + + + struct mlx4_cmd_mailbox *mailbox; struct mlx4_set_port_rqp_calc_context *context; int err; @@ -116,8 +122,10 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, context->base_qpn = cpu_to_be32(base_qpn); context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_EN_SHIFT | base_qpn); +/* context->mcast = cpu_to_be32((dev->caps.mc_promisc_mode << SET_PORT_PROMISC_MODE_SHIFT) | base_qpn); +*/ context->intra_no_vlan = 0; context->no_vlan = MLX4_NO_VLAN_IDX; context->intra_vlan_miss = 0; @@ -125,11 +133,12 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } +#endif int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) { @@ -144,7 +153,7 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) return PTR_ERR(mailbox); memset(mailbox->buf, 0, sizeof(*qport_context)); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, - MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; qport_context = mailbox->buf; @@ -176,6 +185,7 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) return err; } +#if 0 static int read_iboe_counters(struct mlx4_dev *dev, int index, u64 counters[]) { struct mlx4_cmd_mailbox *mailbox; @@ -189,7 +199,7 @@ static int read_iboe_counters(struct mlx4_dev *dev, int index, u64 counters[]) return -ENOMEM; err = mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, - MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C); + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_WRAPPED); if (err) goto out; @@ -217,6 +227,7 @@ static int read_iboe_counters(struct mlx4_dev *dev, int index, u64 counters[]) mlx4_free_cmd_mailbox(dev, mailbox); return err; } +#endif int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) { @@ -229,22 +240,24 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) unsigned long ierror; int err; int i; - int counter; + //int counter; u64 counters[4]; dev = mdev->pndev[port]; priv = netdev_priv(dev); memset(counters, 0, sizeof counters); + /* counter = mlx4_get_iboe_counter(priv->mdev->dev, port); if (counter >= 0) err = read_iboe_counters(priv->mdev->dev, counter, counters); + */ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, sizeof(*mlx4_en_stats)); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, - MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; diff --git a/sys/ofed/drivers/net/mlx4/en_port.h b/sys/ofed/drivers/net/mlx4/en_port.h index a9e2e2410476..531981452333 100644 --- a/sys/ofed/drivers/net/mlx4/en_port.h +++ b/sys/ofed/drivers/net/mlx4/en_port.h @@ -39,11 +39,7 @@ #define SET_PORT_PROMISC_EN_SHIFT 31 #define SET_PORT_PROMISC_MODE_SHIFT 30 -enum { - MLX4_CMD_SET_VLAN_FLTR = 0x47, - MLX4_CMD_SET_MCAST_FLTR = 0x48, - MLX4_CMD_DUMP_ETH_STATS = 0x49, -}; +#if 0 //moved to port.c - shahark struct mlx4_set_port_general_context { u8 reserved[3]; @@ -72,6 +68,7 @@ struct mlx4_set_port_rqp_calc_context { __be32 promisc; __be32 mcast; }; +#endif #define VLAN_FLTR_SIZE 128 struct mlx4_set_vlan_fltr_mbox { diff --git a/sys/ofed/drivers/net/mlx4/en_rx.c b/sys/ofed/drivers/net/mlx4/en_rx.c index d6843d8cd2f6..81affcee3ebc 100644 --- a/sys/ofed/drivers/net/mlx4/en_rx.c +++ b/sys/ofed/drivers/net/mlx4/en_rx.c @@ -267,7 +267,6 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) int err; int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + DS_SIZE * priv->num_frags); - for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = &priv->rx_ring[ring_ind]; @@ -673,7 +672,6 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, en_err(priv, "Failed to allocate qp context\n"); return -ENOMEM; } - err = mlx4_qp_alloc(mdev->dev, qpn, qp); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); @@ -717,7 +715,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) en_dbg(DRV, priv, "Configuring rss steering\n"); err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, roundup_pow_of_two(priv->rx_ring_num), - &rss_map->base_qpn); + &rss_map->base_qpn, 0); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); return err; @@ -736,7 +734,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) } /* Configure RSS indirection qp */ - err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn); + err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn, 0); if (err) { en_err(priv, "Failed to reserve range for RSS " "indirection qp\n"); diff --git a/sys/ofed/drivers/net/mlx4/en_tx.c b/sys/ofed/drivers/net/mlx4/en_tx.c index 9ad3c59b7d08..4661024a3eb8 100644 --- a/sys/ofed/drivers/net/mlx4/en_tx.c +++ b/sys/ofed/drivers/net/mlx4/en_tx.c @@ -122,7 +122,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); - err = mlx4_qp_reserve_range(mdev->dev, 1, 256, &ring->qpn); + err = mlx4_qp_reserve_range(mdev->dev, 1, 256, &ring->qpn, MLX4_RESERVE_BF_QP); if (err) { en_err(priv, "Failed reserving qp for tx ring.\n"); goto err_map; @@ -135,7 +135,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, } ring->qp.event = mlx4_en_sqp_event; - err = mlx4_bf_alloc(mdev->dev, &ring->bf); + err = mlx4_bf_alloc(mdev->dev, &ring->bf, 0); if (err) { ring->bf.uar = &mdev->priv_uar; ring->bf.uar->map = mdev->uar_map; diff --git a/sys/ofed/drivers/net/mlx4/eq.c b/sys/ofed/drivers/net/mlx4/eq.c index 3dd96e693f6d..f9d6ab9ced3d 100644 --- a/sys/ofed/drivers/net/mlx4/eq.c +++ b/sys/ofed/drivers/net/mlx4/eq.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -41,36 +42,16 @@ #include "mlx4.h" #include "fw.h" +enum { + MLX4_IRQNAME_SIZE = 32 +}; + enum { MLX4_NUM_ASYNC_EQE = 0x100, MLX4_NUM_SPARE_EQE = 0x80, MLX4_EQ_ENTRY_SIZE = 0x20 }; -/* - * Must be packed because start is 64 bits but only aligned to 32 bits. - */ -struct mlx4_eq_context { - __be32 flags; - u16 reserved1[3]; - __be16 page_offset; - u8 log_eq_size; - u8 reserved2[4]; - u8 eq_period; - u8 reserved3; - u8 eq_max_count; - u8 reserved4[3]; - u8 intr; - u8 log_page_size; - u8 reserved5[2]; - u8 mtt_base_addr_h; - __be32 mtt_base_addr_l; - u32 reserved6[2]; - __be32 consumer_index; - __be32 producer_index; - u32 reserved7[4]; -}; - #define MLX4_EQ_STATUS_OK ( 0 << 28) #define MLX4_EQ_STATUS_WRITE_FAIL (10 << 28) #define MLX4_EQ_OWNER_SW ( 0 << 24) @@ -95,46 +76,20 @@ struct mlx4_eq_context { (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) | \ (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \ - (1ull << MLX4_EVENT_TYPE_CMD)) + (1ull << MLX4_EVENT_TYPE_CMD) | \ + (1ull << MLX4_EVENT_TYPE_OP_REQUIRED) | \ + (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \ + (1ull << MLX4_EVENT_TYPE_FLR_EVENT) | \ + (1ull << MLX4_EVENT_TYPE_FATAL_WARNING)) -struct mlx4_eqe { - u8 reserved1; - u8 type; - u8 reserved2; - u8 subtype; - union { - u32 raw[6]; - struct { - __be32 cqn; - } __attribute__((packed)) comp; - struct { - u16 reserved1; - __be16 token; - u32 reserved2; - u8 reserved3[3]; - u8 status; - __be64 out_param; - } __attribute__((packed)) cmd; - struct { - __be32 qpn; - } __attribute__((packed)) qp; - struct { - __be32 srqn; - } __attribute__((packed)) srq; - struct { - __be32 cqn; - u32 reserved1; - u8 reserved2[3]; - u8 syndrome; - } __attribute__((packed)) cq_err; - struct { - u32 reserved1[2]; - __be32 port; - } __attribute__((packed)) port_change; - } event; - u8 reserved3[3]; - u8 owner; -} __attribute__((packed)); +static u64 get_async_ev_mask(struct mlx4_dev *dev) +{ + u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT); + + return async_ev_mask; +} static void eq_set_ci(struct mlx4_eq *eq, int req_not) { @@ -145,27 +100,355 @@ static void eq_set_ci(struct mlx4_eq *eq, int req_not) mb(); } -static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry) +static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor) { - unsigned long off = (entry & (eq->nent - 1)) * MLX4_EQ_ENTRY_SIZE; - return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE; + /* (entry & (eq->nent - 1)) gives us a cyclic array */ + unsigned long offset = (entry & (eq->nent - 1)) * (MLX4_EQ_ENTRY_SIZE << eqe_factor); + /* CX3 is capable of extending the EQE from 32 to 64 bytes. + * When this feature is enabled, the first (in the lower addresses) + * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes + * contain the legacy EQE information. + */ + return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE; } -static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq) +static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor) { - struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index); + struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor); return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe; } +static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq) +{ + struct mlx4_eqe *eqe = + &slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)]; + return (!!(eqe->owner & 0x80) ^ + !!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ? + eqe : NULL; +} + +void mlx4_gen_slave_eqe(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, struct mlx4_mfunc_master_ctx, + slave_event_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq; + struct mlx4_eqe *eqe; + u8 slave; + int i; + + for (eqe = next_slave_event_eqe(slave_eq); eqe; + eqe = next_slave_event_eqe(slave_eq)) { + slave = eqe->slave_id; + + /* All active slaves need to receive the event */ + if (slave == ALL_SLAVES) { + for (i = 0; i < dev->num_slaves; i++) { + if (i != dev->caps.function && + master->slave_state[i].active) + if (mlx4_GEN_EQE(dev, i, eqe)) + mlx4_warn(dev, "Failed to " + " generate event " + "for slave %d\n", i); + } + } else { + if (mlx4_GEN_EQE(dev, slave, eqe)) + mlx4_warn(dev, "Failed to generate event " + "for slave %d\n", slave); + } + ++slave_eq->cons; + } +} + + +static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq; + struct mlx4_eqe *s_eqe; + unsigned long flags; + + spin_lock_irqsave(&slave_eq->event_lock, flags); + s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)]; + if ((!!(s_eqe->owner & 0x80)) ^ + (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) { + mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. " + "No free EQE on slave events queue\n", slave); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); + return; + } + + memcpy(s_eqe, eqe, dev->caps.eqe_size - 1); + s_eqe->slave_id = slave; + /* ensure all information is written before setting the ownersip bit */ + wmb(); + s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 0x0 : 0x80; + ++slave_eq->prod; + + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.slave_event_work); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); +} + +static void mlx4_slave_event(struct mlx4_dev *dev, int slave, + struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = + &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) { + /*mlx4_warn(dev, "Trying to pass event to inactive slave\n");*/ + return; + } + + slave_event(dev, slave, eqe); +} + +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) + return 0; + + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE; + eqe.event.port_mgmt_change.port = port; + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_pkey_eqe); + +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + /*don't send if we don't have the that slave */ + if (dev->num_vfs < slave) + return 0; + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO; + eqe.event.port_mgmt_change.port = port; + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_guid_change_eqe); + +int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, + u8 port_subtype_change) +{ + struct mlx4_eqe eqe; + + /*don't send if we don't have the that slave */ + if (dev->num_vfs < slave) + return 0; + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE; + eqe.subtype = port_subtype_change; + eqe.event.port_change.port = cpu_to_be32(port << 28); + + mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__, + port_subtype_change, slave, port); + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe); + +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return SLAVE_PORT_DOWN; + } + return s_state[slave].port_state[port]; +} +EXPORT_SYMBOL(mlx4_get_slave_port_state); + +static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, + enum slave_port_state state) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + + if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return -1; + } + s_state[slave].port_state[port] = state; + + return 0; +} + +static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) +{ + int i; + enum slave_port_gen_event gen_event; + + for (i = 0; i < dev->num_slaves; i++) + set_and_calc_slave_port_state(dev, i, port, event, &gen_event); +} +/************************************************************************** + The function get as input the new event to that port, + and according to the prev state change the slave's port state. + The events are: + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP + MLX4_PORT_STATE_IB_EVENT_GID_VALID + MLX4_PORT_STATE_IB_EVENT_GID_INVALID +***************************************************************************/ +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, + u8 port, int event, + enum slave_port_gen_event *gen_event) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = NULL; + unsigned long flags; + int ret = -1; + enum slave_port_state cur_state = + mlx4_get_slave_port_state(dev, slave, port); + + *gen_event = SLAVE_PORT_GEN_EVENT_NONE; + + if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return ret; + } + + ctx = &priv->mfunc.master.slave_state[slave]; + spin_lock_irqsave(&ctx->lock, flags); + + switch (cur_state) { + case SLAVE_PORT_DOWN: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event) + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PENDING_UP); + break; + case SLAVE_PENDING_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_DOWN); + else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_UP; + } + break; + case SLAVE_PORT_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_DOWN); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID == + event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PENDING_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } + break; + default: + pr_err("%s: BUG!!! UNKNOWN state: " + "slave:%d, port:%d\n", __func__, slave, port); + goto out; + } + ret = mlx4_get_slave_port_state(dev, slave, port); + +out: + spin_unlock_irqrestore(&ctx->lock, flags); + return ret; +} + +EXPORT_SYMBOL(set_and_calc_slave_port_state); + +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr) +{ + struct mlx4_eqe eqe; + + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO; + eqe.event.port_mgmt_change.port = port; + eqe.event.port_mgmt_change.params.port_info.changed_attr = + cpu_to_be32((u32) attr); + + slave_event(dev, ALL_SLAVES, &eqe); + return 0; +} +EXPORT_SYMBOL(mlx4_gen_slaves_port_mgt_ev); + +void mlx4_master_handle_slave_flr(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, struct mlx4_mfunc_master_ctx, + slave_flr_event_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = + container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int i; + int err; + unsigned long flags; + + mlx4_dbg(dev, "mlx4_handle_slave_flr\n"); + + for (i = 0 ; i < dev->num_slaves; i++) { + + if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { + mlx4_dbg(dev, "mlx4_handle_slave_flr: " + "clean slave: %d\n", i); + + mlx4_delete_all_resources_for_slave(dev, i); + /*return the slave to running mode*/ + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; + slave_state[i].is_slave_going_down = 0; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + /*notify the FW:*/ + err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to notify FW on " + "FLR done (slave:%d)\n", i); + } + } +} + static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) { + struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_eqe *eqe; int cqn; int eqes_found = 0; int set_ci = 0; int port; + int slave = 0; + int ret; + u32 flr_slave; + u8 update_slave_state; + int i; + enum slave_port_gen_event gen_event; + unsigned long flags; - while ((eqe = next_eqe_sw(eq))) { + while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor))) { /* * Make sure we read EQ entry contents after we've * checked the ownership bit. @@ -186,14 +469,68 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_PATH_MIG_FAILED: case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: - mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, - eqe->type); + mlx4_dbg(dev, "event %d arrived\n", eqe->type); + if (mlx4_is_master(dev)) { + /* forward only to slave owning the QP */ + ret = mlx4_get_slave_from_resource_id(dev, + RES_QP, + be32_to_cpu(eqe->event.qp.qpn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "QP event %02x(%02x) on " + "EQ %d at index %u: could " + "not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + + } + mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & + 0xffffff, eqe->type); break; case MLX4_EVENT_TYPE_SRQ_LIMIT: + mlx4_warn(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n", + __func__); case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: - mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff, - eqe->type); + if (mlx4_is_master(dev)) { + /* forward only to slave owning the SRQ */ + ret = mlx4_get_slave_from_resource_id(dev, + RES_SRQ, + be32_to_cpu(eqe->event.srq.srqn) + & 0xffffff, + &slave); + if (ret && ret != -ENOENT) { + mlx4_warn(dev, "SRQ event %02x(%02x) " + "on EQ %d at index %u: could" + " not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x," + " event: %02x(%02x)\n", __func__, + slave, + be32_to_cpu(eqe->event.srq.srqn), + eqe->type, eqe->subtype); + + if (!ret && slave != dev->caps.function) { + mlx4_warn(dev, "%s: sending event " + "%02x(%02x) to slave:%d\n", + __func__, eqe->type, + eqe->subtype, slave); + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & + 0xffffff, eqe->type); break; case MLX4_EVENT_TYPE_CMD: @@ -209,10 +546,46 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN, port); mlx4_priv(dev)->sense.do_sense_port[port] = 1; + if (!mlx4_is_master(dev)) + break; + for (i = 0; i < dev->num_slaves; i++) { + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { + if (i == mlx4_master_func_num(dev)) + continue; + mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN" + " to slave: %d, port:%d\n", + __func__, i, port); + mlx4_slave_event(dev, i, eqe); + } else { /* IB port */ + set_and_calc_slave_port_state(dev, i, port, + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + &gen_event); + /*we can be in pending state, then do not send port_down event*/ + if (SLAVE_PORT_GEN_EVENT_DOWN == gen_event) { + if (i == mlx4_master_func_num(dev)) + continue; + mlx4_slave_event(dev, i, eqe); + } + } + } } else { - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, - port); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port); + mlx4_priv(dev)->sense.do_sense_port[port] = 0; + + if (!mlx4_is_master(dev)) + break; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + for (i = 0; i < dev->num_slaves; i++) { + if (i == mlx4_master_func_num(dev)) + continue; + mlx4_slave_event(dev, i, eqe); + } + else /* IB port */ + /* port-up event will be sent to a slave when the + * slave's alias-guid is set. This is done in alias_GUID.c + */ + set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP); } break; @@ -221,7 +594,28 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) eqe->event.cq_err.syndrome == 1 ? "overrun" : "access violation", be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); - mlx4_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn), + if (mlx4_is_master(dev)) { + ret = mlx4_get_slave_from_resource_id(dev, + RES_CQ, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "CQ event %02x(%02x) on " + "EQ %d at index %u: could " + "not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_cq_event(dev, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, eqe->type); break; @@ -229,11 +623,99 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); break; + case MLX4_EVENT_TYPE_OP_REQUIRED: + atomic_inc(&priv->opreq_count); + /* FW commands can't be executed from interrupt context + working in deferred task */ + queue_work(mlx4_wq, &priv->opreq_task); + break; + + case MLX4_EVENT_TYPE_COMM_CHANNEL: + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Received comm channel event " + "for non master device\n"); + break; + } + memcpy(&priv->mfunc.master.comm_arm_bit_vector, + eqe->event.comm_channel_arm.bit_vec, + sizeof eqe->event.comm_channel_arm.bit_vec); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.comm_work); + break; + + case MLX4_EVENT_TYPE_FLR_EVENT: + flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id); + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Non-master function received" + "FLR event\n"); + break; + } + + mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave); + + if (flr_slave >= dev->num_slaves) { + mlx4_warn(dev, + "Got FLR for unknown function: %d\n", + flr_slave); + update_slave_state = 0; + } else + update_slave_state = 1; + + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (update_slave_state) { + priv->mfunc.master.slave_state[flr_slave].active = false; + priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR; + priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; + } + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.slave_flr_event_work); + break; + + case MLX4_EVENT_TYPE_FATAL_WARNING: + if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) { + if (mlx4_is_master(dev)) + for (i = 0; i < dev->num_slaves; i++) { + mlx4_dbg(dev, "%s: Sending " + "MLX4_FATAL_WARNING_SUBTYPE_WARMING" + " to slave: %d\n", __func__, i); + if (i == dev->caps.function) + continue; + mlx4_slave_event(dev, i, eqe); + } + mlx4_err(dev, "Temperature Threshold was reached! " + "Threshold: %d celsius degrees; " + "Current Temperature: %d\n", + be16_to_cpu(eqe->event.warming.warning_threshold), + be16_to_cpu(eqe->event.warming.current_temperature)); + } else + mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), " + "subtype %02x on EQ %d at index %u. owner=%x, " + "nent=0x%x, slave=%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + eqe->slave_id, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); + + break; + + case MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT: + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE, + (unsigned long) eqe); + break; + case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: case MLX4_EVENT_TYPE_ECC_DETECT: default: - mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n", - eqe->type, eqe->subtype, eq->eqn, eq->cons_index); + mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at " + "index %u. owner=%x, nent=0x%x, slave=%x, " + "ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + eqe->slave_id, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); break; }; @@ -266,6 +748,7 @@ static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr) int work = 0; int i; + writel(priv->eq_table.clr_mask, priv->eq_table.clr_int); for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) @@ -285,25 +768,55 @@ static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr) return IRQ_HANDLED; } +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq = + priv->mfunc.master.slave_state[slave].event_eq; + u32 in_modifier = vhcr->in_modifier; + u32 eqn = in_modifier & 0x1FF; + u64 in_param = vhcr->in_param; + int err = 0; + int i; + + if (slave == dev->caps.function) + err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn, + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + if (!err) + for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) + if (in_param & (1LL << i)) + event_eq[i].eqn = in_modifier >> 31 ? -1 : eqn; + + return err; +} + static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap, int eq_num) { return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num, - 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B); + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); } static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int eq_num) { - return mlx4_cmd(dev, mailbox->dma, eq_num, 0, MLX4_CMD_SW2HW_EQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma, eq_num, 0, + MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); } static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int eq_num) { - return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0, MLX4_CMD_HW2SW_EQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, + 0, MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); } static int mlx4_num_eq_uar(struct mlx4_dev *dev) @@ -313,8 +826,8 @@ static int mlx4_num_eq_uar(struct mlx4_dev *dev) * we need to map, take the difference of highest index and * the lowest index we'll use and add 1. */ - return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 - - dev->caps.reserved_eqs / 4 + 1; + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs + + dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1; } static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) @@ -339,6 +852,18 @@ static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4); } +static void mlx4_unmap_uar(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) + if (priv->eq_table.uar_map[i]) { + iounmap(priv->eq_table.uar_map[i]); + priv->eq_table.uar_map[i] = NULL; + } +} + static int mlx4_create_eq(struct mlx4_dev *dev, int nent, u8 intr, struct mlx4_eq *eq) { @@ -354,7 +879,8 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, eq->dev = dev; eq->nent = roundup_pow_of_two(max(nent, 2)); - npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE; + /* CX3 is capable of extending the CQE\EQE from 32 to 64 bytes */ + npages = PAGE_ALIGN(eq->nent * (MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor)) / PAGE_SIZE; eq->page_list = kmalloc(npages * sizeof *eq->page_list, GFP_KERNEL); @@ -456,8 +982,9 @@ static void mlx4_free_eq(struct mlx4_dev *dev, struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; int err; - int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE; int i; + /* CX3 is capable of extending the CQE\EQE from 32 to 64 bytes */ + int npages = PAGE_ALIGN((MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor) * eq->nent) / PAGE_SIZE; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -471,16 +998,16 @@ static void mlx4_free_eq(struct mlx4_dev *dev, mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) { if (i % 4 == 0) - printk("[%02x] ", i * 4); - printk(" %08x", be32_to_cpup(mailbox->buf + i * 4)); + pr_cont("[%02x] ", i * 4); + pr_cont(" %08x", be32_to_cpup(mailbox->buf + i * 4)); if ((i + 1) % 4 == 0) - printk("\n"); + pr_cont("\n"); } } mlx4_mtt_cleanup(dev, &eq->mtt); for (i = 0; i < npages; ++i) - pci_free_consistent(dev->pdev, PAGE_SIZE, + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, eq->page_list[i].buf, eq->page_list[i].map); @@ -492,16 +1019,32 @@ static void mlx4_free_eq(struct mlx4_dev *dev, static void mlx4_free_irqs(struct mlx4_dev *dev) { struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table; - int i; + struct mlx4_priv *priv = mlx4_priv(dev); + int i, vec; if (eq_table->have_irq) free_irq(dev->pdev->irq, dev); + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) { free_irq(eq_table->eq[i].irq, eq_table->eq + i); eq_table->eq[i].have_irq = 0; } + for (i = 0; i < dev->caps.comp_pool; i++) { + /* + * Freeing the assigned irq's + * all bits should be 0, but we need to validate + */ + if (priv->msix_ctl.pool_bm & 1ULL << i) { + /* NO need protecting*/ + vec = dev->caps.num_comp_vectors + 1 + i; + free_irq(priv->eq_table.eq[vec].irq, + &priv->eq_table.eq[vec]); + } + } + + kfree(eq_table->irq_names); } @@ -549,8 +1092,9 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) int err; int i; - priv->eq_table.uar_map = kcalloc(sizeof *priv->eq_table.uar_map, - mlx4_num_eq_uar(dev), GFP_KERNEL); + priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev), + sizeof *priv->eq_table.uar_map, + GFP_KERNEL); if (!priv->eq_table.uar_map) { err = -ENOMEM; goto err_out_free; @@ -564,23 +1108,30 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) for (i = 0; i < mlx4_num_eq_uar(dev); ++i) priv->eq_table.uar_map[i] = NULL; - err = mlx4_map_clr_int(dev); - if (err) - goto err_out_bitmap; + if (!mlx4_is_slave(dev)) { + err = mlx4_map_clr_int(dev); + if (err) + goto err_out_bitmap; - priv->eq_table.clr_mask = - swab32(1 << (priv->eq_table.inta_pin & 31)); - priv->eq_table.clr_int = priv->clr_base + - (priv->eq_table.inta_pin < 32 ? 4 : 0); + priv->eq_table.clr_mask = + swab32(1 << (priv->eq_table.inta_pin & 31)); + priv->eq_table.clr_int = priv->clr_base + + (priv->eq_table.inta_pin < 32 ? 4 : 0); + } - priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL); + priv->eq_table.irq_names = + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 + + dev->caps.comp_pool), + GFP_KERNEL); if (!priv->eq_table.irq_names) { err = -ENOMEM; - goto err_out_bitmap; + goto err_out_clr_int; } for (i = 0; i < dev->caps.num_comp_vectors; ++i) { - err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE, + err = mlx4_create_eq(dev, dev->caps.num_cqs - + dev->caps.reserved_cqs + + MLX4_NUM_SPARE_EQE, (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, &priv->eq_table.eq[i]); if (err) { @@ -595,18 +1146,42 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) if (err) goto err_out_comp; + /*if additional completion vectors poolsize is 0 this loop will not run*/ + for (i = dev->caps.num_comp_vectors + 1; + i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) { + + err = mlx4_create_eq(dev, dev->caps.num_cqs - + dev->caps.reserved_cqs + + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, + &priv->eq_table.eq[i]); + if (err) { + --i; + goto err_out_unmap; + } + } + + if (dev->flags & MLX4_FLAG_MSI_X) { - static const char async_eq_name[] = DRV_NAME "(async)"; const char *eq_name; for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { if (i < dev->caps.num_comp_vectors) { - snprintf(priv->eq_table.irq_names + i * 16, 16, - "eth-mlx4-%d", i); - eq_name = priv->eq_table.irq_names + i * 16; - } else - eq_name = async_eq_name; + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-comp-%d@pci:%s", i, + pci_name(dev->pdev)); + } else { + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-async@pci:%s", + pci_name(dev->pdev)); + } + eq_name = priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE; err = request_irq(priv->eq_table.eq[i].irq, mlx4_msi_x_interrupt, 0, eq_name, priv->eq_table.eq + i); @@ -616,15 +1191,19 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv->eq_table.eq[i].have_irq = 1; } } else { + snprintf(priv->eq_table.irq_names, + MLX4_IRQNAME_SIZE, + DRV_NAME "@pci:%s", + pci_name(dev->pdev)); err = request_irq(dev->pdev->irq, mlx4_interrupt, - IRQF_SHARED, DRV_NAME, dev); + IRQF_SHARED, priv->eq_table.irq_names, dev); if (err) goto err_out_async; priv->eq_table.have_irq = 1; } - err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); if (err) mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", @@ -646,10 +1225,14 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) mlx4_free_eq(dev, &priv->eq_table.eq[i]); --i; } - mlx4_unmap_clr_int(dev); mlx4_free_irqs(dev); +err_out_clr_int: + if (!mlx4_is_slave(dev)) + mlx4_unmap_clr_int(dev); + err_out_bitmap: + mlx4_unmap_uar(dev); mlx4_bitmap_cleanup(&priv->eq_table.bitmap); err_out_free: @@ -663,20 +1246,18 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int i; - mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1, + mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1, priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); mlx4_free_irqs(dev); - for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) mlx4_free_eq(dev, &priv->eq_table.eq[i]); - mlx4_unmap_clr_int(dev); - - for (i = 0; i < mlx4_num_eq_uar(dev); ++i) - if (priv->eq_table.uar_map[i]) - iounmap(priv->eq_table.uar_map[i]); + if (!mlx4_is_slave(dev)) + mlx4_unmap_clr_int(dev); + mlx4_unmap_uar(dev); mlx4_bitmap_cleanup(&priv->eq_table.bitmap); kfree(priv->eq_table.uar_map); @@ -694,7 +1275,7 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) err = mlx4_NOP(dev); /* When not in MSI_X, there is only one irq to check */ - if (!(dev->flags & MLX4_FLAG_MSI_X)) + if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev)) return err; /* A loop over all completion vectors, for each vector we will check @@ -705,8 +1286,8 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) /* Temporary use polling for command completions */ mlx4_cmd_use_polling(dev); - /* Map the new eq to handle all asynchronous events */ - err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, + /* Map the new eq to handle all asyncronous events */ + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, priv->eq_table.eq[i].eqn); if (err) { mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); @@ -720,8 +1301,70 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) } /* Return to default */ - mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, + mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); return err; } EXPORT_SYMBOL(mlx4_test_interrupts); + +int mlx4_assign_eq(struct mlx4_dev *dev, char *name, int *vector) +{ + + struct mlx4_priv *priv = mlx4_priv(dev); + int vec = 0, err = 0, i; + + mutex_lock(&priv->msix_ctl.pool_lock); + for (i = 0; !vec && i < dev->caps.comp_pool; i++) { + if (~priv->msix_ctl.pool_bm & 1ULL << i) { + priv->msix_ctl.pool_bm |= 1ULL << i; + vec = dev->caps.num_comp_vectors + 1 + i; + snprintf(priv->eq_table.irq_names + + vec * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, "%s", name); + err = request_irq(priv->eq_table.eq[vec].irq, + mlx4_msi_x_interrupt, 0, + &priv->eq_table.irq_names[vec<<5], + priv->eq_table.eq + vec); + if (err) { + /*zero out bit by fliping it*/ + priv->msix_ctl.pool_bm ^= 1 << i; + vec = 0; + continue; + /*we dont want to break here*/ + } + eq_set_ci(&priv->eq_table.eq[vec], 1); + } + } + mutex_unlock(&priv->msix_ctl.pool_lock); + + if (vec) { + *vector = vec; + } else { + *vector = 0; + err = (i == dev->caps.comp_pool) ? -ENOSPC : err; + } + return err; +} +EXPORT_SYMBOL(mlx4_assign_eq); + +void mlx4_release_eq(struct mlx4_dev *dev, int vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /*bm index*/ + int i = vec - dev->caps.num_comp_vectors - 1; + + if (likely(i >= 0)) { + /*sanity check , making sure were not trying to free irq's + Belonging to a legacy EQ*/ + mutex_lock(&priv->msix_ctl.pool_lock); + if (priv->msix_ctl.pool_bm & 1ULL << i) { + free_irq(priv->eq_table.eq[vec].irq, + &priv->eq_table.eq[vec]); + priv->msix_ctl.pool_bm &= ~(1ULL << i); + } + mutex_unlock(&priv->msix_ctl.pool_lock); + } + +} +EXPORT_SYMBOL(mlx4_release_eq); + diff --git a/sys/ofed/drivers/net/mlx4/fw.c b/sys/ofed/drivers/net/mlx4/fw.c index d27db3834a14..bafae00df3b8 100644 --- a/sys/ofed/drivers/net/mlx4/fw.c +++ b/sys/ofed/drivers/net/mlx4/fw.c @@ -33,6 +33,7 @@ */ #include +#include #include "fw.h" #include "icm.h" @@ -46,14 +47,10 @@ enum { extern void __buggy_use_of_MLX4_GET(void); extern void __buggy_use_of_MLX4_PUT(void); -static int enable_qos; +static bool enable_qos; module_param(enable_qos, bool, 0444); MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)"); -static int mlx4_pre_t11_mode = 0; -module_param_named(enable_pre_t11_mode, mlx4_pre_t11_mode, int, 0644); -MODULE_PARM_DESC(enable_pre_t11_mode, "For FCoXX, enable pre-t11 mode if non-zero (default: 0)"); - #define MLX4_GET(dest, source, offset) \ do { \ void *__p = (char *) (source) + (offset); \ @@ -93,6 +90,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [ 9] = "Q_Key violation counter", [10] = "VMM", [12] = "DPDP", + [15] = "Big LSO headers", [16] = "MW support", [17] = "APM support", [18] = "Atomic ops support", @@ -102,8 +100,17 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [24] = "Demand paging support", [25] = "Router support", [30] = "IBoE support", - [48] = "Basic counters support", - [49] = "Extended counters support", + [32] = "Unicast loopback support", + [34] = "FCS header control", + [38] = "Wake On LAN support", + [40] = "UDP RSS support", + [41] = "Unicast VEP steering support", + [42] = "Multicast VEP steering support", + [48] = "Counters support", + [59] = "Port management change event support", + [60] = "eSwitch support", + [61] = "64 byte EQE support", + [62] = "64 byte CQE support", }; int i; @@ -113,6 +120,21 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) mlx4_dbg(dev, " %s\n", fname[i]); } +static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) +{ + static const char * const fname[] = { + [0] = "RSS support", + [1] = "RSS Toeplitz Hash Function support", + [2] = "RSS XOR Hash Function support", + [3] = "Device manage flow steering support" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(fname); ++i) + if (fname[i] && (flags & (1LL << i))) + mlx4_dbg(dev, " %s\n", fname[i]); +} + int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) { struct mlx4_cmd_mailbox *mailbox; @@ -135,25 +157,257 @@ int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET); err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } +int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u8 field; + u32 size; + int err = 0; + +#define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0 +#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET 0x1 +#define QUERY_FUNC_CAP_PF_BHVR_OFFSET 0x4 +#define QUERY_FUNC_CAP_FMR_OFFSET 0x8 +#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET 0x10 +#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET 0x14 +#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET 0x18 +#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET 0x20 +#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET 0x24 +#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET 0x28 +#define QUERY_FUNC_CAP_MAX_EQ_OFFSET 0x2c +#define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET 0x30 + +#define QUERY_FUNC_CAP_FMR_FLAG 0x80 +#define QUERY_FUNC_CAP_FLAG_RDMA 0x40 +#define QUERY_FUNC_CAP_FLAG_ETH 0x80 + +/* when opcode modifier = 1 */ +#define QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 +#define QUERY_FUNC_CAP_RDMA_PROPS_OFFSET 0x8 +#define QUERY_FUNC_CAP_ETH_PROPS_OFFSET 0xc + +#define QUERY_FUNC_CAP_QP0_TUNNEL 0x10 +#define QUERY_FUNC_CAP_QP0_PROXY 0x14 +#define QUERY_FUNC_CAP_QP1_TUNNEL 0x18 +#define QUERY_FUNC_CAP_QP1_PROXY 0x1c + +#define QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC 0x40 +#define QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN 0x80 + +#define QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID 0x80 + + if (vhcr->op_modifier == 1) { + field = 0; + /* ensure force vlan and force mac bits are not set */ + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_ETH_PROPS_OFFSET); + /* ensure that phy_wqe_gid bit is not set */ + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_RDMA_PROPS_OFFSET); + + field = vhcr->in_modifier; /* phys-port = logical-port */ + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + + /* size is now the QP number */ + size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + field - 1; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); + + size += 2; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); + + size = dev->phys_caps.base_proxy_sqpn + 8 * slave + field - 1; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_PROXY); + + size += 2; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_PROXY); + + } else if (vhcr->op_modifier == 0) { + /* enable rdma and ethernet interfaces */ + field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET); + + field = dev->caps.num_ports; + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); + + size = dev->caps.function_caps; /* set PF behaviours */ + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET); + + field = 0; /* protected FMR support not available as yet */ + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FMR_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); + + size = dev->caps.num_eqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + + size = dev->caps.reserved_eqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); + + size = dev->caps.num_mgms + dev->caps.num_amgms; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); + + } else + err = -EINVAL; + + return err; +} + +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, + struct mlx4_func_cap *func_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field, op_modifier; + u32 size; + int err = 0; + + op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier, + MLX4_CMD_QUERY_FUNC_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + goto out; + + outbox = mailbox->buf; + + if (!op_modifier) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET); + if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) { + mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n"); + err = -EPROTONOSUPPORT; + goto out; + } + func_cap->flags = field; + + MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); + func_cap->num_ports = field; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET); + func_cap->pf_context_behaviour = size; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); + func_cap->qp_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); + func_cap->srq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); + func_cap->cq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + func_cap->max_eq = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + func_cap->reserved_eq = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); + func_cap->mpt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); + func_cap->mtt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); + func_cap->mcg_quota = size & 0xFFFFFF; + goto out; + } + + /* logical port query */ + if (gen_or_port > dev->caps.num_ports) { + err = -EINVAL; + goto out; + } + + if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_ETH_PROPS_OFFSET); + if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN) { + mlx4_err(dev, "VLAN is enforced on this port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + + if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC) { + mlx4_err(dev, "Force mac is enabled on this port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + } else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_RDMA_PROPS_OFFSET); + if (field & QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID) { + mlx4_err(dev, "phy_wqe_gid is " + "enforced on this ib port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + } + + MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + func_cap->physical_port = field; + if (func_cap->physical_port != gen_or_port) { + err = -ENOSYS; + goto out; + } + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); + func_cap->qp0_tunnel_qpn = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY); + func_cap->qp0_proxy_qpn = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL); + func_cap->qp1_tunnel_qpn = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY); + func_cap->qp1_proxy_qpn = size & 0xFFFFFF; + + /* All other resources are allocated by the master, but we still report + * 'num' and 'reserved' capabilities as follows: + * - num remains the maximum resource index + * - 'num - reserved' is the total available objects of a resource, but + * resource indices may be less than 'reserved' + * TODO: set per-resource quotas */ + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field; - u32 field32; + u32 field32, flags, ext_flags; u16 size; u16 stat_rate; int err; int i; - u32 in_modifier; - u64 out_param; - u32 tmp1, tmp2; #define QUERY_DEV_CAP_OUT_SIZE 0x100 #define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET 0x10 @@ -178,8 +432,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET 0x29 #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b #define QUERY_DEV_CAP_MAX_GSO_OFFSET 0x2d +#define QUERY_DEV_CAP_RSS_OFFSET 0x2e #define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f -#define QUERY_DEV_CAP_STAT_CFG_INL_OFFSET 0x31 #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35 #define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36 @@ -187,10 +441,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET 0x38 #define QUERY_DEV_CAP_MAX_GID_OFFSET 0x3b #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET 0x3c +#define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e #define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 -#define QUERY_DEV_CAP_UDP_RSS_OFFSET 0x42 -#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET 0x43 +#define QUERY_DEV_CAP_SYNC_QP_OFFSET 0x42 #define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 @@ -210,6 +464,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 #define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 #define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 +#define QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET 0x68 +#define QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET 0x6c +#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 +#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 @@ -223,16 +481,15 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 -#define QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET 0x68 -#define QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET 0x6c + dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; @@ -253,7 +510,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); dev_cap->max_mpts = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); - dev_cap->reserved_eqs = 1 << (field & 0xf); + dev_cap->reserved_eqs = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); dev_cap->max_eqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); @@ -275,6 +532,17 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) else dev_cap->max_gso_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET); + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR; + if (field & 0x10) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP; + field &= 0xf; + if (field) { + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS; + dev_cap->max_rss_tbl_sz = 1 << field; + } else + dev_cap->max_rss_tbl_sz = 0; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET); dev_cap->max_rdma_global = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET); @@ -283,16 +551,21 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->num_ports = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET); dev_cap->max_msg_sz = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; + dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET); + dev_cap->fs_max_num_qp_per_entry = field; MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET); dev_cap->stat_rate_support = stat_rate; - MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET); - dev_cap->udp_rss = field & 0x1; - MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET); - dev_cap->loopback_support = field & 0x1; - dev_cap->wol = field & 0x40; - MLX4_GET(tmp1, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); - MLX4_GET(tmp2, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); - dev_cap->flags = tmp2 | (u64)tmp1 << 32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + dev_cap->timestamp_support = field & 0x80; + MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); + dev_cap->flags = flags | (u64)ext_flags << 32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SYNC_QP_OFFSET); + dev_cap->sync_qp = field & 0x10; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); @@ -305,10 +578,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET); dev_cap->bf_reg_size = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET); - if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) { - mlx4_dbg(dev, "log blue flame is invalid (%d), forcing 3\n", field & 0x1f); + if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) field = 3; - } dev_cap->bf_regs_per_page = 1 << (field & 0x3f); mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); @@ -332,7 +603,6 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->reserved_pds = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); dev_cap->max_pds = 1 << (field & 0x3f); - MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET); dev_cap->reserved_xrcds = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET); @@ -363,8 +633,6 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->max_srq_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET); dev_cap->max_qp_sz = 1 << field; - MLX4_GET(field, outbox, QUERY_DEV_CAP_STAT_CFG_INL_OFFSET); - dev_cap->inline_cfg = field & 1; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET); dev_cap->resize_srq = field & 1; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET); @@ -378,10 +646,14 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) QUERY_DEV_CAP_RSVD_LKEY_OFFSET); MLX4_GET(dev_cap->max_icm_sz, outbox, QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET); - MLX4_GET(dev_cap->max_basic_counters, outbox, - QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET); - MLX4_GET(dev_cap->max_ext_counters, outbox, - QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET); + if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS) + MLX4_GET(dev_cap->max_basic_counters, outbox, + QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET); + /* FW reports 256 however real value is 255 */ + dev_cap->max_basic_counters = min_t(u32, dev_cap->max_basic_counters, 255); + if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) + MLX4_GET(dev_cap->max_extended_counters, outbox, + QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET); if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { for (i = 1; i <= dev_cap->num_ports; ++i) { @@ -408,19 +680,16 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_PORT_WAVELENGTH_OFFSET 0x1c #define QUERY_PORT_TRANS_CODE_OFFSET 0x20 -#define STAT_CFG_PORT_MODE (1 << 28) -#define STAT_CFG_PORT_OFFSET 0x8 -#define STAT_CFG_PORT_MASK (1 << 20) -#define STAT_CFG_MOD_INLINE 0x3 - for (i = 1; i <= dev_cap->num_ports; ++i) { err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET); dev_cap->supported_port_types[i] = field & 3; + dev_cap->suggested_type[i] = (field >> 3) & 1; + dev_cap->default_sense[i] = (field >> 4) & 1; MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET); dev_cap->ib_mtu[i] = field & 0xf; MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET); @@ -440,20 +709,6 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->vendor_oui[i] = field32 & 0xffffff; MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET); MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET); - - /* Query stat cfg for port enablement */ - if (dev_cap->inline_cfg) { - in_modifier = STAT_CFG_PORT_MODE | i << 8 | - STAT_CFG_PORT_OFFSET; - err = mlx4_cmd_imm(dev, 0, &out_param, - in_modifier, - STAT_CFG_MOD_INLINE, - MLX4_CMD_MOD_STAT_CFG, - MLX4_CMD_TIME_CLASS_B); - if (!err) - if (!(out_param & STAT_CFG_PORT_MASK)) - dev_cap->supported_port_types[i] = 0; - } } } @@ -494,14 +749,134 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n", dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg); mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz); + mlx4_dbg(dev, "Max basic counters: %d\n", dev_cap->max_basic_counters); + mlx4_dbg(dev, "Max extended counters: %d\n", dev_cap->max_extended_counters); + mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz); dump_dev_cap_flags(dev, dev_cap->flags); + dump_dev_cap_flags2(dev, dev_cap->flags2); out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 flags; + int err = 0; + u8 field; + + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* add port mng change event capability unconditionally to slaves */ + MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV; + MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + + /* For guests, report Blueflame disabled */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); + + return 0; +} + +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 def_mac; + u8 port_type; + u16 short_field; + int err; + +#define MLX4_VF_PORT_NO_LINK_SENSE_MASK 0xE0 +#define QUERY_PORT_CUR_MAX_PKEY_OFFSET 0x0c +#define QUERY_PORT_CUR_MAX_GID_OFFSET 0x0e + + err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + if (!err && dev->caps.function != slave) { + /* set slave default_mac address */ + MLX4_GET(def_mac, outbox->buf, QUERY_PORT_MAC_OFFSET); + def_mac += slave << 8; + /* if config MAC in DB use it */ + if (priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac) + def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac; + MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET); + + /* get port type - currently only eth is enabled */ + MLX4_GET(port_type, outbox->buf, + QUERY_PORT_SUPPORTED_TYPE_OFFSET); + + /* No link sensing allowed */ + port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK; + /* set port type to currently operating port type */ + port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3); + + MLX4_PUT(outbox->buf, port_type, + QUERY_PORT_SUPPORTED_TYPE_OFFSET); + + if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH) + short_field = mlx4_get_slave_num_gids(dev, slave); + else + short_field = 1; /* slave max gids */ + MLX4_PUT(outbox->buf, short_field, + QUERY_PORT_CUR_MAX_GID_OFFSET); + + short_field = dev->caps.pkey_table_len[vhcr->in_modifier]; + MLX4_PUT(outbox->buf, short_field, + QUERY_PORT_CUR_MAX_PKEY_OFFSET); + } + + return err; +} + +int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, + int *gid_tbl_len, int *pkey_tbl_len) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u16 field; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + outbox = mailbox->buf; + + MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET); + *gid_tbl_len = field; + + MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET); + *pkey_tbl_len = field; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len); + int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) { struct mlx4_cmd_mailbox *mailbox; @@ -551,7 +926,8 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) if (++nent == MLX4_MAILBOX_SIZE / 16) { err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); if (err) goto out; nent = 0; @@ -560,7 +936,8 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) } if (nent) - err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B); + err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; @@ -589,13 +966,15 @@ int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm) int mlx4_UNMAP_FA(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } int mlx4_RUN_FW(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } int mlx4_QUERY_FW(struct mlx4_dev *dev) @@ -611,7 +990,7 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) #define QUERY_FW_OUT_SIZE 0x100 #define QUERY_FW_VER_OFFSET 0x00 -#define MC_PROMISC_VER 0x2000702bcull +#define QUERY_FW_PPF_ID 0x09 #define QUERY_FW_CMD_IF_REV_OFFSET 0x0a #define QUERY_FW_MAX_CMD_OFFSET 0x0f #define QUERY_FW_ERR_START_OFFSET 0x30 @@ -622,13 +1001,19 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) #define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 #define QUERY_FW_CLR_INT_BAR_OFFSET 0x28 +#define QUERY_FW_COMM_BASE_OFFSET 0x40 +#define QUERY_FW_COMM_BAR_OFFSET 0x48 + +#define QUERY_FW_CLOCK_OFFSET 0x50 +#define QUERY_FW_CLOCK_BAR 0x58 + mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; @@ -640,10 +1025,13 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) | ((fw_ver & 0xffff0000ull) >> 16) | ((fw_ver & 0x0000ffffull) << 16); - if (dev->caps.fw_ver < MC_PROMISC_VER) - dev->caps.mc_promisc_mode = 2; - else - dev->caps.mc_promisc_mode = 1; + + MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); + dev->caps.function = lg; + + if (mlx4_is_slave(dev)) + goto out; + MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV || @@ -686,8 +1074,19 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) MLX4_GET(fw->clr_int_bar, outbox, QUERY_FW_CLR_INT_BAR_OFFSET); fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2; + MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET); + MLX4_GET(fw->comm_bar, outbox, QUERY_FW_COMM_BAR_OFFSET); + fw->comm_bar = (fw->comm_bar >> 6) * 2; + mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n", + fw->comm_bar, fw->comm_base); mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2); + MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET); + MLX4_GET(fw->clock_bar, outbox, QUERY_FW_CLOCK_BAR); + fw->clock_bar = (fw->clock_bar >> 6) * 2; + mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n", + fw->comm_bar, fw->comm_base); + /* * Round up number of system pages needed in case * MLX4_ICM_PAGE_SIZE < PAGE_SIZE. @@ -704,6 +1103,30 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) return err; } +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u8 *outbuf; + int err; + + outbuf = outbox->buf; + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* for slaves, set pci PPF ID to invalid and zero out everything + * else except FW version */ + outbuf[0] = outbuf[1] = 0; + memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8); + outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID; + + return 0; +} + static void get_board_id(void *vsd, char *board_id) { int i; @@ -748,7 +1171,7 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; @@ -772,7 +1195,6 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION 2 #define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e -#define INIT_HCA_X86_64_BYTE_CACHELINE_SZ 0x40 #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) @@ -781,6 +1203,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) #define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) #define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) +#define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) #define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) #define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) #define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) @@ -791,7 +1214,17 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) #define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) #define INIT_HCA_LOG_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_UC_STEERING_OFFSET (INIT_HCA_MCAST_OFFSET + 0x18) #define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) +#define INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN 0x6 +#define INIT_HCA_FS_PARAM_OFFSET 0x1d0 +#define INIT_HCA_FS_BASE_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x00) +#define INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x12) +#define INIT_HCA_FS_LOG_TABLE_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x1b) +#define INIT_HCA_FS_ETH_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x21) +#define INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22) +#define INIT_HCA_FS_IB_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x25) +#define INIT_HCA_FS_IB_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x26) #define INIT_HCA_TPT_OFFSET 0x0f0 #define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) #define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) @@ -809,9 +1242,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) memset(inbox, 0, INIT_HCA_IN_SIZE); *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; -#if defined(__x86_64__) || defined(__PPC64__) - *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = INIT_HCA_X86_64_BYTE_CACHELINE_SZ; -#endif + + *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + ((ilog2(CACHE_LINE_SIZE) - 4) << 5) | (1 << 4); #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1); @@ -831,9 +1264,31 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (enable_qos) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); - /* counters mode */ - *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= - cpu_to_be32(dev->caps.counters_mode << 4); + /* Enable fast drop performance optimization */ + if (dev->caps.fast_drop) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 7); + + /* enable counters */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4); + + /* CX3 is capable of extending CQEs\EQEs from 32 to 64 bytes */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29); + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + } else { + dev->caps.cqe_size = 32; + } /* QPC/EEC/CQC/EQC/RDMARC attributes */ @@ -850,12 +1305,45 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, param->rdmarc_base, INIT_HCA_RDMARC_BASE_OFFSET); MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET); - /* multicast attributes */ + /* steering attributes */ + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= + cpu_to_be32(1 << + INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN); - MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); - MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); - MLX4_PUT(inbox, param->log_mc_hash_sz, INIT_HCA_LOG_MC_HASH_SZ_OFFSET); - MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET); + MLX4_PUT(inbox, param->log_mc_entry_sz, + INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_table_sz, + INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + /* Enable Ethernet flow steering + * with udp unicast and tcp unicast + */ + MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_ETH_BITS_OFFSET); + MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, + INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET); + /* Enable IPoIB flow steering + * with udp unicast and tcp unicast + */ + MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_IB_BITS_OFFSET); + MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, + INIT_HCA_FS_IB_NUM_ADDRS_OFFSET); + } else { + MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_mc_entry_sz, + INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_hash_sz, + INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_table_sz, + INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0) { + MLX4_PUT(inbox, (u8) (1 << 3), + INIT_HCA_UC_STEERING_OFFSET); + } + } /* TPT attributes */ @@ -866,13 +1354,11 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) /* UAR attributes */ - MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET); + MLX4_PUT(inbox, param->uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); - if (!mlx4_pre_t11_mode && dev->caps.flags & (u32) MLX4_DEV_CAP_FLAG_FC_T11) - *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 10); - - err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000); + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, + MLX4_CMD_NATIVE); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); @@ -881,6 +1367,154 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) return err; } +int mlx4_QUERY_HCA(struct mlx4_dev *dev, + struct mlx4_init_hca_param *param) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *outbox; + u32 dword_field; + int err; + u8 byte_field; + +#define QUERY_HCA_GLOBAL_CAPS_OFFSET 0x04 +#define QUERY_HCA_CORE_CLOCK_OFFSET 0x0c + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_QUERY_HCA, + MLX4_CMD_TIME_CLASS_B, + !mlx4_is_slave(dev)); + if (err) + goto out; + + MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET); + MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET); + + /* QPC/EEC/CQC/EQC/RDMARC attributes */ + + MLX4_GET(param->qpc_base, outbox, INIT_HCA_QPC_BASE_OFFSET); + MLX4_GET(param->log_num_qps, outbox, INIT_HCA_LOG_QP_OFFSET); + MLX4_GET(param->srqc_base, outbox, INIT_HCA_SRQC_BASE_OFFSET); + MLX4_GET(param->log_num_srqs, outbox, INIT_HCA_LOG_SRQ_OFFSET); + MLX4_GET(param->cqc_base, outbox, INIT_HCA_CQC_BASE_OFFSET); + MLX4_GET(param->log_num_cqs, outbox, INIT_HCA_LOG_CQ_OFFSET); + MLX4_GET(param->altc_base, outbox, INIT_HCA_ALTC_BASE_OFFSET); + MLX4_GET(param->auxc_base, outbox, INIT_HCA_AUXC_BASE_OFFSET); + MLX4_GET(param->eqc_base, outbox, INIT_HCA_EQC_BASE_OFFSET); + MLX4_GET(param->log_num_eqs, outbox, INIT_HCA_LOG_EQ_OFFSET); + MLX4_GET(param->rdmarc_base, outbox, INIT_HCA_RDMARC_BASE_OFFSET); + MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET); + + MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET); + if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) { + param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; + } else { + MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET); + if (byte_field & 0x8) { + param->steering_mode = MLX4_STEERING_MODE_B0; + } + else { + param->steering_mode = MLX4_STEERING_MODE_A0; + } + } + if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET); + MLX4_GET(param->log_mc_entry_sz, outbox, + INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); + MLX4_GET(param->log_mc_table_sz, outbox, + INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + } else { + MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET); + MLX4_GET(param->log_mc_entry_sz, outbox, + INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MLX4_GET(param->log_mc_hash_sz, outbox, + INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + MLX4_GET(param->log_mc_table_sz, outbox, + INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + } + + /* CX3 is capable of extending CQEs\EQEs from 32 to 64 bytes */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS); + if (byte_field & 0x20) /* 64-bytes eqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + if (byte_field & 0x40) /* 64-bytes cqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + + /* TPT attributes */ + + MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); + MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET); + MLX4_GET(param->mtt_base, outbox, INIT_HCA_MTT_BASE_OFFSET); + MLX4_GET(param->cmpt_base, outbox, INIT_HCA_CMPT_BASE_OFFSET); + + /* UAR attributes */ + + MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); + MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0 + * and real QP0 are active, so that the paravirtualized QP0 is ready + * to operate */ +static int check_qp0_state(struct mlx4_dev *dev, int function, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /* irrelevant if not infiniband */ + if (priv->mfunc.master.qp0_state[port].proxy_qp0_active && + priv->mfunc.master.qp0_state[port].qp0_active) + return 1; + return 0; +} + +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port = vhcr->in_modifier; + int err; + + if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port)) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + /* Enable port only if it was previously disabled */ + if (!priv->mfunc.master.init_port_ref[port]) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + } + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } else { + if (slave == mlx4_master_func_num(dev)) { + if (check_qp0_state(dev, slave, port) && + !priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + priv->mfunc.master.qp0_state[port].port_active = 1; + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } + } else + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } + ++priv->mfunc.master.init_port_ref[port]; + return 0; +} + int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) { struct mlx4_cmd_mailbox *mailbox; @@ -924,33 +1558,76 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET); err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); } else err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); return err; } EXPORT_SYMBOL_GPL(mlx4_INIT_PORT); +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port = vhcr->in_modifier; + int err; + + if (!(priv->mfunc.master.slave_state[slave].init_port_mask & + (1 << port))) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + if (priv->mfunc.master.init_port_ref[port] == 1) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + 1000, MLX4_CMD_NATIVE); + if (err) + return err; + } + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + } else { + /* infiniband port */ + if (slave == mlx4_master_func_num(dev)) { + if (!priv->mfunc.master.qp0_state[port].qp0_active && + priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + 1000, MLX4_CMD_NATIVE); + if (err) + return err; + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + priv->mfunc.master.qp0_state[port].port_active = 0; + } + } else + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + } + --priv->mfunc.master.init_port_ref[port]; + return 0; +} + int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { - return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000); + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, + MLX4_CMD_WRAPPED); } EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) { - return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000); + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, + MLX4_CMD_NATIVE); } int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) { int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, MLX4_CMD_SET_ICM_SIZE, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret) return ret; @@ -967,30 +1644,12 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) int mlx4_NOP(struct mlx4_dev *dev) { /* Input modifier of 0x1f means "finish as soon as possible." */ - return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100); + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } -#define MLX4_WOL_SETUP_MODE (5 << 28) -int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) -{ - u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; - - return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, - MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A); -} -EXPORT_SYMBOL_GPL(mlx4_wol_read); - -int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) -{ - u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; - - return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, - MLX4_CMD_TIME_CLASS_A); -} -EXPORT_SYMBOL_GPL(mlx4_wol_write); - int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length, - u8 op_modifier, u32 in_offset[], u32 counter_out[]) + u8 op_modifier, u32 in_offset[], + u32 counter_out[]) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; @@ -1003,11 +1662,12 @@ int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length, outbox = mailbox->buf; ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier, - MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); if (ret) goto out; - for (i=0; i < array_length; i++) { + for (i = 0; i < array_length; i++) { if (in_offset[i] > MLX4_MAILBOX_SIZE) { ret = -EINVAL; goto out; @@ -1022,9 +1682,115 @@ int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length, } EXPORT_SYMBOL_GPL(mlx4_query_diag_counters); -void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported) +#define MLX4_WOL_SETUP_MODE (5 << 28) +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) { - *enable_pre_t11 = !!mlx4_pre_t11_mode; - *t11_supported = !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FC_T11); + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_wol_read); + +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_wol_write); + +enum { + ADD_TO_MCG = 0x26, +}; + + +void mlx4_opreq_action(struct work_struct *work) +{ + struct mlx4_priv *priv = container_of(work, struct mlx4_priv, opreq_task); + struct mlx4_dev *dev = &priv->dev; + int num_tasks = atomic_read(&priv->opreq_count); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 *outbox; + u32 modifier; + u16 token; + u16 type_m; + u16 type; + int err; + u32 num_qps; + struct mlx4_qp qp; + int i; + u8 rem_mcg; + u8 prot; + +#define GET_OP_REQ_MODIFIER_OFFSET 0x08 +#define GET_OP_REQ_TOKEN_OFFSET 0x14 +#define GET_OP_REQ_TYPE_OFFSET 0x1a +#define GET_OP_REQ_DATA_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n"); + return; + } + outbox = mailbox->buf; + + while (num_tasks) { + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to retreive required operation: %d\n", err); + return; + } + MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); + MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); + MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET); + type_m = type >> 12; + type &= 0xfff; + + switch (type) { + case ADD_TO_MCG: + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "ADD MCG operation is not supported in " + "DEVICE_MANAGED steerign mode\n"); + err = EPERM; + break; + } + mgm = (struct mlx4_mgm *) ((u8 *) (outbox) + GET_OP_REQ_DATA_OFFSET); + num_qps = be32_to_cpu(mgm->members_count) & MGM_QPN_MASK; + rem_mcg = ((u8 *) (&mgm->members_count))[0] & 1; + prot = ((u8 *) (&mgm->members_count))[0] >> 6; + + for (i = 0; i < num_qps; i++) { + qp.qpn = be32_to_cpu(mgm->qp[i]); + if (rem_mcg) + err = mlx4_multicast_detach(dev, &qp, mgm->gid, prot, 0); + else + err = mlx4_multicast_attach(dev, &qp, mgm->gid, mgm->gid[5] ,0, prot, NULL); + if (err) + break; + } + break; + default: + mlx4_warn(dev, "Bad type for required operation\n"); + err = EINVAL; + break; + } + err = mlx4_cmd(dev, 0, ((u32) err | cpu_to_be32(token) << 16), 1, + MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to acknowledge required request: %d\n", err); + goto out; + } + memset(outbox, 0, 0xffc); + num_tasks = atomic_dec_return(&priv->opreq_count); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); } -EXPORT_SYMBOL_GPL(mlx4_get_fc_t11_settings); diff --git a/sys/ofed/drivers/net/mlx4/fw.h b/sys/ofed/drivers/net/mlx4/fw.h index fbcab2163682..5fe77821aeb3 100644 --- a/sys/ofed/drivers/net/mlx4/fw.h +++ b/sys/ofed/drivers/net/mlx4/fw.h @@ -78,10 +78,10 @@ struct mlx4_dev_cap { u16 wavelength[MLX4_MAX_PORTS + 1]; u64 trans_code[MLX4_MAX_PORTS + 1]; u16 stat_rate_support; - int udp_rss; - int loopback_support; - int wol; + int fs_log_max_ucast_qp_range_size; + int fs_max_num_qp_per_entry; u64 flags; + u64 flags2; int reserved_uars; int uar_size; int min_page_sz; @@ -108,17 +108,41 @@ struct mlx4_dev_cap { int dmpt_entry_sz; int cmpt_entry_sz; int mtt_entry_sz; - int inline_cfg; int resize_srq; u32 bmme_flags; u32 reserved_lkey; u64 max_icm_sz; int max_gso_sz; + int max_rss_tbl_sz; u8 supported_port_types[MLX4_MAX_PORTS + 1]; + u8 suggested_type[MLX4_MAX_PORTS + 1]; + u8 default_sense[MLX4_MAX_PORTS + 1]; u8 log_max_macs[MLX4_MAX_PORTS + 1]; u8 log_max_vlans[MLX4_MAX_PORTS + 1]; u32 max_basic_counters; - u32 max_ext_counters; + u32 sync_qp; + u8 timestamp_support; + u32 max_extended_counters; +}; + +struct mlx4_func_cap { + u8 num_ports; + u8 flags; + u32 pf_context_behaviour; + int qp_quota; + int cq_quota; + int srq_quota; + int mpt_quota; + int mtt_quota; + int max_eq; + int reserved_eq; + int mcg_quota; + u32 qp0_tunnel_qpn; + u32 qp0_proxy_qpn; + u32 qp1_tunnel_qpn; + u32 qp1_proxy_qpn; + u8 physical_port; + u8 port_flags; }; struct mlx4_adapter { @@ -138,8 +162,10 @@ struct mlx4_init_hca_param { u64 dmpt_base; u64 cmpt_base; u64 mtt_base; + u64 global_caps; u16 log_mc_entry_sz; u16 log_mc_hash_sz; + u16 hca_core_clock; u8 log_num_qps; u8 log_num_srqs; u8 log_num_cqs; @@ -148,6 +174,9 @@ struct mlx4_init_hca_param { u8 log_mc_table_sz; u8 log_mpt_sz; u8 log_uar_sz; + u8 uar_page_sz; /* log pg sz in 4k chunks */ + u8 steering_mode; /* for QUERY_HCA */ + u64 dev_cap_enabled; }; struct mlx4_init_ib_param { @@ -172,16 +201,27 @@ struct mlx4_set_ib_param { }; int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, + struct mlx4_func_cap *func_cap); +int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_FA(struct mlx4_dev *dev); int mlx4_RUN_FW(struct mlx4_dev *dev); int mlx4_QUERY_FW(struct mlx4_dev *dev); int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter); int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); +int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic); int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt); int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages); +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); int mlx4_NOP(struct mlx4_dev *dev); int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); +void mlx4_opreq_action(struct work_struct *work); #endif /* MLX4_FW_H */ diff --git a/sys/ofed/drivers/net/mlx4/icm.c b/sys/ofed/drivers/net/mlx4/icm.c index 3a14d6b6b1b7..d18fde150daa 100644 --- a/sys/ofed/drivers/net/mlx4/icm.c +++ b/sys/ofed/drivers/net/mlx4/icm.c @@ -31,10 +31,10 @@ * SOFTWARE. */ -#include #include #include #include +#include #include @@ -93,13 +93,17 @@ void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent) kfree(icm); } -static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, + gfp_t gfp_mask, int node) { struct page *page; - page = alloc_pages(gfp_mask, order); - if (!page) - return -ENOMEM; + page = alloc_pages_node(node, gfp_mask, order); + if (!page) { + page = alloc_pages(gfp_mask, order); + if (!page) + return -ENOMEM; + } sg_set_page(mem, page, PAGE_SIZE << order, 0); return 0; @@ -130,9 +134,13 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, /* We use sg_set_buf for coherent allocs, which assumes low memory */ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); - icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); - if (!icm) - return NULL; + icm = kmalloc_node(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), + dev->numa_node); + if (!icm) { + icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) + return NULL; + } icm->refcount = 0; INIT_LIST_HEAD(&icm->chunk_list); @@ -141,10 +149,15 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, while (npages > 0) { if (!chunk) { - chunk = kmalloc(sizeof *chunk, - gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); - if (!chunk) - goto fail; + chunk = kmalloc_node(sizeof *chunk, + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), + dev->numa_node); + if (!chunk) { + chunk = kmalloc(sizeof *chunk, + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!chunk) + goto fail; + } sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN); chunk->npages = 0; @@ -161,31 +174,33 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, cur_order, gfp_mask); else ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages], - cur_order, gfp_mask); + cur_order, gfp_mask, + dev->numa_node); - if (!ret) { - ++chunk->npages; + if (ret) { + if (--cur_order < 0) + goto fail; + else + continue; + } - if (coherent) - ++chunk->nsg; - else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, - chunk->npages, - PCI_DMA_BIDIRECTIONAL); + ++chunk->npages; - if (chunk->nsg <= 0) - goto fail; - } + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); - if (chunk->npages == MLX4_ICM_CHUNK_LEN) - chunk = NULL; - - npages -= 1 << cur_order; - } else { - --cur_order; - if (cur_order < 0) + if (chunk->nsg <= 0) goto fail; } + + if (chunk->npages == MLX4_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; } if (!coherent && chunk) { @@ -209,36 +224,10 @@ static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt) return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt); } -int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count) +static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count) { return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM, - MLX4_CMD_TIME_CLASS_B); -} - -int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt) -{ - struct mlx4_cmd_mailbox *mailbox; - __be64 *inbox; - int err; - - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - inbox = mailbox->buf; - - inbox[0] = cpu_to_be64(virt); - inbox[1] = cpu_to_be64(dma_addr); - - err = mlx4_cmd(dev, mailbox->dma, 1, 0, MLX4_CMD_MAP_ICM, - MLX4_CMD_TIME_CLASS_B); - - mlx4_free_cmd_mailbox(dev, mailbox); - - if (!err) - mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n", - (unsigned long long) dma_addr, (unsigned long long) virt); - - return err; + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm) @@ -248,12 +237,14 @@ int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm) int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) { - int i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); + u32 i = (obj & (table->num_obj - 1)) / + (MLX4_TABLE_CHUNK_SIZE / table->obj_size); int ret = 0; mutex_lock(&table->mutex); @@ -286,16 +277,18 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) return ret; } -void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) { - int i; + u32 i; + u64 offset; i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); mutex_lock(&table->mutex); if (--table->icm[i]->refcount == 0) { - mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, + offset = (u64) i * MLX4_TABLE_CHUNK_SIZE; + mlx4_UNMAP_ICM(dev, table->virt + offset, MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); mlx4_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; @@ -304,9 +297,11 @@ void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) mutex_unlock(&table->mutex); } -void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle) +void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, + dma_addr_t *dma_handle) { - int idx, offset, dma_offset, i; + int offset, dma_offset, i; + u64 idx; struct mlx4_icm_chunk *chunk; struct mlx4_icm *icm; struct page *page = NULL; @@ -316,7 +311,7 @@ void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_han mutex_lock(&table->mutex); - idx = (obj & (table->num_obj - 1)) * table->obj_size; + idx = (u64) (obj & (table->num_obj - 1)) * table->obj_size; icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE]; dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE; @@ -350,10 +345,11 @@ void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_han } int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end) + u32 start, u32 end) { int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size; - int i, err; + int err; + u32 i; for (i = start; i <= end; i += inc) { err = mlx4_table_get(dev, table, i); @@ -373,22 +369,23 @@ int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, } void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end) + u32 start, u32 end) { - int i; + u32 i; for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size) mlx4_table_put(dev, table, i); } int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, - u64 virt, int obj_size, int nobj, int reserved, + u64 virt, int obj_size, u32 nobj, int reserved, int use_lowmem, int use_coherent) { int obj_per_chunk; int num_icm; unsigned chunk_size; int i; + u64 size; obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size; num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk; @@ -404,10 +401,12 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, table->coherent = use_coherent; mutex_init(&table->mutex); + size = (u64) nobj * obj_size; for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { chunk_size = MLX4_TABLE_CHUNK_SIZE; - if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > nobj * obj_size) - chunk_size = PAGE_ALIGN(nobj * obj_size - i * MLX4_TABLE_CHUNK_SIZE); + if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size) + chunk_size = PAGE_ALIGN(size - + i * MLX4_TABLE_CHUNK_SIZE); table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT, (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | @@ -437,6 +436,8 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, mlx4_free_icm(dev, table->icm[i], use_coherent); } + kfree(table->icm); + return -ENOMEM; } diff --git a/sys/ofed/drivers/net/mlx4/icm.h b/sys/ofed/drivers/net/mlx4/icm.h index b87f7261c567..f83ad8141dc4 100644 --- a/sys/ofed/drivers/net/mlx4/icm.h +++ b/sys/ofed/drivers/net/mlx4/icm.h @@ -71,17 +71,17 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask, int coherent); void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end); +void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end); int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, - u64 virt, int obj_size, int nobj, int reserved, + u64 virt, int obj_size, u32 nobj, int reserved, int use_lowmem, int use_coherent); void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle); -int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); -void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); +void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle); static inline void mlx4_icm_first(struct mlx4_icm *icm, struct mlx4_icm_iter *iter) @@ -122,9 +122,5 @@ static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter) return sg_dma_len(&iter->chunk->mem[iter->page_idx]); } -int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count); -int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt); -int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); -int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); #endif /* MLX4_ICM_H */ diff --git a/sys/ofed/drivers/net/mlx4/intf.c b/sys/ofed/drivers/net/mlx4/intf.c index bdf7e7d02a11..0f6754bc547e 100644 --- a/sys/ofed/drivers/net/mlx4/intf.c +++ b/sys/ofed/drivers/net/mlx4/intf.c @@ -31,6 +31,8 @@ * SOFTWARE. */ +#include + #include "mlx4.h" struct mlx4_device_context { @@ -112,37 +114,8 @@ void mlx4_unregister_interface(struct mlx4_interface *intf) } EXPORT_SYMBOL_GPL(mlx4_unregister_interface); -struct mlx4_dev *mlx4_query_interface(void *int_dev, int *port) -{ - struct mlx4_priv *priv; - struct mlx4_device_context *dev_ctx; - enum mlx4_query_reply r; - unsigned long flags; - - mutex_lock(&intf_mutex); - - list_for_each_entry(priv, &dev_list, dev_list) { - spin_lock_irqsave(&priv->ctx_lock, flags); - list_for_each_entry(dev_ctx, &priv->ctx_list, list) { - if (!dev_ctx->intf->query) - continue; - r = dev_ctx->intf->query(dev_ctx->context, int_dev); - if (r != MLX4_QUERY_NOT_MINE) { - *port = r; - spin_unlock_irqrestore(&priv->ctx_lock, flags); - mutex_unlock(&intf_mutex); - return &priv->dev; - } - } - spin_unlock_irqrestore(&priv->ctx_lock, flags); - } - - mutex_unlock(&intf_mutex); - return NULL; -} -EXPORT_SYMBOL_GPL(mlx4_query_interface); - -void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port) +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_device_context *dev_ctx; @@ -152,7 +125,7 @@ void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int por list_for_each_entry(dev_ctx, &priv->ctx_list, list) if (dev_ctx->intf->event) - dev_ctx->intf->event(dev, dev_ctx->context, type, port); + dev_ctx->intf->event(dev, dev_ctx->context, type, param); spin_unlock_irqrestore(&priv->ctx_lock, flags); } @@ -169,7 +142,8 @@ int mlx4_register_device(struct mlx4_dev *dev) mlx4_add_device(intf, priv); mutex_unlock(&intf_mutex); - mlx4_start_catas_poll(dev); + if (!mlx4_is_slave(dev)) + mlx4_start_catas_poll(dev); return 0; } @@ -179,7 +153,8 @@ void mlx4_unregister_device(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_interface *intf; - mlx4_stop_catas_poll(dev); + if (!mlx4_is_slave(dev)) + mlx4_stop_catas_poll(dev); mutex_lock(&intf_mutex); list_for_each_entry(intf, &intf_list, list) @@ -190,7 +165,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev) mutex_unlock(&intf_mutex); } -void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port) +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_device_context *dev_ctx; @@ -200,13 +175,13 @@ void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int por spin_lock_irqsave(&priv->ctx_lock, flags); list_for_each_entry(dev_ctx, &priv->ctx_list, list) - if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_prot_dev) { - result = dev_ctx->intf->get_prot_dev(dev, dev_ctx->context, port); + if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) { + result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port); break; - } + } spin_unlock_irqrestore(&priv->ctx_lock, flags); return result; } - +EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev); diff --git a/sys/ofed/drivers/net/mlx4/main.c b/sys/ofed/drivers/net/mlx4/main.c index b0897bfad183..dd10029af959 100644 --- a/sys/ofed/drivers/net/mlx4/main.c +++ b/sys/ofed/drivers/net/mlx4/main.c @@ -38,7 +38,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -62,10 +65,6 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif /* CONFIG_MLX4_DEBUG */ -int mlx4_blck_lb=1; -module_param_named(block_loopback, mlx4_blck_lb, int, 0644); -MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0"); - #ifdef CONFIG_PCI_MSI static int msi_x = 1; @@ -78,125 +77,178 @@ MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #endif /* CONFIG_PCI_MSI */ +static int enable_sys_tune = 0; +module_param(enable_sys_tune, int, 0444); +MODULE_PARM_DESC(enable_sys_tune, "Tune the cpu's for better performance (default 0)"); + +int mlx4_blck_lb = 1; +module_param_named(block_loopback, mlx4_blck_lb, int, 0644); +MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0 " + "(default: 1)"); + +static int num_vfs; +module_param(num_vfs, int, 0444); +MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0"); + +static int probe_vf; +module_param(probe_vf, int, 0644); +MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)"); + +int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; + +module_param_named(log_num_mgm_entry_size, + mlx4_log_num_mgm_entry_size, int, 0444); +MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" + " of qp per mcg, for example:" + " 10 gives 248.range: 7 <=" + " log_num_mgm_entry_size <= 12." + " To activate device managed" + " flow steering when available, set to -1"); + +static int high_rate_steer; +module_param(high_rate_steer, int, 0444); +MODULE_PARM_DESC(high_rate_steer, "Enable steering mode for higher packet rate" + " (default off)"); + +static int fast_drop; +module_param_named(fast_drop, fast_drop, int, 0444); +MODULE_PARM_DESC(fast_drop, + "Enable fast packet drop when no recieve WQEs are posted"); + +int mlx4_enable_64b_cqe_eqe; +module_param_named(enable_64b_cqe_eqe, mlx4_enable_64b_cqe_eqe, int, 0644); +MODULE_PARM_DESC(enable_64b_cqe_eqe, + "Enable 64 byte CQEs/EQEs when the the FW supports this, if nonzero"); + +#define HCA_GLOBAL_CAP_MASK 0 + +#define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE + static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; -struct mutex drv_mutex; - -static struct mlx4_profile default_profile = { - .num_qp = 1 << 18, - .num_srq = 1 << 16, - .rdmarc_per_qp = 1 << 4, - .num_cq = 1 << 16, - .num_mcg = 1 << 13, - .num_mpt = 1 << 19, - .num_mtt = 1 << 20, -}; - -static int log_num_mac = 2; +static int log_num_mac = 7; module_param_named(log_num_mac, log_num_mac, int, 0444); MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); -static int use_prio; -module_param_named(use_prio, use_prio, bool, 0444); -MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports " - "(0/1, default 0)"); +static int log_num_vlan; +module_param_named(log_num_vlan, log_num_vlan, int, 0444); +MODULE_PARM_DESC(log_num_vlan, + "(Obsolete) Log2 max number of VLANs per ETH port (0-7)"); +/* Log2 max number of VLANs per ETH port (0-7) */ +#define MLX4_LOG_NUM_VLANS 7 -static struct mlx4_profile mod_param_profile = { 0 }; - -module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444); -MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA"); - -module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444); -MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA"); - -module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444); -MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP"); - -module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444); -MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA"); - -module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444); -MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA"); - -module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444); -MODULE_PARM_DESC(log_num_mpt, - "log maximum number of memory protection table entries per HCA"); - -module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444); -MODULE_PARM_DESC(log_num_mtt, - "log maximum number of memory translation table segments per HCA"); - -static int log_mtts_per_seg = 0; +int log_mtts_per_seg = ilog2(1); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); -MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment " + "(0-7) (default: 0)"); -static void process_mod_param_profile(void) -{ - default_profile.num_qp = (mod_param_profile.num_qp ? - 1 << mod_param_profile.num_qp : - default_profile.num_qp); - default_profile.num_srq = (mod_param_profile.num_srq ? - 1 << mod_param_profile.num_srq : - default_profile.num_srq); - default_profile.rdmarc_per_qp = (mod_param_profile.rdmarc_per_qp ? - 1 << mod_param_profile.rdmarc_per_qp : - default_profile.rdmarc_per_qp); - default_profile.num_cq = (mod_param_profile.num_cq ? - 1 << mod_param_profile.num_cq : - default_profile.num_cq); - default_profile.num_mcg = (mod_param_profile.num_mcg ? - 1 << mod_param_profile.num_mcg : - default_profile.num_mcg); - default_profile.num_mpt = (mod_param_profile.num_mpt ? - 1 << mod_param_profile.num_mpt : - default_profile.num_mpt); - default_profile.num_mtt = (mod_param_profile.num_mtt ? - 1 << mod_param_profile.num_mtt : - default_profile.num_mtt); -} +static int port_type_array[2] = {MLX4_PORT_TYPE_NONE, MLX4_PORT_TYPE_NONE}; +#if 0 +static int arr_argc = 2; +module_param_array(port_type_array, int, &arr_argc, 0444); +MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default " + "1 for IB, 2 for Ethernet"); +#endif -struct mlx4_port_config -{ +struct mlx4_port_config { struct list_head list; enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; struct pci_dev *pdev; }; -static LIST_HEAD(config_list); -static void mlx4_config_cleanup(void) +#define MLX4_LOG_NUM_MTT 20 +/* We limit to 30 as of a bit map issue which uses int and not uint. + see mlx4_buddy_init -> bitmap_zero which gets int. +*/ +#define MLX4_MAX_LOG_NUM_MTT 30 +static struct mlx4_profile mod_param_profile = { + .num_qp = 19, + .num_srq = 16, + .rdmarc_per_qp = 4, + .num_cq = 16, + .num_mcg = 13, + .num_mpt = 19, + .num_mtt = 0, /* max(20, 2*MTTs for host memory)) */ +}; + +module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444); +MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA (default: 19)"); + +module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444); +MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA " + "(default: 16)"); + +module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, + 0444); +MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP " + "(default: 4)"); + +module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444); +MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA (default: 16)"); + +module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444); +MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA " + "(default: 13)"); + +module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444); +MODULE_PARM_DESC(log_num_mpt, + "log maximum number of memory protection table entries per " + "HCA (default: 19)"); + +module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444); +MODULE_PARM_DESC(log_num_mtt, + "log maximum number of memory translation table segments per " + "HCA (default: max(20, 2*MTTs for register all of the host memory limited to 30))"); + +enum { + MLX4_IF_STATE_BASIC, + MLX4_IF_STATE_EXTENDED +}; +static void process_mod_param_profile(struct mlx4_profile *profile) { - struct mlx4_port_config *config, *tmp; - list_for_each_entry_safe(config, tmp, &config_list, list) { - list_del(&config->list); - kfree(config); + vm_size_t hwphyssz; + hwphyssz = 0; + TUNABLE_ULONG_FETCH("hw.realmem", (u_long *) &hwphyssz); + + profile->num_qp = 1 << mod_param_profile.num_qp; + profile->num_srq = 1 << mod_param_profile.num_srq; + profile->rdmarc_per_qp = 1 << mod_param_profile.rdmarc_per_qp; + profile->num_cq = 1 << mod_param_profile.num_cq; + profile->num_mcg = 1 << mod_param_profile.num_mcg; + profile->num_mpt = 1 << mod_param_profile.num_mpt; + /* + * We want to scale the number of MTTs with the size of the + * system memory, since it makes sense to register a lot of + * memory on a system with a lot of memory. As a heuristic, + * make sure we have enough MTTs to register twice the system + * memory (with PAGE_SIZE entries). + * + * This number has to be a power of two and fit into 32 bits + * due to device limitations. We cap this at 2^30 as of bit map + * limitation to work with int instead of uint (mlx4_buddy_init -> bitmap_zero) + * That limits us to 4TB of memory registration per HCA with + * 4KB pages, which is probably OK for the next few months. + */ + if (mod_param_profile.num_mtt) + profile->num_mtt = 1 << mod_param_profile.num_mtt; + else { + profile->num_mtt = + roundup_pow_of_two(max_t(unsigned, + 1 << (MLX4_LOG_NUM_MTT - log_mtts_per_seg), + min(1UL << + (MLX4_MAX_LOG_NUM_MTT - + log_mtts_per_seg), + (hwphyssz << 1) + >> log_mtts_per_seg))); + /* set the actual value, so it will be reflected to the user + using the sysfs */ + mod_param_profile.num_mtt = ilog2(profile->num_mtt * (1 << log_mtts_per_seg)); } } -void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port) -{ - return mlx4_find_get_prot_dev(dev, proto, port); -} -EXPORT_SYMBOL(mlx4_get_prot_dev); - -void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - - priv->iboe_counter_index[port - 1] = index; -} -EXPORT_SYMBOL(mlx4_set_iboe_counter); - -int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - - return priv->iboe_counter_index[port - 1]; -} -EXPORT_SYMBOL(mlx4_get_iboe_counter); - int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -230,19 +282,6 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev) dev->caps.port_mask[i] = dev->caps.port_type[i]; } -static u8 get_counters_mode(u64 flags) -{ - switch (flags >> 48 & 3) { - case 2: - case 3: - return MLX4_CUNTERS_EXT; - case 1: - return MLX4_CUNTERS_BASIC; - default: - return MLX4_CUNTERS_DISABLED; - } -} - static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; @@ -276,21 +315,29 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) } dev->caps.num_ports = dev_cap->num_ports; + dev->phys_caps.num_phys_eqs = MLX4_MAX_EQ_NUM; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.vl_cap[i] = dev_cap->max_vl[i]; dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; + dev->phys_caps.gid_phys_table_len[i] = dev_cap->max_gids[i]; + dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i]; + /* set gid and pkey table operating lengths by default + * to non-sriov values */ dev->caps.gid_table_len[i] = dev_cap->max_gids[i]; dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i]; dev->caps.port_width_cap[i] = dev_cap->max_port_width[i]; dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i]; dev->caps.def_mac[i] = dev_cap->def_mac[i]; dev->caps.supported_type[i] = dev_cap->supported_port_types[i]; + dev->caps.suggested_type[i] = dev_cap->suggested_type[i]; + dev->caps.default_sense[i] = dev_cap->default_sense[i]; dev->caps.trans_type[i] = dev_cap->trans_type[i]; dev->caps.vendor_oui[i] = dev_cap->vendor_oui[i]; dev->caps.wavelength[i] = dev_cap->wavelength[i]; dev->caps.trans_code[i] = dev_cap->trans_code[i]; } + dev->caps.uar_page_size = PAGE_SIZE; dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; dev->caps.bf_reg_size = dev_cap->bf_reg_size; @@ -304,51 +351,96 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_srqs = dev_cap->reserved_srqs; dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz; dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; - dev->caps.num_qp_per_mgm = MLX4_QP_PER_MGM; /* * Subtract 1 from the limit because we need to allocate a - * spare CQE so the HCA HW can tell the difference between an - * empty CQ and a full CQ. + * spare CQE to enable resizing the CQ */ dev->caps.max_cqes = dev_cap->max_cq_sz - 1; dev->caps.reserved_cqs = dev_cap->reserved_cqs; dev->caps.reserved_eqs = dev_cap->reserved_eqs; - dev->caps.mtts_per_seg = 1 << log_mtts_per_seg; - dev->caps.reserved_mtts = DIV_ROUND_UP(dev_cap->reserved_mtts, - dev->caps.mtts_per_seg); + dev->caps.reserved_mtts = dev_cap->reserved_mtts; dev->caps.reserved_mrws = dev_cap->reserved_mrws; - dev->caps.reserved_uars = dev_cap->reserved_uars; + + /* The first 128 UARs are used for EQ doorbells */ + dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); dev->caps.reserved_pds = dev_cap->reserved_pds; - dev->caps.mtt_entry_sz = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; + dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->reserved_xrcds : 0; + dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->max_xrcds : 0; + dev->caps.mtt_entry_sz = dev_cap->mtt_entry_sz; + dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); dev->caps.flags = dev_cap->flags; + dev->caps.flags2 = dev_cap->flags2; dev->caps.bmme_flags = dev_cap->bmme_flags; dev->caps.reserved_lkey = dev_cap->reserved_lkey; dev->caps.stat_rate_support = dev_cap->stat_rate_support; - dev->caps.udp_rss = dev_cap->udp_rss; - dev->caps.loopback_support = dev_cap->loopback_support; - dev->caps.wol = dev_cap->wol; + dev->caps.cq_timestamp = dev_cap->timestamp_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; - dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? - dev_cap->reserved_xrcds : 0; - dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? - dev_cap->max_xrcds : 0; + dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + + /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ + if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) + dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; + /* Don't do sense port on multifunction devices (for now at least) */ + if (mlx4_is_mfunc(dev)) + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; dev->caps.log_num_macs = log_num_mac; - dev->caps.log_num_prios = use_prio ? 3 : 0; + dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; + + dev->caps.fast_drop = fast_drop ? + !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FAST_DROP) : + 0; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; if (dev->caps.supported_type[i]) { - if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH) - dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; - else + /* if only ETH is supported - assign ETH */ + if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH) dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + /* if only IB is supported, assign IB */ + else if (dev->caps.supported_type[i] == + MLX4_PORT_TYPE_IB) + dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; + else { + /* if IB and ETH are supported, we set the port + * type according to user selection of port type; + * if user selected none, take the FW hint */ + if (port_type_array[i - 1] == MLX4_PORT_TYPE_NONE) + dev->caps.port_type[i] = dev->caps.suggested_type[i] ? + MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB; + else + dev->caps.port_type[i] = port_type_array[i - 1]; + } } - dev->caps.possible_type[i] = dev->caps.port_type[i]; + /* + * Link sensing is allowed on the port if 3 conditions are true: + * 1. Both protocols are supported on the port. + * 2. Different types are supported on the port + * 3. FW declared that it supports link sensing + */ mlx4_priv(dev)->sense.sense_allowed[i] = - dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO; + ((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)); + + /* + * If "default_sense" bit is set, we move the port to "AUTO" mode + * and perform sense_port FW command to try and set the correct + * port type from beginning + */ + if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) { + enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE; + dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO; + mlx4_SENSE_PORT(dev, i, &sensed_port); + if (sensed_port != MLX4_PORT_TYPE_NONE) + dev->caps.port_type[i] = sensed_port; + } else { + dev->caps.possible_type[i] = dev->caps.port_type[i]; + } if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) { dev->caps.log_num_macs = dev_cap->log_max_macs[i]; @@ -356,53 +448,317 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) "for port %d, reducing to %d.\n", i, 1 << dev->caps.log_num_macs); } - dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; + if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) { + dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; + mlx4_warn(dev, "Requested number of VLANs is too much " + "for port %d, reducing to %d.\n", + i, 1 << dev->caps.log_num_vlans); + } } - dev->caps.counters_mode = get_counters_mode(dev_cap->flags); - dev->caps.max_basic_counters = 1 << ilog2(dev_cap->max_basic_counters); - dev->caps.max_ext_counters = 1 << ilog2(dev_cap->max_ext_counters); + dev->caps.max_basic_counters = dev_cap->max_basic_counters; + dev->caps.max_extended_counters = dev_cap->max_extended_counters; + /* support extended counters if available */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) + dev->caps.max_counters = dev->caps.max_extended_counters; + else + dev->caps.max_counters = dev->caps.max_basic_counters; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = (1 << dev->caps.log_num_macs) * (1 << dev->caps.log_num_vlans) * - (1 << dev->caps.log_num_prios) * dev->caps.num_ports; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + - dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR]; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; - return 0; -} + dev->caps.sync_qp = dev_cap->sync_qp; + dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0; -static int mlx4_save_config(struct mlx4_dev *dev) -{ - struct mlx4_port_config *config; - int i; - - list_for_each_entry(config, &config_list, list) { - if (config->pdev == dev->pdev) { - for (i = 1; i <= dev->caps.num_ports; i++) - config->port_type[i] = dev->caps.possible_type[i]; - return 0; + if (!mlx4_enable_64b_cqe_eqe) { + if (dev_cap->flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { + mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; } } - config = kmalloc(sizeof(struct mlx4_port_config), GFP_KERNEL); - if (!config) - return -ENOMEM; - - config->pdev = dev->pdev; - for (i = 1; i <= dev->caps.num_ports; i++) - config->port_type[i] = dev->caps.possible_type[i]; - - list_add_tail(&config->list, &config_list); + if ((dev->caps.flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) && + mlx4_is_master(dev)) + dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; return 0; } +/*The function checks if there are live vf, return the num of them*/ +static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i; + int ret = 0; + + for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + if (s_state->active && s_state->last_cmd != + MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "%s: slave: %d is still active\n", + __func__, i); + ret++; + } + } + return ret; +} + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey) +{ + u32 qk = MLX4_RESERVED_QKEY_BASE; + + if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX || + qpn < dev->phys_caps.base_proxy_sqpn) + return -EINVAL; + + if (qpn >= dev->phys_caps.base_tunnel_sqpn) + /* tunnel qp */ + qk += qpn - dev->phys_caps.base_tunnel_sqpn; + else + qk += qpn - dev->phys_caps.base_proxy_sqpn; + *qkey = qk; + return 0; +} +EXPORT_SYMBOL(mlx4_get_parav_qkey); + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return; + + priv->virt2phys_pkey[slave][port - 1][i] = val; +} +EXPORT_SYMBOL(mlx4_sync_pkey_table); + +void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return; + + priv->slave_node_guids[slave] = guid; +} +EXPORT_SYMBOL(mlx4_put_slave_node_guid); + +__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return 0; + + return priv->slave_node_guids[slave]; +} +EXPORT_SYMBOL(mlx4_get_slave_node_guid); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave; + + if (!mlx4_is_master(dev)) + return 0; + + s_slave = &priv->mfunc.master.slave_state[slave]; + return !!s_slave->active; +} +EXPORT_SYMBOL(mlx4_is_slave_active); + +static void slave_adjust_steering_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *hca_param) +{ + dev->caps.steering_mode = hca_param->steering_mode; + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) + dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + else + dev->caps.num_qp_per_mgm = + 4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2); + + mlx4_dbg(dev, "Steering mode is: %s\n", + mlx4_steering_mode_str(dev->caps.steering_mode)); +} + +static int mlx4_slave_cap(struct mlx4_dev *dev) +{ + int err; + u32 page_size; + struct mlx4_dev_cap dev_cap; + struct mlx4_func_cap func_cap; + struct mlx4_init_hca_param hca_param; + int i; + + memset(&hca_param, 0, sizeof(hca_param)); + err = mlx4_QUERY_HCA(dev, &hca_param); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, aborting.\n"); + return err; + } + + /*fail if the hca has an unknown capability */ + if ((hca_param.global_caps | HCA_GLOBAL_CAP_MASK) != + HCA_GLOBAL_CAP_MASK) { + mlx4_err(dev, "Unknown hca global capabilities\n"); + return -ENOSYS; + } + + mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz; + + dev->caps.hca_core_clock = hca_param.hca_core_clock; + + memset(&dev_cap, 0, sizeof(dev_cap)); + dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + return err; + } + + err = mlx4_QUERY_FW(dev); + if (err) + mlx4_err(dev, "QUERY_FW command failed: could not get FW version.\n"); + + page_size = ~dev->caps.page_size_cap + 1; + mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); + if (page_size > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than " + "kernel PAGE_SIZE of %d, aborting.\n", + page_size, PAGE_SIZE); + return -ENODEV; + } + + /* slave gets uar page size from QUERY_HCA fw command */ + dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12); + + /* TODO: relax this assumption */ + if (dev->caps.uar_page_size != PAGE_SIZE) { + mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %d\n", + dev->caps.uar_page_size, PAGE_SIZE); + return -ENODEV; + } + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d).\n", + err); + return err; + } + + if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != + PF_CONTEXT_BEHAVIOUR_MASK) { + mlx4_err(dev, "Unknown pf context behaviour\n"); + return -ENOSYS; + } + + dev->caps.num_ports = func_cap.num_ports; + dev->quotas.qp = func_cap.qp_quota; + dev->quotas.srq = func_cap.srq_quota; + dev->quotas.cq = func_cap.cq_quota; + dev->quotas.mpt = func_cap.mpt_quota; + dev->quotas.mtt = func_cap.mtt_quota; + dev->caps.num_qps = 1 << hca_param.log_num_qps; + dev->caps.num_srqs = 1 << hca_param.log_num_srqs; + dev->caps.num_cqs = 1 << hca_param.log_num_cqs; + dev->caps.num_mpts = 1 << hca_param.log_mpt_sz; + dev->caps.num_eqs = func_cap.max_eq; + dev->caps.reserved_eqs = func_cap.reserved_eq; + dev->caps.num_pds = MLX4_NUM_PDS; + dev->caps.num_mgms = 0; + dev->caps.num_amgms = 0; + + if (dev->caps.num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, " + "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); + return -ENODEV; + } + + dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + + if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + err = -ENOMEM; + goto err_mem; + } + + for (i = 1; i <= dev->caps.num_ports; ++i) { + err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP port command failed for" + " port %d, aborting (%d).\n", i, err); + goto err_mem; + } + dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; + dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; + dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; + dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn; + dev->caps.port_mask[i] = dev->caps.port_type[i]; + err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, + &dev->caps.gid_table_len[i], + &dev->caps.pkey_table_len[i]); + if (err) + goto err_mem; + } + + if (dev->caps.uar_page_size * (dev->caps.num_uars - + dev->caps.reserved_uars) > + pci_resource_len(dev->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than " + "PCI resource 2 size of 0x%llx, aborting.\n", + dev->caps.uar_page_size * dev->caps.num_uars, + (unsigned long long) pci_resource_len(dev->pdev, 2)); + err = -ENOMEM; + goto err_mem; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + } else { + dev->caps.cqe_size = 32; + } + + slave_adjust_steering_mode(dev, &dev_cap, &hca_param); + + return 0; + +err_mem: + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + dev->caps.qp0_tunnel = dev->caps.qp0_proxy = + dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + + return err; +} /* * Change the port configuration of the device. @@ -418,16 +774,15 @@ int mlx4_change_port_types(struct mlx4_dev *dev, for (port = 0; port < dev->caps.num_ports; port++) { /* Change the port type only if the new type is different * from the current, and not set to Auto */ - if (port_types[port] != dev->caps.port_type[port + 1]) { + if (port_types[port] != dev->caps.port_type[port + 1]) change = 1; - dev->caps.port_type[port + 1] = port_types[port]; - } } if (change) { mlx4_unregister_device(dev); for (port = 1; port <= dev->caps.num_ports; port++) { mlx4_CLOSE_PORT(dev, port); - err = mlx4_SET_PORT(dev, port); + dev->caps.port_type[port] = port_types[port - 1]; + err = mlx4_SET_PORT(dev, port, -1); if (err) { mlx4_err(dev, "Failed to set port %d, " "aborting\n", port); @@ -435,7 +790,6 @@ int mlx4_change_port_types(struct mlx4_dev *dev, } } mlx4_set_port_mask(dev); - mlx4_save_config(dev); err = mlx4_register_device(dev); } @@ -487,7 +841,7 @@ static ssize_t set_port_type(struct device *dev, return -EINVAL; } - mlx4_stop_sense(mdev); + mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); /* Possible type is always the one that was delivered */ mdev->caps.possible_type[info->port] = info->tmp_type; @@ -499,14 +853,8 @@ static ssize_t set_port_type(struct device *dev, types[i] = mdev->caps.port_type[i+1]; } - if (priv->trig) { - if (++priv->changed_ports < mdev->caps.num_ports) - goto out; - else - priv->trig = priv->changed_ports = 0; - } - - if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && + !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) { for (i = 1; i <= mdev->caps.num_ports; i++) { if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { mdev->caps.possible_type[i] = mdev->caps.port_type[i]; @@ -541,22 +889,97 @@ static ssize_t set_port_type(struct device *dev, return err ? err : count; } -static ssize_t trigger_port(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +enum ibta_mtu { + IB_MTU_256 = 1, + IB_MTU_512 = 2, + IB_MTU_1024 = 3, + IB_MTU_2048 = 4, + IB_MTU_4096 = 5 +}; + +static inline int int_to_ibta_mtu(int mtu) { - struct pci_dev *pdev = to_pci_dev(dev); - struct mlx4_dev *mdev = pci_get_drvdata(pdev); - struct mlx4_priv *priv = container_of(mdev, struct mlx4_priv, dev); - - if (!priv) - return -ENODEV; - - mutex_lock(&priv->port_mutex); - priv->trig = 1; - mutex_unlock(&priv->port_mutex); - return count; + switch (mtu) { + case 256: return IB_MTU_256; + case 512: return IB_MTU_512; + case 1024: return IB_MTU_1024; + case 2048: return IB_MTU_2048; + case 4096: return IB_MTU_4096; + default: return -1; + } +} + +static inline int ibta_mtu_to_int(enum ibta_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: return 256; + case IB_MTU_512: return 512; + case IB_MTU_1024: return 1024; + case IB_MTU_2048: return 2048; + case IB_MTU_4096: return 4096; + default: return -1; + } +} + +static ssize_t show_port_ib_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_mtu_attr); + struct mlx4_dev *mdev = info->dev; + + if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); + + sprintf(buf, "%d\n", + ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port])); + return strlen(buf); +} + +static ssize_t set_port_ib_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_mtu_attr); + struct mlx4_dev *mdev = info->dev; + struct mlx4_priv *priv = mlx4_priv(mdev); + int err, port, mtu, ibta_mtu = -1; + + if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) { + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); + return -EINVAL; + } + + mtu = (int) simple_strtol(buf, NULL, 0); + ibta_mtu = int_to_ibta_mtu(mtu); + + if (ibta_mtu < 0) { + mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf); + return -EINVAL; + } + + mdev->caps.port_ib_mtu[info->port] = ibta_mtu; + + mlx4_stop_sense(mdev); + mutex_lock(&priv->port_mutex); + mlx4_unregister_device(mdev); + for (port = 1; port <= mdev->caps.num_ports; port++) { + mlx4_CLOSE_PORT(mdev, port); + err = mlx4_SET_PORT(mdev, port, -1); + if (err) { + mlx4_err(mdev, "Failed to set port %d, " + "aborting\n", port); + goto err_set_port; + } + } + err = mlx4_register_device(mdev); +err_set_port: + mutex_unlock(&priv->port_mutex); + mlx4_start_sense(mdev); + return err ? err : count; } -DEVICE_ATTR(port_trigger, S_IWUGO, NULL, trigger_port); static int mlx4_load_fw(struct mlx4_dev *dev) { @@ -597,6 +1020,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, { struct mlx4_priv *priv = mlx4_priv(dev); int err; + int num_eqs; err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, cmpt_base + @@ -626,12 +1050,13 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, if (err) goto err_srq; + num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_EQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), - cmpt_entry_sz, - dev->caps.num_eqs, dev->caps.num_eqs, 0, 0); + cmpt_entry_sz, num_eqs, num_eqs, 0, 0); if (err) goto err_cq; @@ -655,6 +1080,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, { struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; + int num_eqs; int err; err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); @@ -686,10 +1112,12 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, goto err_unmap_aux; } + + num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.table, init_hca->eqc_base, dev_cap->eqc_entry_sz, - dev->caps.num_eqs, dev->caps.num_eqs, - 0, 0); + num_eqs, num_eqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; @@ -709,7 +1137,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table, init_hca->mtt_base, dev->caps.mtt_entry_sz, - dev->caps.num_mtt_segs, + dev->caps.num_mtts, dev->caps.reserved_mtts, 1, 0); if (err) { mlx4_err(dev, "Failed to map MTT context memory, aborting.\n"); @@ -791,12 +1219,15 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, } /* - * It's not strictly required, but for simplicity just map the - * whole multicast group table now. The table isn't very big - * and it's a lot easier than trying to track ref counts. + * For flow steering device managed mode it is required to use + * mlx4_init_icm_table. For B0 steering mode it's not strictly + * required, but for simplicity just map the whole multicast + * group table now. The table isn't very big and it's a lot + * easier than trying to track ref counts. */ err = mlx4_init_icm_table(dev, &priv->mcg_table.table, - init_hca->mc_base, MLX4_MGM_ENTRY_SIZE, + init_hca->mc_base, + mlx4_get_mgm_entry_size(dev), dev->caps.num_mgms + dev->caps.num_amgms, dev->caps.num_mgms + dev->caps.num_amgms, 0, 0); @@ -872,6 +1303,16 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_free_icm(dev, priv->fw.aux_icm, 0); } +static void mlx4_slave_exit(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->cmd.slave_cmd_mutex); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) + mlx4_warn(dev, "Failed to close slave function.\n"); + mutex_unlock(&priv->cmd.slave_cmd_mutex); +} + static int map_bf_area(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -879,8 +1320,13 @@ static int map_bf_area(struct mlx4_dev *dev) resource_size_t bf_len; int err = 0; - bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); - bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); + if (!dev->caps.bf_reg_size) + return -ENXIO; + + bf_start = pci_resource_start(dev->pdev, 2) + + (dev->caps.num_uars << PAGE_SHIFT); + bf_len = pci_resource_len(dev->pdev, 2) - + (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) err = -ENOMEM; @@ -894,177 +1340,473 @@ static void unmap_bf_area(struct mlx4_dev *dev) io_mapping_free(mlx4_priv(dev)->bf_mapping); } +cycle_t mlx4_read_clock(struct mlx4_dev *dev) +{ + u32 clockhi, clocklo, clockhi1; + cycle_t cycles; + int i; + struct mlx4_priv *priv = mlx4_priv(dev); + + for (i = 0; i < 10; i++) { + clockhi = swab32(readl(priv->clock_mapping)); + clocklo = swab32(readl(priv->clock_mapping + 4)); + clockhi1 = swab32(readl(priv->clock_mapping)); + if (clockhi == clockhi1) + break; + } + + cycles = (u64) clockhi << 32 | (u64) clocklo; + + return cycles; +} +EXPORT_SYMBOL_GPL(mlx4_read_clock); + + +static int map_internal_clock(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->clock_mapping = ioremap(pci_resource_start(dev->pdev, + priv->fw.clock_bar) + + priv->fw.clock_offset, MLX4_CLOCK_SIZE); + + if (!priv->clock_mapping) + return -ENOMEM; + + return 0; +} + +static void unmap_internal_clock(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (priv->clock_mapping) + iounmap(priv->clock_mapping); +} + static void mlx4_close_hca(struct mlx4_dev *dev) { + unmap_internal_clock(dev); unmap_bf_area(dev); - mlx4_CLOSE_HCA(dev, 0); - mlx4_free_icms(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else { + mlx4_CLOSE_HCA(dev, 0); + mlx4_free_icms(dev); + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + } +} + +static int mlx4_init_slave(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 dma = (u64) priv->mfunc.vhcr_dma; + int num_of_reset_retries = NUM_OF_RESET_RETRIES; + int ret_from_reset = 0; + u32 slave_read; + u32 cmd_channel_ver; + + mutex_lock(&priv->cmd.slave_cmd_mutex); + priv->cmd.max_cmds = 1; + mlx4_warn(dev, "Sending reset\n"); + ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, + MLX4_COMM_TIME); + /* if we are in the middle of flr the slave will try + * NUM_OF_RESET_RETRIES times before leaving.*/ + if (ret_from_reset) { + if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { + msleep(SLEEP_TIME_IN_RESET); + while (ret_from_reset && num_of_reset_retries) { + mlx4_warn(dev, "slave is currently in the" + "middle of FLR. retrying..." + "(try num:%d)\n", + (NUM_OF_RESET_RETRIES - + num_of_reset_retries + 1)); + ret_from_reset = + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, + 0, MLX4_COMM_TIME); + num_of_reset_retries = num_of_reset_retries - 1; + } + } else + goto err; + } + + /* check the driver version - the slave I/F revision + * must match the master's */ + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + cmd_channel_ver = mlx4_comm_get_version(); + + if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != + MLX4_COMM_GET_IF_REV(slave_read)) { + mlx4_err(dev, "slave driver version is not supported" + " by the master\n"); + goto err; + } + + mlx4_warn(dev, "Sending vhcr0\n"); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) + goto err; + + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return 0; + +err: + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return -EIO; +} + +static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) +{ + int i; + + for (i = 1; i <= dev->caps.num_ports; i++) { + if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.gid_table_len[i] = + mlx4_get_slave_num_gids(dev, 0); + else + dev->caps.gid_table_len[i] = 1; + dev->caps.pkey_table_len[i] = + dev->phys_caps.pkey_phys_table_len[i] - 1; + } +} + +static int choose_log_fs_mgm_entry_size(int qp_per_entry) +{ + int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; + + for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE; + i++) { + if (qp_per_entry <= 4 * ((1 << i) / 16 - 2)) + break; + } + + return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1; +} + +static void choose_steering_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + // This is only valid to the integrated driver. + // The new ported mlx4_core driver is in B0 steering mode by default + // and the old mlx4_en driver is in A0 steering mode by default. + // If high_rate_steer == TRUE it means that A0 steering mode is on. + // The integration fix is to hard code high_rate_steer to TRUE. + high_rate_steer = 1; + + if (high_rate_steer && !mlx4_is_mfunc(dev)) { + dev->caps.flags &= ~(MLX4_DEV_CAP_FLAG_VEP_MC_STEER | + MLX4_DEV_CAP_FLAG_VEP_UC_STEER); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_FS_EN; + } + + if (mlx4_log_num_mgm_entry_size == -1 && + dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && + dev_cap->fs_log_max_ucast_qp_range_size == 0 && + (!mlx4_is_mfunc(dev) || + (dev_cap->fs_max_num_qp_per_entry >= (num_vfs + 1))) && + choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= + MLX4_MIN_MGM_LOG_ENTRY_SIZE) { + dev->oper_log_mgm_entry_size = + choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry); + dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; + dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + } else { + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER && + dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) { + dev->caps.steering_mode = MLX4_STEERING_MODE_B0; + } + else { + dev->caps.steering_mode = MLX4_STEERING_MODE_A0; + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER || + dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) + mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags " + "set to use B0 steering. Falling back to A0 steering mode.\n"); + } + dev->oper_log_mgm_entry_size = + mlx4_log_num_mgm_entry_size > 0 ? + mlx4_log_num_mgm_entry_size : + MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; + dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); + } + mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, " + "log_num_mgm_entry_size = %d\n", + mlx4_steering_mode_str(dev->caps.steering_mode), + dev->oper_log_mgm_entry_size, mlx4_log_num_mgm_entry_size); } static int mlx4_init_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_dev_cap *dev_cap = NULL; struct mlx4_adapter adapter; - struct mlx4_dev_cap dev_cap; struct mlx4_mod_stat_cfg mlx4_cfg; struct mlx4_profile profile; struct mlx4_init_hca_param init_hca; - struct mlx4_port_config *config; u64 icm_size; int err; - int i; - err = mlx4_QUERY_FW(dev); - if (err) { - if (err == -EACCES) - mlx4_info(dev, "non-primary physical function, skipping.\n"); - else - mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); - return err; - } + if (!mlx4_is_slave(dev)) { + err = mlx4_QUERY_FW(dev); + if (err) { + if (err == -EACCES) + mlx4_info(dev, "non-primary physical function, skipping.\n"); + else + mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); + return err; + } - err = mlx4_load_fw(dev); - if (err) { - mlx4_err(dev, "Failed to start FW, aborting.\n"); - return err; - } + err = mlx4_load_fw(dev); + if (err) { + mlx4_err(dev, "Failed to start FW, aborting.\n"); + return err; + } - mlx4_cfg.log_pg_sz_m = 1; - mlx4_cfg.log_pg_sz = 0; - err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); - if (err) - mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); + mlx4_cfg.log_pg_sz_m = 1; + mlx4_cfg.log_pg_sz = 0; + err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); + if (err) + mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); - err = mlx4_dev_cap(dev, &dev_cap); - if (err) { - mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); - goto err_stop_fw; - } + dev_cap = kzalloc(sizeof *dev_cap, GFP_KERNEL); + if (!dev_cap) { + mlx4_err(dev, "Failed to allocate memory for dev_cap\n"); + err = -ENOMEM; + goto err_stop_fw; + } - process_mod_param_profile(); - profile = default_profile; + err = mlx4_dev_cap(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + goto err_stop_fw; + } - list_for_each_entry(config, &config_list, list) { - if (config->pdev == dev->pdev) { - for (i = 1; i <= dev->caps.num_ports; i++) { - dev->caps.possible_type[i] = config->port_type[i]; - if (config->port_type[i] != MLX4_PORT_TYPE_AUTO) - dev->caps.port_type[i] = config->port_type[i]; - } + choose_steering_mode(dev, dev_cap); + + if (mlx4_is_master(dev)) + mlx4_parav_master_pf_caps(dev); + + process_mod_param_profile(&profile); + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + profile.num_mcg = MLX4_FS_NUM_MCG; + + icm_size = mlx4_make_profile(dev, &profile, dev_cap, + &init_hca); + if ((long long) icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } + + dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; + + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + init_hca.uar_page_sz = PAGE_SHIFT - 12; + + err = mlx4_init_icm(dev, dev_cap, &init_hca, icm_size); + if (err) + goto err_stop_fw; + + err = mlx4_INIT_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); + goto err_free_icm; + } + } else { + err = mlx4_init_slave(dev); + if (err) { + mlx4_err(dev, "Failed to initialize slave\n"); + return err; + } + + err = mlx4_slave_cap(dev); + if (err) { + mlx4_err(dev, "Failed to obtain slave caps\n"); + goto err_close; } } - mlx4_set_port_mask(dev); - icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca); - if ((long long) icm_size < 0) { - err = icm_size; - goto err_stop_fw; - } - if (map_bf_area(dev)) - mlx4_dbg(dev, "Kernel support for blue flame is not available for kernels < 2.6.28\n"); + mlx4_dbg(dev, "Failed to map blue flame area\n"); - init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + /* + * Read HCA frequency by QUERY_HCA command + */ + if (dev->caps.cq_timestamp) { + memset(&init_hca, 0, sizeof(init_hca)); + err = mlx4_QUERY_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, disable timestamp.\n"); + dev->caps.cq_timestamp = 0; + } else + dev->caps.hca_core_clock = init_hca.hca_core_clock; - err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); - if (err) - goto err_stop_fw; + /* + * In case we got HCA frequency 0 - disable timestamping + * to avoid dividing by zero + */ + if (!dev->caps.hca_core_clock) { + dev->caps.cq_timestamp = 0; + mlx4_err(dev, "HCA frequency is 0. " + "Timestamping is not supported."); + } - err = mlx4_INIT_HCA(dev, &init_hca); - if (err) { - mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); - goto err_free_icm; + /* + * Map internal clock, in case of failure disable timestamping + */ + if (map_internal_clock(dev)) { + dev->caps.cq_timestamp = 0; + mlx4_err(dev, "Failed to map internal clock. " + "Timestamping is not supported.\n"); + } } + /*Only the master set the ports, all the rest got it from it.*/ + if (!mlx4_is_slave(dev)) + mlx4_set_port_mask(dev); + err = mlx4_QUERY_ADAPTER(dev, &adapter); if (err) { mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n"); - goto err_close; + goto unmap_bf; } priv->eq_table.inta_pin = adapter.inta_pin; memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); + if (!mlx4_is_slave(dev)) + kfree(dev_cap); + return 0; +unmap_bf: + unmap_internal_clock(dev); + unmap_bf_area(dev); + + if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + } + err_close: - mlx4_CLOSE_HCA(dev, 0); + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else + mlx4_CLOSE_HCA(dev, 0); err_free_icm: - mlx4_free_icms(dev); + if (!mlx4_is_slave(dev)) + mlx4_free_icms(dev); err_stop_fw: - unmap_bf_area(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, priv->fw.fw_icm, 0); - + if (!mlx4_is_slave(dev)) { + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, priv->fw.fw_icm, 0); + if (dev_cap) + kfree(dev_cap); + } return err; } static int mlx4_init_counters_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int err; - int nent; + int res; + int nent_pow2; - switch (dev->caps.counters_mode) { - case MLX4_CUNTERS_BASIC: - nent = dev->caps.max_basic_counters; - break; - case MLX4_CUNTERS_EXT: - nent = dev->caps.max_ext_counters; - break; - default: + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; - } - err = mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0); - if (err) - return err; - return 0; + nent_pow2 = roundup_pow_of_two(dev->caps.max_counters); + res = mlx4_bitmap_init(&priv->counters_bitmap, nent_pow2, + nent_pow2 - 1, 0, + nent_pow2 - dev->caps.max_counters); + if (res) + return res; + + if (dev->caps.max_counters == dev->caps.max_basic_counters) + return 0; + + res = mlx4_cmd(dev, MLX4_IF_STATE_EXTENDED, 0, 0, + MLX4_CMD_SET_IF_STAT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + if (res) + mlx4_err(dev, "Failed to set extended counters (err=%d)\n", + res); + return res; + } static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) { - switch (dev->caps.counters_mode) { - case MLX4_CUNTERS_BASIC: - case MLX4_CUNTERS_EXT: + if (!mlx4_is_slave(dev) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap); - break; - default: - break; - } +} + +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return -ENOENT; + + *idx = mlx4_bitmap_alloc(&priv->counters_bitmap); + if (*idx == -1) + return -ENOMEM; + + return 0; } int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) { - struct mlx4_priv *priv = mlx4_priv(dev); + u64 out_param; + int err; - switch (dev->caps.counters_mode) { - case MLX4_CUNTERS_BASIC: - case MLX4_CUNTERS_EXT: - *idx = mlx4_bitmap_alloc(&priv->counters_bitmap); - if (*idx == -1) - return -ENOMEM; - return 0; - default: - return -ENOMEM; + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, + RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *idx = get_param_l(&out_param); + + return err; } + return __mlx4_counter_alloc(dev, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_alloc); +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx); + return; +} + void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) { - switch (dev->caps.counters_mode) { - case MLX4_CUNTERS_BASIC: - case MLX4_CUNTERS_EXT: - mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx); - return; - default: + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, idx); + mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); return; } + __mlx4_counter_free(dev, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_free); @@ -1078,18 +1820,19 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_init_uar_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "user access region table, aborting.\n"); + "user access region table (err=%d), aborting.\n", + err); return err; } err = mlx4_uar_alloc(dev, &priv->driver_uar); if (err) { - mlx4_err(dev, "Failed to allocate driver access region, " - "aborting.\n"); + mlx4_err(dev, "Failed to allocate driver access region " + "(err=%d), aborting.\n", err); goto err_uar_table_free; } - priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!priv->kar) { mlx4_err(dev, "Couldn't map kernel access region, " "aborting.\n"); @@ -1100,35 +1843,36 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_init_pd_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "protection domain table, aborting.\n"); + "protection domain table (err=%d), aborting.\n", err); goto err_kar_unmap; } err = mlx4_init_xrcd_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize extended " - "reliably connected domain table, aborting.\n"); + mlx4_err(dev, "Failed to initialize " + "reliable connection domain table (err=%d), " + "aborting.\n", err); goto err_pd_table_free; } err = mlx4_init_mr_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "memory region table, aborting.\n"); + "memory region table (err=%d), aborting.\n", err); goto err_xrcd_table_free; } err = mlx4_init_eq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "event queue table, aborting.\n"); + "event queue table (err=%d), aborting.\n", err); goto err_mr_table_free; } err = mlx4_cmd_use_events(dev); if (err) { mlx4_err(dev, "Failed to switch to event-driven " - "firmware commands, aborting.\n"); + "firmware commands (err=%d), aborting.\n", err); goto err_eq_table_free; } @@ -1154,50 +1898,74 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_init_cq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "completion queue table, aborting.\n"); + "completion queue table (err=%d), aborting.\n", err); goto err_cmd_poll; } err = mlx4_init_srq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "shared receive queue table, aborting.\n"); + "shared receive queue table (err=%d), aborting.\n", + err); goto err_cq_table_free; } err = mlx4_init_qp_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " - "queue pair table, aborting.\n"); + "queue pair table (err=%d), aborting.\n", err); goto err_srq_table_free; } - err = mlx4_init_mcg_table(dev); - if (err) { - mlx4_err(dev, "Failed to initialize " - "multicast group table, aborting.\n"); - goto err_qp_table_free; - } - - err = mlx4_init_counters_table(dev); - if (err && err != -ENOENT) { - mlx4_err(dev, "Failed to initialize counters table, aborting.\n"); - goto err_mcg_table_free; - } - - for (port = 1; port <= dev->caps.num_ports; port++) { - ib_port_default_caps = 0; - err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); - if (err) - mlx4_warn(dev, "failed to get port %d default " - "ib capabilities (%d). Continuing with " - "caps = 0\n", port, err); - dev->caps.ib_port_def_cap[port] = ib_port_default_caps; - err = mlx4_SET_PORT(dev, port); + if (!mlx4_is_slave(dev)) { + err = mlx4_init_mcg_table(dev); if (err) { - mlx4_err(dev, "Failed to set port %d, aborting\n", - port); - goto err_counters_table_free; + mlx4_err(dev, "Failed to initialize " + "multicast group table (err=%d), aborting.\n", + err); + goto err_qp_table_free; + } + + err = mlx4_init_counters_table(dev); + if (err && err != -ENOENT) { + mlx4_err(dev, "Failed to initialize counters table (err=%d), " + "aborting.\n", err); + goto err_mcg_table_free; + } + + for (port = 1; port <= dev->caps.num_ports; port++) { + ib_port_default_caps = 0; + err = mlx4_get_port_ib_caps(dev, port, + &ib_port_default_caps); + if (err) + mlx4_warn(dev, "failed to get port %d default " + "ib capabilities (%d). Continuing " + "with caps = 0\n", port, err); + dev->caps.ib_port_def_cap[port] = ib_port_default_caps; + + /* initialize per-slave default ib port capabilities */ + if (mlx4_is_master(dev)) { + int i; + for (i = 0; i < dev->num_slaves; i++) { + if (i == mlx4_master_func_num(dev)) + continue; + priv->mfunc.master.slave_state[i].ib_cap_mask[port] = + ib_port_default_caps; + } + } + + if (mlx4_is_mfunc(dev)) + dev->caps.port_ib_mtu[port] = IB_MTU_2048; + else + dev->caps.port_ib_mtu[port] = IB_MTU_4096; + + err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ? + dev->caps.pkey_table_len[port] : -1); + if (err) { + mlx4_err(dev, "Failed to set port %d (err=%d), " + "aborting\n", port, err); + goto err_counters_table_free; + } } } @@ -1248,13 +2016,16 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry *entries; - int nreq; + int nreq = min_t(int, dev->caps.num_ports * + min_t(int, num_possible_cpus() + 1, MAX_MSIX_P_PORT) + + MSIX_LEGACY_SZ, MAX_MSIX); int err; int i; if (msi_x) { nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, - num_possible_cpus() + 1); + nreq); + entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; @@ -1277,7 +2048,15 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) goto no_msi; } - dev->caps.num_comp_vectors = nreq - 1; + if (nreq < + MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) { + /*Working in legacy mode , all EQ's shared*/ + dev->caps.comp_pool = 0; + dev->caps.num_comp_vectors = nreq - 1; + } else { + dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; + dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; + } for (i = 0; i < nreq; ++i) priv->eq_table.eq[i].irq = entries[i].vector; @@ -1289,6 +2068,7 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) no_msi: dev->caps.num_comp_vectors = 1; + dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) priv->eq_table.eq[i].irq = dev->pdev->irq; @@ -1301,14 +2081,22 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->dev = dev; info->port = port; - mlx4_init_mac_table(dev, &info->mac_table); - mlx4_init_vlan_table(dev, &info->vlan_table); + if (!mlx4_is_slave(dev)) { + mlx4_init_mac_table(dev, &info->mac_table); + mlx4_init_vlan_table(dev, &info->vlan_table); + info->base_qpn = mlx4_get_base_qpn(dev, port); + } sprintf(info->dev_name, "mlx4_port%d", port); info->port_attr.attr.name = info->dev_name; - info->port_attr.attr.mode = S_IRUGO | S_IWUSR; + if (mlx4_is_mfunc(dev)) + info->port_attr.attr.mode = S_IRUGO; + else { + info->port_attr.attr.mode = S_IRUGO | S_IWUSR; + info->port_attr.store = set_port_type; + } info->port_attr.show = show_port_type; - info->port_attr.store = set_port_type; + sysfs_attr_init(&info->port_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_attr); if (err) { @@ -1316,6 +2104,24 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->port = -1; } + sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port); + info->port_mtu_attr.attr.name = info->dev_mtu_name; + if (mlx4_is_mfunc(dev)) + info->port_mtu_attr.attr.mode = S_IRUGO; + else { + info->port_mtu_attr.attr.mode = S_IRUGO | S_IWUSR; + info->port_mtu_attr.store = set_port_ib_mtu; + } + info->port_mtu_attr.show = show_port_ib_mtu; + sysfs_attr_init(&info->port_mtu_attr.attr); + + err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); + if (err) { + mlx4_err(dev, "Failed to create mtu file for port %d\n", port); + device_remove_file(&info->dev->pdev->dev, &info->port_attr); + info->port = -1; + } + return err; } @@ -1325,25 +2131,114 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info) return; device_remove_file(&info->dev->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); } -static int mlx4_init_trigger(struct mlx4_priv *priv) +static int mlx4_init_steering(struct mlx4_dev *dev) { - memcpy(&priv->trigger_attr, &dev_attr_port_trigger, - sizeof(struct device_attribute)); - return device_create_file(&priv->dev.pdev->dev, &priv->trigger_attr); + struct mlx4_priv *priv = mlx4_priv(dev); + int num_entries = dev->caps.num_ports; + int i, j; + + priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL); + if (!priv->steer) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + for (j = 0; j < MLX4_NUM_STEERS; j++) { + INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]); + INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]); + } + return 0; } -static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) +static void mlx4_clear_steering(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_steer_index *entry, *tmp_entry; + struct mlx4_promisc_qp *pqp, *tmp_pqp; + int num_entries = dev->caps.num_ports; + int i, j; + + for (i = 0; i < num_entries; i++) { + for (j = 0; j < MLX4_NUM_STEERS; j++) { + list_for_each_entry_safe(pqp, tmp_pqp, + &priv->steer[i].promisc_qps[j], + list) { + list_del(&pqp->list); + kfree(pqp); + } + list_for_each_entry_safe(entry, tmp_entry, + &priv->steer[i].steer_entries[j], + list) { + list_del(&entry->list); + list_for_each_entry_safe(pqp, tmp_pqp, + &entry->duplicates, + list) { + list_del(&pqp->list); + kfree(pqp); + } + kfree(entry); + } + } + } + kfree(priv->steer); +} + +static int extended_func_num(struct pci_dev *pdev) +{ + return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); +} + +#define MLX4_OWNER_BASE 0x8069c +#define MLX4_OWNER_SIZE 4 + +static int mlx4_get_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + u32 ret; + + if (pci_channel_offline(dev->pdev)) + return -EIO; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return -ENOMEM; + } + + ret = readl(owner); + iounmap(owner); + return (int) !!ret; +} + +static void mlx4_free_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + + if (pci_channel_offline(dev->pdev)) + return; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return; + } + writel(0, owner); + msleep(1000); + iounmap(owner); +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) { struct mlx4_priv *priv; struct mlx4_dev *dev; int err; int port; - int i; - printk(KERN_INFO PFX "Initializing %s\n", - pci_name(pdev)); + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); err = pci_enable_device(pdev); if (err) { @@ -1351,13 +2246,24 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) "aborting.\n"); return err; } + if (num_vfs > MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, "There are more VF's (%d) than allowed(%d)\n", + num_vfs, MLX4_MAX_NUM_VF); + return -EINVAL; + } + if (num_vfs < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + return -EINVAL; + } /* - * Check for BARs. We expect 0: 1MB + * Check for BARs. */ - if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || - pci_resource_len(pdev, 0) != 1 << 20) { - dev_err(&pdev->dev, "Missing DCS, aborting.\n"); + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting." + "(driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%x)\n", + pci_dev_data, pci_resource_flags(pdev, 0)); err = -ENODEV; goto err_disable_pdev; } @@ -1367,18 +2273,12 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_disable_pdev; } - err = pci_request_region(pdev, 0, DRV_NAME); + err = pci_request_regions(pdev, DRV_NAME); if (err) { - dev_err(&pdev->dev, "Cannot request control region, aborting.\n"); + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); goto err_disable_pdev; } - err = pci_request_region(pdev, 2, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n"); - goto err_release_bar0; - } - pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1387,7 +2287,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); - goto err_release_bar2; + goto err_release_regions; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1398,16 +2298,19 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); - goto err_release_bar2; + goto err_release_regions; } } + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; - goto err_release_bar2; + goto err_release_regions; } dev = &priv->dev; @@ -1419,47 +2322,154 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); - for (i = 0; i < MLX4_MAX_PORTS; ++i) - priv->iboe_counter_index[i] = -1; INIT_LIST_HEAD(&priv->bf_list); mutex_init(&priv->bf_mutex); - /* - * Now reset the HCA before we touch the PCI capabilities or - * attempt a firmware command, since a boot ROM may have left - * the HCA in an undefined state. - */ - err = mlx4_reset(dev); - if (err) { - mlx4_err(dev, "Failed to reset HCA, aborting.\n"); - goto err_free_dev; + dev->rev_id = pdev->revision; + dev->numa_node = dev_to_node(&pdev->dev); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. */ + if (num_vfs && extended_func_num(pdev) > probe_vf) { + mlx4_warn(dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_free_dev; + } + mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); + dev->flags |= MLX4_FLAG_SLAVE; + } else { + /* We reset the device and enable SRIOV only for physical + * devices. Try to claim ownership on the device; + * if already taken, skip -- do not allow multiple PFs */ + err = mlx4_get_ownership(dev); + if (err) { + if (err < 0) + goto err_free_dev; + else { + mlx4_warn(dev, "Multiple PFs not yet supported." + " Skipping PF.\n"); + err = -EINVAL; + goto err_free_dev; + } + } + + if (num_vfs) { + mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", num_vfs); + err = pci_enable_sriov(pdev, num_vfs); + if (err) { + mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d).\n", + err); + err = 0; + } else { + mlx4_warn(dev, "Running in master mode\n"); + dev->flags |= MLX4_FLAG_SRIOV | + MLX4_FLAG_MASTER; + dev->num_vfs = num_vfs; + } + } + + atomic_set(&priv->opreq_count, 0); + INIT_WORK(&priv->opreq_task, mlx4_opreq_action); + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. + */ + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting.\n"); + goto err_sriov; + } } - if (mlx4_cmd_init(dev)) { +slave_start: + err = mlx4_cmd_init(dev); + if (err) { mlx4_err(dev, "Failed to init command interface, aborting.\n"); - goto err_free_dev; + goto err_sriov; + } + + /* In slave functions, the communication channel must be initialized + * before posting commands. Also, init num_slaves before calling + * mlx4_init_hca */ + if (mlx4_is_mfunc(dev)) { + if (mlx4_is_master(dev)) + dev->num_slaves = MLX4_MAX_NUM_SLAVES; + else { + dev->num_slaves = 0; + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init slave mfunc" + " interface, aborting.\n"); + goto err_cmd; + } + } } err = mlx4_init_hca(dev); - if (err) - goto err_cmd; + if (err) { + if (err == -EACCES) { + /* Not primary Physical function + * Running in slave mode */ + mlx4_cmd_cleanup(dev); + dev->flags |= MLX4_FLAG_SLAVE; + dev->flags &= ~MLX4_FLAG_MASTER; + goto slave_start; + } else + goto err_mfunc; + } + + /* In master functions, the communication channel must be initialized + * after obtaining its address from fw */ + if (mlx4_is_master(dev)) { + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init master mfunc" + "interface, aborting.\n"); + goto err_close; + } + } err = mlx4_alloc_eq_table(dev); if (err) - goto err_close; + goto err_master_mfunc; + + priv->msix_ctl.pool_bm = 0; + mutex_init(&priv->msix_ctl.pool_lock); mlx4_enable_msi_x(dev); + if ((mlx4_is_mfunc(dev)) && + !(dev->flags & MLX4_FLAG_MSI_X)) { + err = -ENOSYS; + mlx4_err(dev, "INTx is not supported in multi-function mode." + " aborting.\n"); + goto err_free_eq; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_steering(dev); + if (err) + goto err_free_eq; + } err = mlx4_setup_hca(dev); - if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) { + if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && + !mlx4_is_mfunc(dev)) { dev->flags &= ~MLX4_FLAG_MSI_X; + dev->caps.num_comp_vectors = 1; + dev->caps.comp_pool = 0; pci_disable_msix(pdev); err = mlx4_setup_hca(dev); } if (err) - goto err_free_eq; + goto err_steer; + + mlx4_init_quotas(dev); for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); @@ -1471,24 +2481,17 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_port; - err = mlx4_init_trigger(priv); - if (err) - goto err_register; - err = mlx4_sense_init(dev); if (err) - goto err_trigger; + goto err_port; mlx4_start_sense(dev); + priv->pci_dev_data = pci_dev_data; pci_set_drvdata(pdev, dev); return 0; -err_trigger: - device_remove_file(&dev->pdev->dev, &priv->trigger_attr); -err_register: - mlx4_unregister_device(dev); err_port: for (--port; port >= 1; --port) mlx4_cleanup_port_info(&priv->port[port]); @@ -1505,26 +2508,49 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) mlx4_cleanup_pd_table(dev); mlx4_cleanup_uar_table(dev); +err_steer: + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + err_free_eq: mlx4_free_eq_table(dev); +err_master_mfunc: + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + + if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + } + err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); mlx4_close_hca(dev); +err_mfunc: + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + err_cmd: mlx4_cmd_cleanup(dev); +err_sriov: + if (dev->flags & MLX4_FLAG_SRIOV) + pci_disable_sriov(pdev); + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + err_free_dev: kfree(priv); -err_release_bar2: - pci_release_region(pdev, 2); - -err_release_bar0: - pci_release_region(pdev, 0); +err_release_regions: + pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); @@ -1535,14 +2561,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) static int __devinit mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - static int mlx4_version_printed; + printk_once(KERN_INFO "%s", mlx4_version); - if (!mlx4_version_printed) { - printk(KERN_INFO "%s", mlx4_version); - ++mlx4_version_printed; - } - - return __mlx4_init_one(pdev, id); + return __mlx4_init_one(pdev, id->driver_data); } static void mlx4_remove_one(struct pci_dev *pdev) @@ -1552,16 +2573,25 @@ static void mlx4_remove_one(struct pci_dev *pdev) int p; if (dev) { + /* in SRIOV it is not allowed to unload the pf's + * driver while there are alive vf's */ + if (mlx4_is_master(dev)) { + if (mlx4_how_many_lives_vf(dev)) + mlx4_err(dev, "Removing PF when there are assigned VF's !!!\n"); + } mlx4_sense_cleanup(dev); mlx4_unregister_device(dev); - device_remove_file(&dev->pdev->dev, &priv->trigger_attr); for (p = 1; p <= dev->caps.num_ports; p++) { mlx4_cleanup_port_info(&priv->port[p]); mlx4_CLOSE_PORT(dev, p); } - mlx4_cleanup_counters_table(dev); + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_SLAVES_ONLY); + + mlx4_cleanup_counters_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); @@ -1572,19 +2602,40 @@ static void mlx4_remove_one(struct pci_dev *pdev) mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_STRUCTS_ONLY); + iounmap(priv->kar); mlx4_uar_free(dev, &priv->driver_uar); mlx4_cleanup_uar_table(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); mlx4_free_eq_table(dev); + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); mlx4_close_hca(dev); + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); mlx4_cmd_cleanup(dev); if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); + if (dev->flags & MLX4_FLAG_SRIOV) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + } + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); kfree(priv); - pci_release_region(pdev, 2); - pci_release_region(pdev, 0); + pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } @@ -1592,66 +2643,187 @@ static void mlx4_remove_one(struct pci_dev *pdev) int mlx4_restart_one(struct pci_dev *pdev) { + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + int pci_dev_data; + + pci_dev_data = priv->pci_dev_data; mlx4_remove_one(pdev); - return __mlx4_init_one(pdev, NULL); + return __mlx4_init_one(pdev, pci_dev_data); } -static struct pci_device_id mlx4_pci_table[] = { - { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */ - { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */ - { PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */ - { PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */ - { PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */ - { PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */ - { PCI_VDEVICE(MELLANOX, 0x1000) }, - { PCI_VDEVICE(MELLANOX, 0x1001) }, - { PCI_VDEVICE(MELLANOX, 0x1002) }, - { PCI_VDEVICE(MELLANOX, 0x1003) }, - { PCI_VDEVICE(MELLANOX, 0x1004) }, - { PCI_VDEVICE(MELLANOX, 0x1005) }, - { PCI_VDEVICE(MELLANOX, 0x1006) }, - { PCI_VDEVICE(MELLANOX, 0x1007) }, - { PCI_VDEVICE(MELLANOX, 0x1008) }, - { PCI_VDEVICE(MELLANOX, 0x1009) }, - { PCI_VDEVICE(MELLANOX, 0x100a) }, - { PCI_VDEVICE(MELLANOX, 0x100b) }, - { PCI_VDEVICE(MELLANOX, 0x100c) }, - { PCI_VDEVICE(MELLANOX, 0x100d) }, - { PCI_VDEVICE(MELLANOX, 0x100e) }, - { PCI_VDEVICE(MELLANOX, 0x100f) }, +static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { + /* MT25408 "Hermon" SDR */ + { PCI_VDEVICE(MELLANOX, 0x6340), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" DDR */ + { PCI_VDEVICE(MELLANOX, 0x634a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" QDR */ + { PCI_VDEVICE(MELLANOX, 0x6354), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" DDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6732), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" QDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x673c), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" EN 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6368), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6750), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6372), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x675a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + { PCI_VDEVICE(MELLANOX, 0x6764), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + { PCI_VDEVICE(MELLANOX, 0x6746), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT26478 ConnectX2 40GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x676e), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25400 Family [ConnectX-2 Virtual Function] */ + { PCI_VDEVICE(MELLANOX, 0x1002), MLX4_PCI_DEV_IS_VF }, + /* MT27500 Family [ConnectX-3] */ + { PCI_VDEVICE(MELLANOX, 0x1003), 0 }, + /* MT27500 Family [ConnectX-3 Virtual Function] */ + { PCI_VDEVICE(MELLANOX, 0x1004), MLX4_PCI_DEV_IS_VF }, + { PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */ + { PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */ + { PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */ + { PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */ + { PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */ + { PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */ + { PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */ + { PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */ + { PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */ + { PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */ + { PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */ + { PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */ { 0, } }; MODULE_DEVICE_TABLE(pci, mlx4_pci_table); +static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + mlx4_remove_one(pdev); + + return state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) +{ + int ret = __mlx4_init_one(pdev, 0); + + return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +static const struct pci_error_handlers mlx4_err_handler = { + .error_detected = mlx4_pci_err_detected, + .slot_reset = mlx4_pci_slot_reset, +}; + +static int suspend(struct pci_dev *pdev, pm_message_t state) +{ + mlx4_remove_one(pdev); + + if (mlx4_log_num_mgm_entry_size != -1 && + (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || + mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) { + pr_warning("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not " + "in legal range (-1 or %d..%d)\n", + mlx4_log_num_mgm_entry_size, + MLX4_MIN_MGM_LOG_ENTRY_SIZE, + MLX4_MAX_MGM_LOG_ENTRY_SIZE); + return -1; + } + return 0; +} + +static int resume(struct pci_dev *pdev) +{ + return __mlx4_init_one(pdev, 0); +} + static struct pci_driver mlx4_driver = { .name = DRV_NAME, - .id_table = mlx4_pci_table, + .id_table = (struct pci_device_id*)mlx4_pci_table, .probe = mlx4_init_one, - .remove = __devexit_p(mlx4_remove_one) + .remove = __devexit_p(mlx4_remove_one), + .suspend = suspend, + .resume = resume, + .err_handler = (struct pci_error_handlers*)&mlx4_err_handler, }; static int __init mlx4_verify_params(void) { if ((log_num_mac < 0) || (log_num_mac > 7)) { - printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac); + pr_warning("mlx4_core: bad num_mac: %d\n", log_num_mac); return -1; } - if (log_mtts_per_seg == 0) - log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); - if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { - printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); + if (log_num_vlan != 0) + pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n", + MLX4_LOG_NUM_VLANS); + + if (mlx4_set_4k_mtu != -1) + pr_warning("mlx4_core: set_4k_mtu - obsolete module param\n"); + + if ((log_mtts_per_seg < 0) || (log_mtts_per_seg > 7)) { + pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; } + /* Check if module param for ports type has legal combination */ + if (port_type_array[0] == false && port_type_array[1] == true) { + pr_warning("mlx4_core: module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n"); + port_type_array[0] = true; + } + + if (mlx4_log_num_mgm_entry_size != -1 && + (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || + mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) { + pr_warning("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not " + "in legal range (-1 or %d..%d)\n", + mlx4_log_num_mgm_entry_size, + MLX4_MIN_MGM_LOG_ENTRY_SIZE, + MLX4_MAX_MGM_LOG_ENTRY_SIZE); + return -1; + } + + if (mod_param_profile.num_qp < 18 || mod_param_profile.num_qp > 23) { + pr_warning("mlx4_core: bad log_num_qp: %d\n", + mod_param_profile.num_qp); + return -1; + } + + if (mod_param_profile.num_srq < 10) { + pr_warning("mlx4_core: too low log_num_srq: %d\n", + mod_param_profile.num_srq); + return -1; + } + + if (mod_param_profile.num_cq < 10) { + pr_warning("mlx4_core: too low log_num_cq: %d\n", + mod_param_profile.num_cq); + return -1; + } + + if (mod_param_profile.num_mpt < 10) { + pr_warning("mlx4_core: too low log_num_mpt: %d\n", + mod_param_profile.num_mpt); + return -1; + } + + if (mod_param_profile.num_mtt && mod_param_profile.num_mtt < 15) { + pr_warning("mlx4_core: too low log_num_mtt: %d\n", + mod_param_profile.num_mtt); + return -1; + } + + if (mod_param_profile.num_mtt > MLX4_MAX_LOG_NUM_MTT) { + pr_warning("mlx4_core: too high log_num_mtt: %d\n", + mod_param_profile.num_mtt); + return -1; + } return 0; } @@ -1659,8 +2831,6 @@ static int __init mlx4_init(void) { int ret; - mutex_init(&drv_mutex); - if (mlx4_verify_params()) return -EINVAL; @@ -1670,20 +2840,26 @@ static int __init mlx4_init(void) if (!mlx4_wq) return -ENOMEM; + if (enable_sys_tune) + sys_tune_init(); + ret = pci_register_driver(&mlx4_driver); + if (ret < 0 && enable_sys_tune) + sys_tune_fini(); + return ret < 0 ? ret : 0; } static void __exit mlx4_cleanup(void) { - mutex_lock(&drv_mutex); - mlx4_config_cleanup(); + if (enable_sys_tune) + sys_tune_fini(); + pci_unregister_driver(&mlx4_driver); - mutex_unlock(&drv_mutex); destroy_workqueue(mlx4_wq); } -module_init_order(mlx4_init, SI_ORDER_MIDDLE); +module_init(mlx4_init); module_exit(mlx4_cleanup); #undef MODULE_VERSION diff --git a/sys/ofed/drivers/net/mlx4/mcg.c b/sys/ofed/drivers/net/mlx4/mcg.c index 70493e3ae398..625e5e42e9dd 100644 --- a/sys/ofed/drivers/net/mlx4/mcg.c +++ b/sys/ofed/drivers/net/mlx4/mcg.c @@ -31,50 +31,88 @@ * SOFTWARE. */ -#include #include -#include #include -#include #include "mlx4.h" -#define MGM_QPN_MASK 0x00FFFFFF -#define MGM_BLCK_LB_BIT 30 - -struct mlx4_mgm { - __be32 next_gid_index; - __be32 members_count; - u32 reserved[2]; - u8 gid[16]; - __be32 qp[MLX4_QP_PER_MGM]; -}; static const u8 zero_gid[16]; /* automatically initialized to 0 */ -static int mlx4_READ_MCG(struct mlx4_dev *dev, int index, - struct mlx4_cmd_mailbox *mailbox) +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev) +{ + return 1 << dev->oper_log_mgm_entry_size; +} + +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev) +{ + return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2); +} + +static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox, + u32 size, + u64 *reg_id) +{ + u64 imm; + int err = 0; + + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + return err; + *reg_id = imm; + + return err; +} + +static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid) +{ + int err = 0; + + err = mlx4_cmd(dev, regid, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + return err; +} + +static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) { return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } -static int mlx4_WRITE_MCG(struct mlx4_dev *dev, int index, - struct mlx4_cmd_mailbox *mailbox) +static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) { return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } -static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, - u16 *hash) +static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 port, u8 steer, + struct mlx4_cmd_mailbox *mailbox) +{ + u32 in_mod; + + in_mod = (u32) port << 16 | steer << 1; + return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1, + MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} + +static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + u16 *hash, u8 op_mod) { u64 imm; int err; - err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, 0, MLX4_CMD_MGID_HASH, - MLX4_CMD_TIME_CLASS_A); + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod, + MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); if (!err) *hash = imm; @@ -82,6 +120,476 @@ static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox return err; } +static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + u32 qpn) +{ + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[port - 1]; + struct mlx4_promisc_qp *pqp; + + list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) { + if (pqp->qpn == qpn) + return pqp; + } + /* not found */ + return NULL; +} + +/* + * Add new entry to steering data structure. + * All promisc QPs should be added as well + */ +static int new_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + struct mlx4_steer_index *new_entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp = NULL; + u32 prot; + int err; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + INIT_LIST_HEAD(&new_entry->duplicates); + new_entry->index = index; + list_add_tail(&new_entry->list, &s_steer->steer_entries[steer]); + + /* If the given qpn is also a promisc qp, + * it should be inserted to duplicates list + */ + pqp = get_promisc_qp(dev, port, steer, qpn); + if (pqp) { + dqp = kmalloc(sizeof *dqp, GFP_KERNEL); + if (!dqp) { + err = -ENOMEM; + goto out_alloc; + } + dqp->qpn = qpn; + list_add_tail(&dqp->list, &new_entry->duplicates); + } + + /* if no promisc qps for this vep, we are done */ + if (list_empty(&s_steer->promisc_qps[steer])) + return 0; + + /* now need to add all the promisc qps to the new + * steering entry, as they should also receive the packets + * destined to this address */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + goto out_alloc; + } + mgm = mailbox->buf; + + err = mlx4_READ_ENTRY(dev, index, mailbox); + if (err) + goto out_mailbox; + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + prot = be32_to_cpu(mgm->members_count) >> 30; + list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) { + /* don't add already existing qpn */ + if (pqp->qpn == qpn) + continue; + if (members_count == dev->caps.num_qp_per_mgm) { + /* out of space */ + err = -ENOMEM; + goto out_mailbox; + } + + /* add the qpn */ + mgm->qp[members_count++] = cpu_to_be32(pqp->qpn & MGM_QPN_MASK); + } + /* update the qps count and update the entry with all the promisc qps*/ + mgm->members_count = cpu_to_be32(members_count | (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + if (!err) + return 0; +out_alloc: + if (dqp) { + list_del(&dqp->list); + kfree(dqp); + } + list_del(&new_entry->list); + kfree(new_entry); + return err; +} + +/* update the data structures with existing steering entry */ +static int existing_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *tmp_entry, *entry = NULL; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + pqp = get_promisc_qp(dev, port, steer, qpn); + if (!pqp) + return 0; /* nothing to do */ + + list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) { + if (tmp_entry->index == index) { + entry = tmp_entry; + break; + } + } + if (unlikely(!entry)) { + mlx4_warn(dev, "Steering entry at index %x is not registered\n", index); + return -EINVAL; + } + + /* the given qpn is listed as a promisc qpn + * we need to add it as a duplicate to this entry + * for future references */ + list_for_each_entry(dqp, &entry->duplicates, list) { + if (qpn == pqp->qpn) + return 0; /* qp is already duplicated */ + } + + /* add the qp as a duplicate on this index */ + dqp = kmalloc(sizeof *dqp, GFP_KERNEL); + if (!dqp) + return -ENOMEM; + dqp->qpn = qpn; + list_add_tail(&dqp->list, &entry->duplicates); + + return 0; +} + +/* Check whether a qpn is a duplicate on steering entry + * If so, it should not be removed from mgm */ +static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *tmp_entry, *entry = NULL; + struct mlx4_promisc_qp *dqp, *tmp_dqp; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + /* if qp is not promisc, it cannot be duplicated */ + if (!get_promisc_qp(dev, port, steer, qpn)) + return false; + + /* The qp is promisc qp so it is a duplicate on this index + * Find the index entry, and remove the duplicate */ + list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) { + if (tmp_entry->index == index) { + entry = tmp_entry; + break; + } + } + if (unlikely(!entry)) { + mlx4_warn(dev, "Steering entry for index %x is not registered\n", index); + return false; + } + list_for_each_entry_safe(dqp, tmp_dqp, &entry->duplicates, list) { + if (dqp->qpn == qpn) { + list_del(&dqp->list); + kfree(dqp); + } + } + return true; +} + +/* I a steering entry contains only promisc QPs, it can be removed. */ +static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 tqpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry = NULL, *tmp_entry; + u32 qpn; + u32 members_count; + bool ret = false; + int i; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return false; + mgm = mailbox->buf; + + if (mlx4_READ_ENTRY(dev, index, mailbox)) + goto out; + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + for (i = 0; i < members_count; i++) { + qpn = be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK; + if (!get_promisc_qp(dev, port, steer, qpn) && qpn != tqpn) { + /* the qp is not promisc, the entry can't be removed */ + goto out; + } + } + /* All the qps currently registered for this entry are promiscuous, + * Checking for duplicates */ + ret = true; + list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) { + if (entry->index == index) { + if (list_empty(&entry->duplicates) || members_count == 1) { + struct mlx4_promisc_qp *pqp, *tmp_pqp; + /* + * If there is only 1 entry in duplicates than + * this is the QP we want to delete, going over + * the list and deleting the entry. + */ + list_del(&entry->list); + list_for_each_entry_safe(pqp, tmp_pqp, + &entry->duplicates, + list) { + list_del(&pqp->list); + kfree(pqp); + } + kfree(entry); + } else { + /* This entry contains duplicates so it shouldn't be removed */ + ret = false; + goto out; + } + } + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +static int add_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + u32 members_count; + u32 prot; + int i; + bool found; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + mutex_lock(&priv->mcg_table.mutex); + + if (get_promisc_qp(dev, port, steer, qpn)) { + err = 0; /* Noting to do, already exists */ + goto out_mutex; + } + + pqp = kmalloc(sizeof *pqp, GFP_KERNEL); + if (!pqp) { + err = -ENOMEM; + goto out_mutex; + } + pqp->qpn = qpn; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + goto out_alloc; + } + mgm = mailbox->buf; + + /* the promisc qp needs to be added for each one of the steering + * entries, if it already exists, needs to be added as a duplicate + * for this entry */ + list_for_each_entry(entry, &s_steer->steer_entries[steer], list) { + err = mlx4_READ_ENTRY(dev, entry->index, mailbox); + if (err) + goto out_mailbox; + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + prot = be32_to_cpu(mgm->members_count) >> 30; + found = false; + for (i = 0; i < members_count; i++) { + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) { + /* Entry already exists, add to duplicates */ + dqp = kmalloc(sizeof *dqp, GFP_KERNEL); + if (!dqp) { + err = -ENOMEM; + goto out_mailbox; + } + dqp->qpn = qpn; + list_add_tail(&dqp->list, &entry->duplicates); + found = true; + } + } + if (!found) { + /* Need to add the qpn to mgm */ + if (members_count == dev->caps.num_qp_per_mgm) { + /* entry is full */ + err = -ENOMEM; + goto out_mailbox; + } + mgm->qp[members_count++] = cpu_to_be32(qpn & MGM_QPN_MASK); + mgm->members_count = cpu_to_be32(members_count | (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, entry->index, mailbox); + if (err) + goto out_mailbox; + } + } + + /* add the new qpn to list of promisc qps */ + list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); + /* now need to add all the promisc qps to default entry */ + memset(mgm, 0, sizeof *mgm); + members_count = 0; + list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) { + if (members_count == dev->caps.num_qp_per_mgm) { + /* entry is full */ + err = -ENOMEM; + goto out_list; + } + mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); + } + mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); + + err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox); + if (err) + goto out_list; + + mlx4_free_cmd_mailbox(dev, mailbox); + mutex_unlock(&priv->mcg_table.mutex); + return 0; + +out_list: + list_del(&pqp->list); +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); +out_alloc: + kfree(pqp); +out_mutex: + mutex_unlock(&priv->mcg_table.mutex); + return err; +} + +static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, u32 qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + u32 members_count; + bool found; + bool back_to_list = false; + int i, loc = -1; + int err; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + mutex_lock(&priv->mcg_table.mutex); + + pqp = get_promisc_qp(dev, port, steer, qpn); + if (unlikely(!pqp)) { + mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn); + /* nothing to do */ + err = 0; + goto out_mutex; + } + + /*remove from list of promisc qps */ + list_del(&pqp->list); + + /* set the default entry not to include the removed one */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + back_to_list = true; + goto out_list; + } + mgm = mailbox->buf; + memset(mgm, 0, sizeof *mgm); + members_count = 0; + list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) + mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); + mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); + + err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox); + if (err) + goto out_mailbox; + + /* remove the qp from all the steering entries*/ + list_for_each_entry(entry, &s_steer->steer_entries[steer], list) { + found = false; + list_for_each_entry(dqp, &entry->duplicates, list) { + if (dqp->qpn == qpn) { + found = true; + break; + } + } + if (found) { + /* a duplicate, no need to change the mgm, + * only update the duplicates list */ + list_del(&dqp->list); + kfree(dqp); + } else { + err = mlx4_READ_ENTRY(dev, entry->index, mailbox); + if (err) + goto out_mailbox; + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) { + loc = i; + break; + } + + if (loc < 0) { + mlx4_err(dev, "QP %06x wasn't found in entry %d\n", + qpn, entry->index); + err = -EINVAL; + goto out_mailbox; + } + + /* copy the last QP in this MGM over removed QP */ + mgm->qp[loc] = mgm->qp[members_count - 1]; + mgm->qp[members_count - 1] = 0; + mgm->members_count = cpu_to_be32(--members_count | + (MLX4_PROT_ETH << 30)); + + err = mlx4_WRITE_ENTRY(dev, entry->index, mailbox); + if (err) + goto out_mailbox; + } + + } + +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); +out_list: + if (back_to_list) + list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); + else + kfree(pqp); +out_mutex: + mutex_unlock(&priv->mcg_table.mutex); + return err; +} + /* * Caller must hold MCG table semaphore. gid and mgm parameters must * be properly aligned for command interface. @@ -97,15 +605,18 @@ static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox * If no AMGM exists for given gid, *index = -1, *prev = index of last * entry in hash chain and *mgm holds end of hash chain. */ -static int find_mgm(struct mlx4_dev *dev, - u8 *gid, enum mlx4_mcast_prot prot, - struct mlx4_cmd_mailbox *mgm_mailbox, - u16 *hash, int *prev, int *index) +static int find_entry(struct mlx4_dev *dev, u8 port, + u8 *gid, enum mlx4_protocol prot, + struct mlx4_cmd_mailbox *mgm_mailbox, + int *prev, int *index) { struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm = mgm_mailbox->buf; u8 *mgid; int err; + u16 hash; + u8 op_mod = (prot == MLX4_PROT_ETH) ? + !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -114,24 +625,24 @@ static int find_mgm(struct mlx4_dev *dev, memcpy(mgid, gid, 16); - err = mlx4_MGID_HASH(dev, mailbox, hash); + err = mlx4_GID_HASH(dev, mailbox, &hash, op_mod); mlx4_free_cmd_mailbox(dev, mailbox); if (err) return err; if (0) - mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash); + mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, hash); - *index = *hash; + *index = hash; *prev = -1; do { - err = mlx4_READ_MCG(dev, *index, mgm_mailbox); + err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox); if (err) return err; - if (!memcmp(mgm->gid, zero_gid, 16)) { - if (*index != *hash) { + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { + if (*index != hash) { mlx4_err(dev, "Found zero MGID in AMGM.\n"); err = -EINVAL; } @@ -139,7 +650,7 @@ static int find_mgm(struct mlx4_dev *dev, } if (!memcmp(mgm->gid, gid, 16) && - (prot == be32_to_cpu(mgm->members_count) >> 30)) + be32_to_cpu(mgm->members_count) >> 30 == prot) return err; *prev = *index; @@ -150,18 +661,266 @@ static int find_mgm(struct mlx4_dev *dev, return err; } -int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_mcast_loopback, enum mlx4_mcast_prot prot) +static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl, + struct mlx4_net_trans_rule_hw_ctrl *hw) +{ + static const u8 __promisc_mode[] = { + [MLX4_FS_REGULAR] = 0x0, + [MLX4_FS_ALL_DEFAULT] = 0x1, + [MLX4_FS_MC_DEFAULT] = 0x3, + [MLX4_FS_UC_SNIFFER] = 0x4, + [MLX4_FS_MC_SNIFFER] = 0x5, + }; + + u32 dw = 0; + + dw = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0; + dw |= ctrl->exclusive ? (1 << 2) : 0; + dw |= ctrl->allow_loopback ? (1 << 3) : 0; + dw |= __promisc_mode[ctrl->promisc_mode] << 8; + dw |= ctrl->priority << 16; + + hw->ctrl = cpu_to_be32(dw); + hw->port = ctrl->port; + hw->qpn = cpu_to_be32(ctrl->qpn); +} + +const u16 __sw_id_hw[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = 0xE001, + [MLX4_NET_TRANS_RULE_ID_IB] = 0xE005, + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0xE003, + [MLX4_NET_TRANS_RULE_ID_IPV4] = 0xE002, + [MLX4_NET_TRANS_RULE_ID_TCP] = 0xE004, + [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006 +}; + +static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec, + struct _rule_hw *rule_hw) +{ + static const size_t __rule_hw_sz[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = + sizeof(struct mlx4_net_trans_rule_hw_eth), + [MLX4_NET_TRANS_RULE_ID_IB] = + sizeof(struct mlx4_net_trans_rule_hw_ib), + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0, + [MLX4_NET_TRANS_RULE_ID_IPV4] = + sizeof(struct mlx4_net_trans_rule_hw_ipv4), + [MLX4_NET_TRANS_RULE_ID_TCP] = + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), + [MLX4_NET_TRANS_RULE_ID_UDP] = + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp) + }; + if (spec->id >= MLX4_NET_TRANS_RULE_NUM) { + mlx4_err(dev, "Invalid network rule id. id = %d\n", spec->id); + return -EINVAL; + } + memset(rule_hw, 0, __rule_hw_sz[spec->id]); + rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]); + rule_hw->size = __rule_hw_sz[spec->id] >> 2; + + switch (spec->id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN); + memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk, + ETH_ALEN); + memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN); + memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk, + ETH_ALEN); + if (spec->eth.ether_type_enable) { + rule_hw->eth.ether_type_enable = 1; + rule_hw->eth.ether_type = spec->eth.ether_type; + } + rule_hw->eth.vlan_id = spec->eth.vlan_id; + rule_hw->eth.vlan_id_msk = spec->eth.vlan_id_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_IB: + rule_hw->ib.r_u_qpn = spec->ib.r_u_qpn; + rule_hw->ib.qpn_mask = spec->ib.qpn_msk; + memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16); + memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV6: + return -EOPNOTSUPP; + + case MLX4_NET_TRANS_RULE_ID_IPV4: + rule_hw->ipv4.src_ip = spec->ipv4.src_ip; + rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk; + rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip; + rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port; + rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk; + rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port; + rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk; + break; + + default: + return -EINVAL; + } + + return __rule_hw_sz[spec->id]; +} + +static void mlx4_err_rule(struct mlx4_dev *dev, char *str, + struct mlx4_net_trans_rule *rule) +{ +#define BUF_SIZE 256 + struct mlx4_spec_list *cur; + char buf[BUF_SIZE]; + int len = 0; + + mlx4_err(dev, "%s", str); + len += snprintf(buf + len, BUF_SIZE - len, + "port = %d prio = 0x%x qp = 0x%x ", + rule->port, rule->priority, rule->qpn); + + list_for_each_entry(cur, &rule->list, list) { + switch (cur->id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + len += snprintf(buf + len, BUF_SIZE - len, + "dmac = %pM ", &cur->eth.dst_mac); + if (cur->eth.ether_type) + len += snprintf(buf + len, BUF_SIZE - len, + "ethertype = 0x%x ", + be16_to_cpu(cur->eth.ether_type)); + if (cur->eth.vlan_id) + len += snprintf(buf + len, BUF_SIZE - len, + "vlan-id = %d ", + be16_to_cpu(cur->eth.vlan_id)); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV4: + if (cur->ipv4.src_ip) + len += snprintf(buf + len, BUF_SIZE - len, + "src-ip = %pI4 ", + &cur->ipv4.src_ip); + if (cur->ipv4.dst_ip) + len += snprintf(buf + len, BUF_SIZE - len, + "dst-ip = %pI4 ", + &cur->ipv4.dst_ip); + break; + + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + if (cur->tcp_udp.src_port) + len += snprintf(buf + len, BUF_SIZE - len, + "src-port = %d ", + be16_to_cpu(cur->tcp_udp.src_port)); + if (cur->tcp_udp.dst_port) + len += snprintf(buf + len, BUF_SIZE - len, + "dst-port = %d ", + be16_to_cpu(cur->tcp_udp.dst_port)); + break; + + case MLX4_NET_TRANS_RULE_ID_IB: + len += snprintf(buf + len, BUF_SIZE - len, + "dst-gid = %pI6\n", cur->ib.dst_gid); + len += snprintf(buf + len, BUF_SIZE - len, + "dst-gid-mask = %pI6\n", + cur->ib.dst_gid_msk); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV6: + break; + + default: + break; + } + } + len += snprintf(buf + len, BUF_SIZE - len, "\n"); + mlx4_err(dev, "%s", buf); + + if (len >= BUF_SIZE) + mlx4_err(dev, "Network rule error message was truncated, print buffer is too small.\n"); +} + +int mlx4_flow_attach(struct mlx4_dev *dev, + struct mlx4_net_trans_rule *rule, u64 *reg_id) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_spec_list *cur; + u32 size = 0; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memset(mailbox->buf, 0, sizeof(struct mlx4_net_trans_rule_hw_ctrl)); + trans_rule_ctrl_to_hw(rule, mailbox->buf); + + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + + list_for_each_entry(cur, &rule->list, list) { + ret = parse_trans_rule(dev, cur, mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(dev, mailbox); + return -EINVAL; + } + size += ret; + } + + ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id); + if (ret == -ENOMEM) + mlx4_err_rule(dev, + "mcg table is full. Fail to register network rule.\n", + rule); + else if (ret) + mlx4_err_rule(dev, "Fail to register network rule.\n", rule); + + mlx4_free_cmd_mailbox(dev, mailbox); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_flow_attach); + +int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + + err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id); + if (err) + mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_flow_detach); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn) +{ + int err; + u64 in_param; + + in_param = ((u64) min_range_qpn) << 32; + in_param |= ((u64) max_range_qpn) & 0xFFFFFFFF; + + err = mlx4_cmd(dev, in_param, 0, 0, + MLX4_FLOW_STEERING_IB_UC_QP_RANGE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_FLOW_STEERING_IB_UC_QP_RANGE); + +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; u32 members_count; - u16 hash; int index, prev; int link = 0; int i; int err; + u8 port = gid[5]; + u8 new_entry = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -169,14 +928,16 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mgm = mailbox->buf; mutex_lock(&priv->mcg_table.mutex); - - err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index); + err = find_entry(dev, port, gid, prot, + mailbox, &prev, &index); if (err) goto out; if (index != -1) { - if (!memcmp(mgm->gid, zero_gid, 16)) + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { + new_entry = 1; memcpy(mgm->gid, gid, 16); + } } else { link = 1; @@ -188,12 +949,13 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } index += dev->caps.num_mgms; + new_entry = 1; memset(mgm, 0, sizeof *mgm); memcpy(mgm->gid, gid, 16); } members_count = be32_to_cpu(mgm->members_count) & 0xffffff; - if (members_count == MLX4_QP_PER_MGM) { + if (members_count == dev->caps.num_qp_per_mgm) { mlx4_err(dev, "MGM at index %x is full.\n", index); err = -ENOMEM; goto out; @@ -209,25 +971,34 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | (!!mlx4_blck_lb << MGM_BLCK_LB_BIT)); - mgm->members_count = cpu_to_be32(members_count | ((u32) prot << 30)); + mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30); - err = mlx4_WRITE_MCG(dev, index, mailbox); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); if (err) goto out; if (!link) goto out; - err = mlx4_READ_MCG(dev, prev, mailbox); + err = mlx4_READ_ENTRY(dev, prev, mailbox); if (err) goto out; mgm->next_gid_index = cpu_to_be32(index << 6); - err = mlx4_WRITE_MCG(dev, prev, mailbox); + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); if (err) goto out; + if (prot == MLX4_PROT_ETH) { + /* manage the steering entry for promisc mode */ + if (new_entry) + new_steering_entry(dev, port, steer, index, qp->qpn); + else + existing_steering_entry(dev, port, steer, + index, qp->qpn); + } + out: if (err && link && index != -1) { if (index < dev->caps.num_mgms) @@ -242,19 +1013,19 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mlx4_free_cmd_mailbox(dev, mailbox); return err; } -EXPORT_SYMBOL_GPL(mlx4_multicast_attach); -int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - enum mlx4_mcast_prot prot) +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; u32 members_count; - u16 hash; int prev, index; - int i, loc; + int i, loc = -1; int err; + u8 port = gid[5]; + bool removed_entry = false; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -263,7 +1034,8 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mutex_lock(&priv->mcg_table.mutex); - err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index); + err = find_entry(dev, port, gid, prot, + mailbox, &prev, &index); if (err) goto out; @@ -273,10 +1045,17 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], goto out; } + /* if this pq is also a promisc qp, it shouldn't be removed */ + if (prot == MLX4_PROT_ETH && + check_duplicate_entry(dev, port, steer, index, qp->qpn)) + goto out; + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; - for (loc = -1, i = 0; i < members_count; ++i) - if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { loc = i; + break; + } if (loc == -1) { mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn); @@ -284,27 +1063,33 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], goto out; } + /* copy the last QP in this MGM over removed QP */ + mgm->qp[loc] = mgm->qp[members_count - 1]; + mgm->qp[members_count - 1] = 0; + mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30); - mgm->members_count = cpu_to_be32(--members_count | ((u32) prot << 30)); - mgm->qp[loc] = mgm->qp[i - 1]; - mgm->qp[i - 1] = 0; - - if (i != 1) { - err = mlx4_WRITE_MCG(dev, index, mailbox); + if (prot == MLX4_PROT_ETH) + removed_entry = can_remove_steering_entry(dev, port, steer, + index, qp->qpn); + if (members_count && (prot != MLX4_PROT_ETH || !removed_entry)) { + err = mlx4_WRITE_ENTRY(dev, index, mailbox); goto out; } + /* We are going to delete the entry, members count should be 0 */ + mgm->members_count = cpu_to_be32((u32) prot << 30); + if (prev == -1) { /* Remove entry from MGM */ int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6; if (amgm_index) { - err = mlx4_READ_MCG(dev, amgm_index, mailbox); + err = mlx4_READ_ENTRY(dev, amgm_index, mailbox); if (err) goto out; } else memset(mgm->gid, 0, 16); - err = mlx4_WRITE_MCG(dev, index, mailbox); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); if (err) goto out; @@ -319,13 +1104,13 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } else { /* Remove entry from AMGM */ int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; - err = mlx4_READ_MCG(dev, prev, mailbox); + err = mlx4_READ_ENTRY(dev, prev, mailbox); if (err) goto out; mgm->next_gid_index = cpu_to_be32(cur_next_index << 6); - err = mlx4_WRITE_MCG(dev, prev, mailbox); + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); if (err) goto out; @@ -343,13 +1128,286 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mlx4_free_cmd_mailbox(dev, mailbox); return err; } + +static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 attach, u8 block_loopback, + enum mlx4_protocol prot) +{ + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + int qpn; + + if (!mlx4_is_mfunc(dev)) + return -EBADF; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, gid, 16); + qpn = qp->qpn; + qpn |= (prot << 28); + if (attach && block_loopback) + qpn |= (1 << 31); + + err = mlx4_cmd(dev, mailbox->dma, qpn, attach, + MLX4_CMD_QP_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + u8 port, int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id) +{ + + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_A0: + if (prot == MLX4_PROT_ETH) + return 0; + + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_MC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 1, + block_mcast_loopback, prot); + return mlx4_qp_attach_common(dev, qp, gid, + block_mcast_loopback, prot, + MLX4_MC_STEER); + + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + struct mlx4_spec_list spec = { {NULL} }; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .promisc_mode = MLX4_FS_REGULAR, + .priority = MLX4_DOMAIN_NIC, + }; + + rule.allow_loopback = !block_mcast_loopback; + rule.port = port; + rule.qpn = qp->qpn; + INIT_LIST_HEAD(&rule.list); + + switch (prot) { + case MLX4_PROT_ETH: + spec.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN); + memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + break; + + case MLX4_PROT_IB_IPV6: + spec.id = MLX4_NET_TRANS_RULE_ID_IB; + memcpy(spec.ib.dst_gid, gid, 16); + memset(&spec.ib.dst_gid_msk, 0xff, 16); + break; + default: + return -EINVAL; + } + list_add_tail(&spec.list, &rule.list); + + return mlx4_flow_attach(dev, &rule, reg_id); + } + + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(mlx4_multicast_attach); + +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, u64 reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_A0: + if (prot == MLX4_PROT_ETH) + return 0; + + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_MC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); + + return mlx4_qp_detach_common(dev, qp, gid, prot, + MLX4_MC_STEER); + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_flow_detach(dev, reg_id); + + default: + return -EINVAL; + } +} EXPORT_SYMBOL_GPL(mlx4_multicast_detach); +int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, + u32 qpn, enum mlx4_net_trans_promisc_mode mode) +{ + struct mlx4_net_trans_rule rule; + u64 *regid_p; + + switch (mode) { + case MLX4_FS_ALL_DEFAULT: + regid_p = &dev->regid_promisc_array[port]; + break; + case MLX4_FS_MC_DEFAULT: + regid_p = &dev->regid_allmulti_array[port]; + break; + default: + return -1; + } + + if (*regid_p != 0) + return -1; + + rule.promisc_mode = mode; + rule.port = port; + rule.qpn = qpn; + INIT_LIST_HEAD(&rule.list); + mlx4_err(dev, "going promisc on %x\n", port); + + return mlx4_flow_attach(dev, &rule, regid_p); +} +EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add); + +int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, + enum mlx4_net_trans_promisc_mode mode) +{ + int ret; + u64 *regid_p; + + switch (mode) { + case MLX4_FS_ALL_DEFAULT: + regid_p = &dev->regid_promisc_array[port]; + break; + case MLX4_FS_MC_DEFAULT: + regid_p = &dev->regid_allmulti_array[port]; + break; + default: + return -1; + } + + if (*regid_p == 0) + return -1; + + ret = mlx4_flow_detach(dev, *regid_p); + if (ret == 0) + *regid_p = 0; + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove); + +int mlx4_unicast_attach(struct mlx4_dev *dev, + struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot) +{ + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_UC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 1, + block_mcast_loopback, prot); + + return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback, + prot, MLX4_UC_STEER); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_attach); + +int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], enum mlx4_protocol prot) +{ + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_UC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); + + return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_UC_STEER); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_detach); + +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u32 qpn = (u32) vhcr->in_param & 0xffffffff; + u8 port = vhcr->in_param >> 62; + enum mlx4_steer_type steer = vhcr->in_modifier; + + /* Promiscuous unicast is not allowed in mfunc */ + if (mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER) + return 0; + + if (vhcr->op_modifier) + return add_promisc_qp(dev, port, steer, qpn); + else + return remove_promisc_qp(dev, port, steer, qpn); +} + +static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn, + enum mlx4_steer_type steer, u8 add, u8 port) +{ + return mlx4_cmd(dev, (u64) qpn | (u64) port << 62, (u32) steer, add, + MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port); + + return add_promisc_qp(dev, port, MLX4_MC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add); + +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port); + + return remove_promisc_qp(dev, port, MLX4_MC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove); + +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port); + + return add_promisc_qp(dev, port, MLX4_UC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add); + +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port); + + return remove_promisc_qp(dev, port, MLX4_UC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove); + int mlx4_init_mcg_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; + /* No need for mcg_table when fw managed the mcg table*/ + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + return 0; err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms, dev->caps.num_amgms - 1, 0, 0); if (err) @@ -362,5 +1420,7 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev) void mlx4_cleanup_mcg_table(struct mlx4_dev *dev) { - mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap); + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap); } diff --git a/sys/ofed/drivers/net/mlx4/mlx4.h b/sys/ofed/drivers/net/mlx4/mlx4.h index d5d3da9a4ff3..b342d9aee395 100644 --- a/sys/ofed/drivers/net/mlx4/mlx4.h +++ b/sys/ofed/drivers/net/mlx4/mlx4.h @@ -39,28 +39,56 @@ #include #include +#include #include +#include #include #include #include #include +#include #define DRV_NAME "mlx4_core" #define PFX DRV_NAME ": " -#define DRV_VERSION "1.0-ofed1.5.2" -#define DRV_RELDATE "August 4, 2010" +#define DRV_VERSION "1.1" +#define DRV_RELDATE "Dec, 2011" + +#define MLX4_FS_UDP_UC_EN (1 << 1) +#define MLX4_FS_TCP_UC_EN (1 << 2) +#define MLX4_FS_NUM_OF_L2_ADDR 8 +#define MLX4_FS_MGM_LOG_ENTRY_SIZE 7 +#define MLX4_FS_NUM_MCG (1 << 17) + +struct mlx4_set_port_prio2tc_context { + u8 prio2tc[4]; +}; + +struct mlx4_port_scheduler_tc_cfg_be { + __be16 pg; + __be16 bw_precentage; + __be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */ + __be16 max_bw_value; +}; + +struct mlx4_set_port_scheduler_context { + struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC]; +}; enum { MLX4_HCR_BASE = 0x80680, MLX4_HCR_SIZE = 0x0001c, - MLX4_CLR_INT_SIZE = 0x00008 + MLX4_CLR_INT_SIZE = 0x00008, + MLX4_SLAVE_COMM_BASE = 0x0, + MLX4_COMM_PAGESIZE = 0x1000, + MLX4_CLOCK_SIZE = 0x00008 }; enum { - MLX4_MGM_ENTRY_SIZE = 0x100, - MLX4_QP_PER_MGM = 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2), - MLX4_MTT_ENTRY_PER_SEG = 8 + MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10, + MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7, + MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12, + MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE)/16 - 2), }; enum { @@ -80,6 +108,100 @@ enum { MLX4_NUM_CMPTS = MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT }; +enum mlx4_mr_state { + MLX4_MR_DISABLED = 0, + MLX4_MR_EN_HW, + MLX4_MR_EN_SW +}; + +#define MLX4_COMM_TIME 10000 +enum { + MLX4_COMM_CMD_RESET, + MLX4_COMM_CMD_VHCR0, + MLX4_COMM_CMD_VHCR1, + MLX4_COMM_CMD_VHCR2, + MLX4_COMM_CMD_VHCR_EN, + MLX4_COMM_CMD_VHCR_POST, + MLX4_COMM_CMD_FLR = 254 +}; + +/*The flag indicates that the slave should delay the RESET cmd*/ +#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb +/*indicates how many retries will be done if we are in the middle of FLR*/ +#define NUM_OF_RESET_RETRIES 10 +#define SLEEP_TIME_IN_RESET (2 * 1000) +enum mlx4_resource { + RES_QP, + RES_CQ, + RES_SRQ, + RES_XRCD, + RES_MPT, + RES_MTT, + RES_MAC, + RES_VLAN, + RES_EQ, + RES_COUNTER, + RES_FS_RULE, + MLX4_NUM_OF_RESOURCE_TYPE +}; + +enum mlx4_alloc_mode { + RES_OP_RESERVE, + RES_OP_RESERVE_AND_MAP, + RES_OP_MAP_ICM, +}; + +enum mlx4_res_tracker_free_type { + RES_TR_FREE_ALL, + RES_TR_FREE_SLAVES_ONLY, + RES_TR_FREE_STRUCTS_ONLY, +}; + +/* + *Virtual HCR structures. + * mlx4_vhcr is the sw representation, in machine endianess + * + * mlx4_vhcr_cmd is the formalized structure, the one that is passed + * to FW to go through communication channel. + * It is big endian, and has the same structure as the physical HCR + * used by command interface + */ +struct mlx4_vhcr { + u64 in_param; + u64 out_param; + u32 in_modifier; + u32 errno; + u16 op; + u16 token; + u8 op_modifier; + u8 e_bit; +}; + +struct mlx4_vhcr_cmd { + __be64 in_param; + __be32 in_modifier; + __be64 out_param; + __be16 token; + u16 reserved; + u8 status; + u8 flags; + __be16 opcode; +}; + +struct mlx4_cmd_info { + u16 opcode; + bool has_inbox; + bool has_outbox; + bool out_is_imm; + bool encode_slave_id; + int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox); + int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +}; + #ifdef CONFIG_MLX4_DEBUG extern int mlx4_debug_level; #else /* CONFIG_MLX4_DEBUG */ @@ -87,19 +209,25 @@ extern int mlx4_debug_level; #endif /* CONFIG_MLX4_DEBUG */ #define mlx4_dbg(mdev, format, arg...) \ - do { \ - if (mlx4_debug_level) \ - dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ - } while (0) +do { \ + if (mlx4_debug_level) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ##arg); \ +} while (0) #define mlx4_err(mdev, format, arg...) \ - dev_err(&mdev->pdev->dev, format, ## arg) + dev_err(&mdev->pdev->dev, format, ##arg) #define mlx4_info(mdev, format, arg...) \ - dev_info(&mdev->pdev->dev, format, ## arg) + dev_info(&mdev->pdev->dev, format, ##arg) #define mlx4_warn(mdev, format, arg...) \ - dev_warn(&mdev->pdev->dev, format, ## arg) + dev_warn(&mdev->pdev->dev, format, ##arg) +extern int mlx4_log_num_mgm_entry_size; +extern int log_mtts_per_seg; extern int mlx4_blck_lb; +extern int mlx4_set_4k_mtu; + +#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) +#define ALL_SLAVES 0xff struct mlx4_bitmap { u32 last; @@ -115,7 +243,7 @@ struct mlx4_bitmap { struct mlx4_buddy { unsigned long **bits; unsigned int *num_free; - int max_order; + u32 max_order; spinlock_t lock; }; @@ -124,7 +252,7 @@ struct mlx4_icm; struct mlx4_icm_table { u64 virt; int num_icm; - int num_obj; + u32 num_obj; int obj_size; int lowmem; int coherent; @@ -132,6 +260,91 @@ struct mlx4_icm_table { struct mlx4_icm **icm; }; +/* + * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. + */ +struct mlx4_mpt_entry { + __be32 flags; + __be32 qpn; + __be32 key; + __be32 pd_flags; + __be64 start; + __be64 length; + __be32 lkey; + __be32 win_cnt; + u8 reserved1[3]; + u8 mtt_rep; + __be64 mtt_addr; + __be32 mtt_sz; + __be32 entity_size; + __be32 first_byte_offset; +} __packed; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mlx4_eq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + u8 log_eq_size; + u8 reserved2[4]; + u8 eq_period; + u8 reserved3; + u8 eq_max_count; + u8 reserved4[3]; + u8 intr; + u8 log_page_size; + u8 reserved5[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved6[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved7[4]; +}; + +struct mlx4_cq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + __be32 logsize_usrpage; + __be16 cq_period; + __be16 cq_max_count; + u8 reserved2[3]; + u8 comp_eqn; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + u32 reserved4[2]; + __be64 db_rec_addr; +}; + +struct mlx4_srq_context { + __be32 state_logsize_srqn; + u8 logstride; + u8 reserved1; + __be16 xrcd; + __be32 pg_offset_cqn; + u32 reserved2; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved4; + __be16 wqe_counter; + u32 reserved5; + __be64 db_rec_addr; +}; + struct mlx4_eq { struct mlx4_dev *dev; void __iomem *doorbell; @@ -140,11 +353,22 @@ struct mlx4_eq { u16 irq; u16 have_irq; int nent; - int load; + int load; struct mlx4_buf_list *page_list; struct mlx4_mtt mtt; }; +struct mlx4_slave_eqe { + u8 type; + u8 port; + u32 param; +}; + +struct mlx4_slave_event_eq_info { + int eqn; + u16 token; +}; + struct mlx4_profile { int num_qp; int rdmarc_per_qp; @@ -152,24 +376,194 @@ struct mlx4_profile { int num_cq; int num_mcg; int num_mpt; - int num_mtt; + unsigned num_mtt; }; struct mlx4_fw { u64 clr_int_base; u64 catas_offset; + u64 comm_base; + u64 clock_offset; struct mlx4_icm *fw_icm; struct mlx4_icm *aux_icm; u32 catas_size; u16 fw_pages; u8 clr_int_bar; u8 catas_bar; + u8 comm_bar; + u8 clock_bar; +}; + +struct mlx4_comm { + u32 slave_write; + u32 slave_read; +}; + +enum { + MLX4_MCAST_CONFIG = 0, + MLX4_MCAST_DISABLE = 1, + MLX4_MCAST_ENABLE = 2, +}; + +#define VLAN_FLTR_SIZE 128 + +struct mlx4_vlan_fltr { + __be32 entry[VLAN_FLTR_SIZE]; +}; + +struct mlx4_mcast_entry { + struct list_head list; + u64 addr; +}; + +struct mlx4_promisc_qp { + struct list_head list; + u32 qpn; +}; + +struct mlx4_steer_index { + struct list_head list; + unsigned int index; + struct list_head duplicates; +}; + +#define MLX4_EVENT_TYPES_NUM 64 + +struct mlx4_slave_state { + u8 comm_toggle; + u8 last_cmd; + u8 init_port_mask; + bool active; + u8 function; + dma_addr_t vhcr_dma; + u16 mtu[MLX4_MAX_PORTS + 1]; + __be32 ib_cap_mask[MLX4_MAX_PORTS + 1]; + struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES]; + struct list_head mcast_filters[MLX4_MAX_PORTS + 1]; + struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1]; + /* event type to eq number lookup */ + struct mlx4_slave_event_eq_info event_eq[MLX4_EVENT_TYPES_NUM]; + u16 eq_pi; + u16 eq_ci; + spinlock_t lock; + /*initialized via the kzalloc*/ + u8 is_slave_going_down; + u32 cookie; + enum slave_port_state port_state[MLX4_MAX_PORTS + 1]; +}; + +#define MLX4_VGT 4095 +#define NO_INDX (-1) + +struct mlx4_vport_state { + u64 mac; + u16 default_vlan; + u8 default_qos; + u32 tx_rate; + bool spoofchk; +}; + +struct mlx4_vf_admin_state { + struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_vport_oper_state { + struct mlx4_vport_state state; + int mac_idx; + int vlan_idx; +}; +struct mlx4_vf_oper_state { + struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1]; +}; + +struct slave_list { + struct mutex mutex; + struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +struct resource_allocator { + spinlock_t alloc_lock; + union { + int res_reserved; + int res_port_rsvd[MLX4_MAX_PORTS]; + }; + union { + int res_free; + int res_port_free[MLX4_MAX_PORTS]; + }; + int *quota; + int *allocated; + int *guaranteed; +}; + +struct mlx4_resource_tracker { + spinlock_t lock; + /* tree for each resources */ + struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE]; + /* num_of_slave's lists, one per slave */ + struct slave_list *slave_list; + struct resource_allocator res_alloc[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +#define SLAVE_EVENT_EQ_SIZE 128 +struct mlx4_slave_event_eq { + u32 eqn; + u32 cons; + u32 prod; + spinlock_t event_lock; + struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE]; +}; + +struct mlx4_master_qp0_state { + int proxy_qp0_active; + int qp0_active; + int port_active; +}; + +struct mlx4_mfunc_master_ctx { + struct mlx4_slave_state *slave_state; + struct mlx4_vf_admin_state *vf_admin; + struct mlx4_vf_oper_state *vf_oper; + struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1]; + int init_port_ref[MLX4_MAX_PORTS + 1]; + u16 max_mtu[MLX4_MAX_PORTS + 1]; + int disable_mcast_ref[MLX4_MAX_PORTS + 1]; + struct mlx4_resource_tracker res_tracker; + struct workqueue_struct *comm_wq; + struct work_struct comm_work; + struct work_struct slave_event_work; + struct work_struct slave_flr_event_work; + spinlock_t slave_state_lock; + __be32 comm_arm_bit_vector[4]; + struct mlx4_eqe cmd_eqe; + struct mlx4_slave_event_eq slave_eq; + struct mutex gen_eqe_mutex[MLX4_MFUNC_MAX]; +}; + +struct mlx4_mfunc { + struct mlx4_comm __iomem *comm; + struct mlx4_vhcr_cmd *vhcr; + dma_addr_t vhcr_dma; + + struct mlx4_mfunc_master_ctx master; +}; + +#define MGM_QPN_MASK 0x00FFFFFF +#define MGM_BLCK_LB_BIT 30 + +struct mlx4_mgm { + __be32 next_gid_index; + __be32 members_count; + u32 reserved[2]; + u8 gid[16]; + __be32 qp[MLX4_MAX_QP_PER_MGM]; }; struct mlx4_cmd { struct pci_pool *pool; void __iomem *hcr; struct mutex hcr_mutex; + struct mutex slave_cmd_mutex; struct semaphore poll_sem; struct semaphore event_sem; int max_cmds; @@ -179,6 +573,7 @@ struct mlx4_cmd { u16 token_mask; u8 use_events; u8 toggle; + u8 comm_toggle; }; struct mlx4_uar_table { @@ -218,6 +613,7 @@ struct mlx4_eq_table { struct mlx4_srq_table { struct mlx4_bitmap bitmap; spinlock_t lock; + struct radix_tree_root tree; struct mlx4_icm_table table; struct mlx4_icm_table cmpt_table; }; @@ -268,14 +664,59 @@ struct mlx4_vlan_table { int max; }; +#define SET_PORT_GEN_ALL_VALID 0x7 +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 + +enum { + MCAST_DIRECT_ONLY = 0, + MCAST_DIRECT = 1, + MCAST_DEFAULT = 2 +}; + + +struct mlx4_set_port_general_context { + u8 reserved[3]; + u8 flags; + u16 reserved2; + __be16 mtu; + u8 pptx; + u8 pfctx; + u16 reserved3; + u8 pprx; + u8 pfcrx; + u16 reserved4; +}; + +struct mlx4_set_port_rqp_calc_context { + __be32 base_qpn; + u8 rererved; + u8 n_mac; + u8 n_vlan; + u8 n_prio; + u8 reserved2[3]; + u8 mac_miss; + u8 intra_no_vlan; + u8 no_vlan; + u8 intra_vlan_miss; + u8 vlan_miss; + u8 reserved3[3]; + u8 no_vlan_prio; + __be32 promisc; + __be32 mcast; +}; + struct mlx4_port_info { struct mlx4_dev *dev; int port; char dev_name[16]; struct device_attribute port_attr; enum mlx4_port_type tmp_type; + char dev_mtu_name[16]; + struct device_attribute port_mtu_attr; struct mlx4_mac_table mac_table; struct mlx4_vlan_table vlan_table; + int base_qpn; }; struct mlx4_sense { @@ -283,11 +724,107 @@ struct mlx4_sense { u8 do_sense_port[MLX4_MAX_PORTS + 1]; u8 sense_allowed[MLX4_MAX_PORTS + 1]; struct delayed_work sense_poll; - struct workqueue_struct *sense_wq; - u32 resched; + struct workqueue_struct *sense_wq; + u32 resched; }; -extern struct mutex drv_mutex; +struct mlx4_msix_ctl { + u64 pool_bm; + struct mutex pool_lock; +}; + +struct mlx4_steer { + struct list_head promisc_qps[MLX4_NUM_STEERS]; + struct list_head steer_entries[MLX4_NUM_STEERS]; +}; + +struct mlx4_net_trans_rule_hw_ctrl { + __be32 ctrl; + u8 rsvd1; + u8 funcid; + u8 vep; + u8 port; + __be32 qpn; + __be32 rsvd2; +}; + +struct mlx4_net_trans_rule_hw_ib { + u8 size; + u8 rsvd1; + __be16 id; + u32 rsvd2; + __be32 r_u_qpn; + __be32 qpn_mask; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +} __packed; + +struct mlx4_net_trans_rule_hw_eth { + u8 size; + u8 rsvd; + __be16 id; + u8 rsvd1[6]; + u8 dst_mac[6]; + u16 rsvd2; + u8 dst_mac_msk[6]; + u16 rsvd3; + u8 src_mac[6]; + u16 rsvd4; + u8 src_mac_msk[6]; + u8 rsvd5; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_id_msk; + __be16 vlan_id; +} __packed; + +struct mlx4_net_trans_rule_hw_tcp_udp { + u8 size; + u8 rsvd; + __be16 id; + __be16 rsvd1[3]; + __be16 dst_port; + __be16 rsvd2; + __be16 dst_port_msk; + __be16 rsvd3; + __be16 src_port; + __be16 rsvd4; + __be16 src_port_msk; +} __packed; + +struct mlx4_net_trans_rule_hw_ipv4 { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +} __packed; + +struct _rule_hw { + union { + struct { + u8 size; + u8 rsvd; + __be16 id; + }; + struct mlx4_net_trans_rule_hw_eth eth; + struct mlx4_net_trans_rule_hw_ib ib; + struct mlx4_net_trans_rule_hw_ipv4 ipv4; + struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp; + }; +}; + +enum { + MLX4_PCI_DEV_IS_VF = 1 << 0, + MLX4_PCI_DEV_FORCE_SENSE_PORT = 1 << 1, +}; + +struct mlx4_roce_gid_entry { + u8 raw[16]; +}; struct mlx4_priv { struct mlx4_dev dev; @@ -296,11 +833,14 @@ struct mlx4_priv { struct list_head ctx_list; spinlock_t ctx_lock; + int pci_dev_data; + struct list_head pgdir_list; struct mutex pgdir_mutex; struct mlx4_fw fw; struct mlx4_cmd cmd; + struct mlx4_mfunc mfunc; struct mlx4_bitmap pd_bitmap; struct mlx4_bitmap xrcd_bitmap; @@ -312,8 +852,6 @@ struct mlx4_priv { struct mlx4_qp_table qp_table; struct mlx4_mcg_table mcg_table; struct mlx4_bitmap counters_bitmap; - struct list_head bf_list; - struct mutex bf_mutex; struct mlx4_catas_err catas_err; @@ -322,13 +860,21 @@ struct mlx4_priv { struct mlx4_uar driver_uar; void __iomem *kar; struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; - struct device_attribute trigger_attr; - int trig; - int changed_ports; struct mlx4_sense sense; struct mutex port_mutex; - int iboe_counter_index[MLX4_MAX_PORTS]; - struct io_mapping *bf_mapping; + struct mlx4_msix_ctl msix_ctl; + struct mlx4_steer *steer; + struct list_head bf_list; + struct mutex bf_mutex; + struct io_mapping *bf_mapping; + void __iomem *clock_mapping; + int reserved_mtts; + int fs_hash_mode; + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + __be64 slave_node_guids[MLX4_MFUNC_MAX]; + struct mlx4_roce_gid_entry roce_gids[MLX4_MAX_PORTS][128]; + atomic_t opreq_count; + struct work_struct opreq_task; }; static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) @@ -342,7 +888,8 @@ extern struct workqueue_struct *mlx4_wq; u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap); void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj); -u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align); +u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, + int align, u32 skip_mask); void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt); u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap); int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, @@ -365,6 +912,7 @@ int mlx4_init_srq_table(struct mlx4_dev *dev); int mlx4_init_mcg_table(struct mlx4_dev *dev); void mlx4_cleanup_pd_table(struct mlx4_dev *dev); +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev); void mlx4_cleanup_uar_table(struct mlx4_dev *dev); void mlx4_cleanup_mr_table(struct mlx4_dev *dev); void mlx4_cleanup_eq_table(struct mlx4_dev *dev); @@ -372,7 +920,65 @@ void mlx4_cleanup_cq_table(struct mlx4_dev *dev); void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); -void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); +int __mlx4_mr_reserve(struct mlx4_dev *dev); +void __mlx4_mr_release(struct mlx4_dev *dev, u32 index); +int __mlx4_mr_alloc_icm(struct mlx4_dev *dev, u32 index); +void __mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index); +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 bf_qp); +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); @@ -380,8 +986,8 @@ void mlx4_catas_init(void); int mlx4_restart_one(struct pci_dev *pdev); int mlx4_register_device(struct mlx4_dev *dev); void mlx4_unregister_device(struct mlx4_dev *dev); -void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port); -void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port); +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param); struct mlx4_dev_cap; struct mlx4_init_hca_param; @@ -390,13 +996,158 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_profile *request, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca); +void mlx4_master_comm_channel(struct work_struct *work); +void mlx4_gen_slave_eqe(struct work_struct *work); +void mlx4_master_handle_slave_flr(struct work_struct *work); + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe); int mlx4_cmd_init(struct mlx4_dev *dev); void mlx4_cmd_cleanup(struct mlx4_dev *dev); +int mlx4_multi_func_init(struct mlx4_dev *dev); +void mlx4_multi_func_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); int mlx4_cmd_use_events(struct mlx4_dev *dev); void mlx4_cmd_use_polling(struct mlx4_dev *dev); +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout); + void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn); void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type); @@ -406,13 +1157,15 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); void mlx4_handle_catas_err(struct mlx4_dev *dev); +int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type); void mlx4_do_sense_ports(struct mlx4_dev *dev, enum mlx4_port_type *stype, enum mlx4_port_type *defaults); void mlx4_start_sense(struct mlx4_dev *dev); void mlx4_stop_sense(struct mlx4_dev *dev); -int mlx4_sense_init(struct mlx4_dev *dev); void mlx4_sense_cleanup(struct mlx4_dev *dev); +int mlx4_sense_init(struct mlx4_dev *dev); int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type); int mlx4_change_port_types(struct mlx4_dev *dev, @@ -420,8 +1173,147 @@ int mlx4_change_port_types(struct mlx4_dev *dev, void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); +void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); +int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); -int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port); +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz); +/* resource tracker functions*/ +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource resource_type, + u64 resource_id, int *slave); +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id); +int mlx4_init_resource_tracker(struct mlx4_dev *dev); + +void mlx4_free_resource_tracker(struct mlx4_dev *dev, + enum mlx4_res_tracker_free_type type); + +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps); +int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, + int *gid_tbl_len, int *pkey_tbl_len); + +int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer); +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer); +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function, + int port, void *buf); +int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, u32 in_mod, + struct mlx4_cmd_mailbox *outbox); +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev); +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev); + +static inline void set_param_l(u64 *arg, u32 val) +{ + *arg = (*arg & 0xffffffff00000000ULL) | (u64) val; +} + +static inline void set_param_h(u64 *arg, u32 val) +{ + *arg = (*arg & 0xffffffff) | ((u64) val << 32); +} + +static inline u32 get_param_l(u64 *arg) +{ + return (u32) (*arg & 0xffffffff); +} + +static inline u32 get_param_h(u64 *arg) +{ + return (u32)(*arg >> 32); +} + +static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev) +{ + return &mlx4_priv(dev)->mfunc.master.res_tracker.lock; +} + +#define NOT_MASKED_PD_BITS 17 + +void sys_tune_init(void); +void sys_tune_fini(void); + +void mlx4_init_quotas(struct mlx4_dev *dev); + +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave); +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave); + #endif /* MLX4_H */ diff --git a/sys/ofed/drivers/net/mlx4/mlx4_en.h b/sys/ofed/drivers/net/mlx4/mlx4_en.h index 5b21d9395211..f3f71c79c789 100644 --- a/sys/ofed/drivers/net/mlx4/mlx4_en.h +++ b/sys/ofed/drivers/net/mlx4/mlx4_en.h @@ -568,6 +568,7 @@ enum mlx4_en_wol { MLX4_EN_WOL_DO_MODIFY = (1ULL << 63), }; + int mlx4_en_transmit(struct net_device *dev, struct mbuf *mb); void mlx4_en_qflush(struct net_device *dev); @@ -635,12 +636,12 @@ void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv); int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring); void mlx4_en_rx_irq(struct mlx4_cq *mcq); -int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); +//int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans); -int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, - u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); -int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, - u8 promisc); +//int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, +// u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); +//int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, +// u8 promisc); int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset); int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port); diff --git a/sys/ofed/drivers/net/mlx4/mr.c b/sys/ofed/drivers/net/mlx4/mr.c index 9ed610aa4cfb..3daa995190cb 100644 --- a/sys/ofed/drivers/net/mlx4/mr.c +++ b/sys/ofed/drivers/net/mlx4/mr.c @@ -34,34 +34,15 @@ #include #include +#include +#include +#include #include #include "mlx4.h" #include "icm.h" -/* - * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. - */ -struct mlx4_mpt_entry { - __be32 flags; - __be32 qpn; - __be32 key; - __be32 pd_flags; - __be64 start; - __be64 length; - __be32 lkey; - __be32 win_cnt; - u8 reserved1; - u8 flags2; - u8 reserved2; - u8 mtt_rep; - __be64 mtt_seg; - __be32 mtt_sz; - __be32 entity_size; - __be32 first_byte_offset; -} __attribute__((packed)); - #define MLX4_MPT_FLAG_SW_OWNS (0xfUL << 28) #define MLX4_MPT_FLAG_FREE (0x3UL << 28) #define MLX4_MPT_FLAG_MIO (1 << 17) @@ -73,8 +54,6 @@ struct mlx4_mpt_entry { #define MLX4_MPT_PD_FLAG_RAE (1 << 28) #define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) -#define MLX4_MPT_FLAG2_FBO_EN (1 << 7) - #define MLX4_MPT_STATUS_SW 0xF0 #define MLX4_MPT_STATUS_HW 0x00 @@ -141,19 +120,19 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) buddy->max_order = max_order; spin_lock_init(&buddy->lock); - buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), + buddy->bits = kcalloc(buddy->max_order + 1, sizeof (long *), GFP_KERNEL); - buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *), + buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, GFP_KERNEL); if (!buddy->bits || !buddy->num_free) goto err_out; for (i = 0; i <= buddy->max_order; ++i) { s = BITS_TO_LONGS(1 << (buddy->max_order - i)); - buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL); - if (!buddy->bits[i]) - goto err_out_free; - bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i)); + buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN); + if (!buddy->bits[i]) { + goto err_out_free; + } } set_bit(0, buddy->bits[buddy->max_order]); @@ -163,7 +142,8 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) err_out_free: for (i = 0; i <= buddy->max_order; ++i) - kfree(buddy->bits[i]); + if ( buddy->bits[i] ) + kfree(buddy->bits[i]); err_out: kfree(buddy->bits); @@ -177,28 +157,54 @@ static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy) int i; for (i = 0; i <= buddy->max_order; ++i) - kfree(buddy->bits[i]); + kfree(buddy->bits[i]); kfree(buddy->bits); kfree(buddy->num_free); } -static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; u32 seg; + int seg_order; + u32 offset; - seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, order); + seg_order = max_t(int, order - log_mtts_per_seg, 0); + + seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, seg_order); if (seg == -1) return -1; - if (mlx4_table_get_range(dev, &mr_table->mtt_table, seg, - seg + (1 << order) - 1)) { - mlx4_buddy_free(&mr_table->mtt_buddy, seg, order); + offset = seg * (1 << log_mtts_per_seg); + + if (mlx4_table_get_range(dev, &mr_table->mtt_table, offset, + offset + (1 << order) - 1)) { + mlx4_buddy_free(&mr_table->mtt_buddy, seg, seg_order); return -1; } - return seg; + return offset; +} + +static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +{ + u64 in_param = 0; + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, order); + err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (err) + return -1; + return get_param_l(&out_param); + } + return __mlx4_alloc_mtt_range(dev, order); } int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, @@ -213,33 +219,66 @@ int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, } else mtt->page_shift = page_shift; - for (mtt->order = 0, i = dev->caps.mtts_per_seg; i < npages; i <<= 1) + for (mtt->order = 0, i = 1; i < npages; i <<= 1) ++mtt->order; - mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order); - if (mtt->first_seg == -1) + mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order); + if (mtt->offset == -1) { + mlx4_err(dev, "Failed to allocate mtts for %d pages(order %d)\n", + npages, mtt->order); return -ENOMEM; + } return 0; } EXPORT_SYMBOL_GPL(mlx4_mtt_init); -void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order) { + u32 first_seg; + int seg_order; struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + seg_order = max_t(int, order - log_mtts_per_seg, 0); + first_seg = offset / (1 << log_mtts_per_seg); + + mlx4_buddy_free(&mr_table->mtt_buddy, first_seg, seg_order); + mlx4_table_put_range(dev, &mr_table->mtt_table, offset, + offset + (1 << order) - 1); +} + +static void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, offset); + set_param_h(&in_param, order); + err = mlx4_cmd(dev, in_param, RES_MTT, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to free mtt range at:" + "%d order:%d\n", offset, order); + return; + } + __mlx4_free_mtt_range(dev, offset, order); +} + +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +{ if (mtt->order < 0) return; - mlx4_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, mtt->order); - mlx4_table_put_range(dev, &mr_table->mtt_table, mtt->first_seg, - mtt->first_seg + (1 << mtt->order) - 1); + mlx4_free_mtt_range(dev, mtt->offset, mtt->order); } EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup); u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt) { - return (u64) mtt->first_seg * dev->caps.mtt_entry_sz; + return (u64) mtt->offset * dev->caps.mtt_entry_sz; } EXPORT_SYMBOL_GPL(mlx4_mtt_addr); @@ -256,40 +295,20 @@ static u32 key_to_hw_index(u32 key) static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int mpt_index) { - return mlx4_cmd(dev, mailbox->dma, mpt_index, 0, MLX4_CMD_SW2HW_MPT, - MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, mailbox->dma, mpt_index, + 0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); } static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int mpt_index) { return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index, - !mailbox, MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B); + !mailbox, MLX4_CMD_HW2SW_MPT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); } -int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - u32 mridx; - - mridx = mlx4_bitmap_alloc_range(&priv->mr_table.mpt_bitmap, cnt, align); - if (mridx == -1) - return -ENOMEM; - - *base_mridx = mridx; - return 0; - -} -EXPORT_SYMBOL_GPL(mlx4_mr_reserve_range); - -void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - mlx4_bitmap_free_range(&priv->mr_table.mpt_bitmap, base_mridx, cnt); -} -EXPORT_SYMBOL_GPL(mlx4_mr_release_range); - -int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, +static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr) { @@ -297,65 +316,159 @@ int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, mr->size = size; mr->pd = pd; mr->access = access; - mr->enabled = 0; + mr->enabled = MLX4_MR_DISABLED; mr->key = hw_index_to_key(mridx); return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); } -EXPORT_SYMBOL_GPL(mlx4_mr_alloc_reserved); + +static int mlx4_WRITE_MTT(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox, + int num_entries) +{ + return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int __mlx4_mr_reserve(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap); +} + +static int mlx4_mr_reserve(struct mlx4_dev *dev) +{ + u64 out_param; + + if (mlx4_is_mfunc(dev)) { + if (mlx4_cmd_imm(dev, 0, &out_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + return -1; + return get_param_l(&out_param); + } + return __mlx4_mr_reserve(dev); +} + +void __mlx4_mr_release(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index); +} + +static void mlx4_mr_release(struct mlx4_dev *dev, u32 index) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to release mr index:%d\n", + index); + return; + } + __mlx4_mr_release(dev, index); +} + +int __mlx4_mr_alloc_icm(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + return mlx4_table_get(dev, &mr_table->dmpt_table, index); +} + +static int mlx4_mr_alloc_icm(struct mlx4_dev *dev, u32 index) +{ + u64 param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, index); + return mlx4_cmd_imm(dev, param, ¶m, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + } + return __mlx4_mr_alloc_icm(dev, index); +} + +void __mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + mlx4_table_put(dev, &mr_table->dmpt_table, index); +} + +static void mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to free icm of mr index:%d\n", + index); + return; + } + return __mlx4_mr_free_icm(dev, index); +} int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr) { - struct mlx4_priv *priv = mlx4_priv(dev); u32 index; int err; - index = mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap); + index = mlx4_mr_reserve(dev); if (index == -1) return -ENOMEM; err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size, access, npages, page_shift, mr); if (err) - mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index); + mlx4_mr_release(dev, index); return err; } EXPORT_SYMBOL_GPL(mlx4_mr_alloc); -void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr) +static void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr) { int err; - if (mr->enabled) { + if (mr->enabled == MLX4_MR_EN_HW) { err = mlx4_HW2SW_MPT(dev, NULL, key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1)); if (err) - mlx4_warn(dev, "HW2SW_MPT failed (%d)\n", err); - } + mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err); + mr->enabled = MLX4_MR_EN_SW; + } mlx4_mtt_cleanup(dev, &mr->mtt); } -EXPORT_SYMBOL_GPL(mlx4_mr_free_reserved); void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) { - struct mlx4_priv *priv = mlx4_priv(dev); mlx4_mr_free_reserved(dev, mr); - mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, key_to_hw_index(mr->key)); + if (mr->enabled) + mlx4_mr_free_icm(dev, key_to_hw_index(mr->key)); + mlx4_mr_release(dev, key_to_hw_index(mr->key)); } EXPORT_SYMBOL_GPL(mlx4_mr_free); int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) { - struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_table_get(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key)); + err = mlx4_mr_alloc_icm(dev, key_to_hw_index(mr->key)); if (err) return err; @@ -380,9 +493,10 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) if (mr->mtt.order < 0) { mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); - mpt_entry->mtt_seg = 0; + mpt_entry->mtt_addr = 0; } else { - mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt)); + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); } if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { @@ -390,8 +504,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | MLX4_MPT_PD_FLAG_RAE); - mpt_entry->mtt_sz = cpu_to_be32((1 << mr->mtt.order) * - dev->caps.mtts_per_seg); + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); } else { mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); } @@ -402,8 +515,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err); goto err_cmd; } - - mr->enabled = 1; + mr->enabled = MLX4_MR_EN_HW; mlx4_free_cmd_mailbox(dev, mailbox); @@ -413,7 +525,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mlx4_free_cmd_mailbox(dev, mailbox); err_table: - mlx4_table_put(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key)); + mlx4_mr_free_icm(dev, key_to_hw_index(mr->key)); return err; } EXPORT_SYMBOL_GPL(mlx4_mr_enable); @@ -425,50 +537,94 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, __be64 *mtts; dma_addr_t dma_handle; int i; - int s = start_index * sizeof (u64); - /* All MTTs must fit in the same page */ - if (start_index / (PAGE_SIZE / sizeof (u64)) != - (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64))) - return -EINVAL; + mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->offset + + start_index, &dma_handle); - if (start_index & (dev->caps.mtts_per_seg - 1)) - return -EINVAL; - - mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg + - s / dev->caps.mtt_entry_sz, &dma_handle); if (!mtts) return -ENOMEM; + dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + npages * sizeof (u64), DMA_TO_DEVICE); + for (i = 0; i < npages; ++i) mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single(&dev->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); + dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + npages * sizeof (u64), DMA_TO_DEVICE); return 0; } +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + int err = 0; + int chunk; + int mtts_per_page; + int max_mtts_first_page; + + /* compute how may mtts fit in the first page */ + mtts_per_page = PAGE_SIZE / sizeof(u64); + max_mtts_first_page = mtts_per_page - (mtt->offset + start_index) + % mtts_per_page; + + chunk = min_t(int, max_mtts_first_page, npages); + + while (npages > 0) { + err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); + if (err) + return err; + npages -= chunk; + start_index += chunk; + page_list += chunk; + + chunk = min_t(int, mtts_per_page, npages); + } + return err; +} + int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list) { + struct mlx4_cmd_mailbox *mailbox = NULL; + __be64 *inbox = NULL; int chunk; - int err; + int err = 0; + int i; if (mtt->order < 0) return -EINVAL; - while (npages > 0) { - chunk = min_t(int, PAGE_SIZE / sizeof(u64), npages); - err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); - if (err) - return err; + if (mlx4_is_mfunc(dev)) { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; - npages -= chunk; - start_index += chunk; - page_list += chunk; + while (npages > 0) { + chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - 2, + npages); + inbox[0] = cpu_to_be64(mtt->offset + start_index); + inbox[1] = 0; + for (i = 0; i < chunk; ++i) + inbox[i + 2] = cpu_to_be64(page_list[i] | + MLX4_MTT_FLAG_PRESENT); + err = mlx4_WRITE_MTT(dev, mailbox, chunk); + if (err) { + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + npages -= chunk; + start_index += chunk; + page_list += chunk; + } + mlx4_free_cmd_mailbox(dev, mailbox); + return err; } - return 0; + return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list); } EXPORT_SYMBOL_GPL(mlx4_write_mtt); @@ -484,7 +640,7 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, return -ENOMEM; for (i = 0; i < buf->npages; ++i) - if (buf->direct.map) + if (buf->nbufs == 1) page_list[i] = buf->direct.map + (i << buf->page_shift); else page_list[i] = buf->page_list[i].map; @@ -498,9 +654,15 @@ EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt); int mlx4_init_mr_table(struct mlx4_dev *dev) { - struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; int err; + /* Nothing to do for slaves - all MR handling is forwarded + * to the master */ + if (mlx4_is_slave(dev)) + return 0; + if (!is_power_of_2(dev->caps.num_mpts)) return -EINVAL; @@ -510,13 +672,17 @@ int mlx4_init_mr_table(struct mlx4_dev *dev) return err; err = mlx4_buddy_init(&mr_table->mtt_buddy, - ilog2(dev->caps.num_mtt_segs)); + ilog2((u32)dev->caps.num_mtts / + (1 << log_mtts_per_seg))); if (err) goto err_buddy; if (dev->caps.reserved_mtts) { - if (mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)) == -1) { - mlx4_warn(dev, "MTT table of order %d is too small.\n", + priv->reserved_mtts = + mlx4_alloc_mtt_range(dev, + fls(dev->caps.reserved_mtts - 1)); + if (priv->reserved_mtts < 0) { + mlx4_warn(dev, "MTT table of order %u is too small.\n", mr_table->mtt_buddy.max_order); err = -ENOMEM; goto err_reserve_mtts; @@ -536,8 +702,14 @@ int mlx4_init_mr_table(struct mlx4_dev *dev) void mlx4_cleanup_mr_table(struct mlx4_dev *dev) { - struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + if (mlx4_is_slave(dev)) + return; + if (priv->reserved_mtts >= 0) + mlx4_free_mtt_range(dev, priv->reserved_mtts, + fls(dev->caps.reserved_mtts - 1)); mlx4_buddy_cleanup(&mr_table->mtt_buddy); mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); } @@ -569,9 +741,8 @@ static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, return 0; } -int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, - u64 *page_list, int npages, u64 iova, u32 fbo, - u32 len, u32 *lkey, u32 *rkey, int same_key) +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey) { u32 key; int i, err; @@ -583,8 +754,7 @@ int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, ++fmr->maps; key = key_to_hw_index(fmr->mr.key); - if (!same_key) - key += dev->caps.num_mpts; + key += dev->caps.num_mpts; *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; @@ -592,18 +762,19 @@ int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, /* Make sure MPT status is visible before writing MTT entries */ wmb(); + dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + for (i = 0; i < npages; ++i) fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single(&dev->pdev->dev, fmr->dma_handle, - npages * sizeof(u64), DMA_TO_DEVICE); + dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); fmr->mpt->key = cpu_to_be32(key); fmr->mpt->lkey = cpu_to_be32(key); - fmr->mpt->length = cpu_to_be64(len); + fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); fmr->mpt->start = cpu_to_be64(iova); - fmr->mpt->first_byte_offset = cpu_to_be32(fbo & 0x001fffff); - fmr->mpt->flags2 = (fbo ? MLX4_MPT_FLAG2_FBO_EN : 0); /* Make MTT entries are visible before setting MPT status */ wmb(); @@ -615,25 +786,17 @@ int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, return 0; } -EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr_fbo); - -int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova, u32 *lkey, u32 *rkey) -{ - u32 len = npages * (1ull << fmr->page_shift); - - return mlx4_map_phys_fmr_fbo(dev, fmr, page_list, npages, iova, 0, - len, lkey, rkey, 0); -} EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, int max_maps, u8 page_shift, struct mlx4_fmr *fmr) { struct mlx4_priv *priv = mlx4_priv(dev); - u64 mtt_seg; int err = -ENOMEM; + if (max_maps > dev->caps.max_fmr_maps) + return -EINVAL; + if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) return -EINVAL; @@ -651,11 +814,10 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, if (err) return err; - mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz; - fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, - fmr->mr.mtt.first_seg, + fmr->mr.mtt.offset, &fmr->dma_handle); + if (!fmr->mtts) { err = -ENOMEM; goto err_free; @@ -669,49 +831,6 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, } EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); -int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, - u32 pd, u32 access, int max_pages, - int max_maps, u8 page_shift, struct mlx4_fmr *fmr) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - u64 mtt_seg; - int err = -ENOMEM; - - if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) - return -EINVAL; - - /* All MTTs must fit in the same page */ - if (max_pages * sizeof *fmr->mtts > PAGE_SIZE) - return -EINVAL; - - fmr->page_shift = page_shift; - fmr->max_pages = max_pages; - fmr->max_maps = max_maps; - fmr->maps = 0; - - err = mlx4_mr_alloc_reserved(dev, mridx, pd, 0, 0, access, max_pages, - page_shift, &fmr->mr); - if (err) - return err; - - mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz; - - fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, - fmr->mr.mtt.first_seg, - &fmr->dma_handle); - if (!fmr->mtts) { - err = -ENOMEM; - goto err_free; - } - - return 0; - -err_free: - mlx4_mr_free_reserved(dev, &fmr->mr); - return err; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_alloc_reserved); - int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -733,12 +852,30 @@ EXPORT_SYMBOL_GPL(mlx4_fmr_enable); void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey) { + struct mlx4_cmd_mailbox *mailbox; + int err; + if (!fmr->maps) return; fmr->maps = 0; - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + mlx4_warn(dev, "mlx4_alloc_cmd_mailbox failed (%d)\n", err); + return; + } + + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(fmr->mr.key) & + (dev->caps.num_mpts - 1)); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) { + mlx4_warn(dev, "mlx4_HW2SW_MPT failed (%d)\n", err); + return; + } + fmr->mr.enabled = MLX4_MR_EN_SW; } EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); @@ -747,27 +884,16 @@ int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) if (fmr->maps) return -EBUSY; - fmr->mr.enabled = 0; mlx4_mr_free(dev, &fmr->mr); + fmr->mr.enabled = MLX4_MR_DISABLED; return 0; } EXPORT_SYMBOL_GPL(mlx4_fmr_free); -int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr) -{ - if (fmr->maps) - return -EBUSY; - - fmr->mr.enabled = 0; - mlx4_mr_free_reserved(dev, &fmr->mr); - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_free_reserved); - int mlx4_SYNC_TPT(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000, + MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/sys/ofed/drivers/net/mlx4/pd.c b/sys/ofed/drivers/net/mlx4/pd.c index cce922671273..91f4b85fea65 100644 --- a/sys/ofed/drivers/net/mlx4/pd.c +++ b/sys/ofed/drivers/net/mlx4/pd.c @@ -62,12 +62,66 @@ void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn) } EXPORT_SYMBOL_GPL(mlx4_pd_free); +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + *xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap); + if (*xrcdn == -1) + return -ENOMEM; + + return 0; +} + +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, + RES_XRCD, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + + *xrcdn = get_param_l(&out_param); + return 0; + } + return __mlx4_xrcd_alloc(dev, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc); + +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn); +} + +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, xrcdn); + err = mlx4_cmd(dev, in_param, RES_XRCD, + RES_OP_RESERVE, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn); + } else + __mlx4_xrcd_free(dev, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_free); + int mlx4_init_pd_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds, - (1 << 24) - 1, dev->caps.reserved_pds, 0); + (1 << NOT_MASKED_PD_BITS) - 1, + dev->caps.reserved_pds, 0); } void mlx4_cleanup_pd_table(struct mlx4_dev *dev) @@ -75,16 +129,34 @@ void mlx4_cleanup_pd_table(struct mlx4_dev *dev) mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap); } +int mlx4_init_xrcd_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16), + (1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0); +} + +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap); +} int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) { + int offset; + uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap); if (uar->index == -1) return -ENOMEM; - uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index; + if (mlx4_is_slave(dev)) + offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) / + dev->caps.uar_page_size); + else + offset = uar->index; + uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset; uar->map = NULL; - return 0; } EXPORT_SYMBOL_GPL(mlx4_uar_alloc); @@ -95,7 +167,8 @@ void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar) } EXPORT_SYMBOL_GPL(mlx4_uar_free); -int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf) +#ifndef CONFIG_PPC +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_uar *uar; @@ -113,10 +186,13 @@ int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf) err = -ENOMEM; goto out; } - uar = kmalloc(sizeof *uar, GFP_KERNEL); + uar = kmalloc_node(sizeof *uar, GFP_KERNEL, node); if (!uar) { - err = -ENOMEM; - goto out; + uar = kmalloc(sizeof *uar, GFP_KERNEL); + if (!uar) { + err = -ENOMEM; + goto out; + } } err = mlx4_uar_alloc(dev, uar); if (err) @@ -191,6 +267,21 @@ void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf) } EXPORT_SYMBOL_GPL(mlx4_bf_free); +#else +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) +{ + memset(bf, 0, sizeof *bf); + return -ENOSYS; +} +EXPORT_SYMBOL_GPL(mlx4_bf_alloc); + +void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf) +{ + return; +} +EXPORT_SYMBOL_GPL(mlx4_bf_free); +#endif + int mlx4_init_uar_table(struct mlx4_dev *dev) { if (dev->caps.num_uars <= 128) { @@ -202,7 +293,7 @@ int mlx4_init_uar_table(struct mlx4_dev *dev) return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap, dev->caps.num_uars, dev->caps.num_uars - 1, - max(128, dev->caps.reserved_uars), 0); + dev->caps.reserved_uars, 0); } void mlx4_cleanup_uar_table(struct mlx4_dev *dev) diff --git a/sys/ofed/drivers/net/mlx4/port.c b/sys/ofed/drivers/net/mlx4/port.c index c8df375a41ae..2a009ead879e 100644 --- a/sys/ofed/drivers/net/mlx4/port.c +++ b/sys/ofed/drivers/net/mlx4/port.c @@ -34,19 +34,26 @@ #include #include - +#include #include "mlx4.h" -int mlx4_ib_set_4k_mtu = 0; -module_param_named(set_4k_mtu, mlx4_ib_set_4k_mtu, int, 0444); -MODULE_PARM_DESC(set_4k_mtu, "attempt to set 4K MTU to all ConnectX ports"); +int mlx4_set_4k_mtu = -1; +module_param_named(set_4k_mtu, mlx4_set_4k_mtu, int, 0444); +MODULE_PARM_DESC(set_4k_mtu, + "(Obsolete) attempt to set 4K MTU to all ConnectX ports"); + #define MLX4_MAC_VALID (1ull << 63) -#define MLX4_MAC_MASK 0xffffffffffffULL #define MLX4_VLAN_VALID (1u << 31) #define MLX4_VLAN_MASK 0xfff +#define MLX4_STATS_TRAFFIC_COUNTERS_MASK 0xfULL +#define MLX4_STATS_TRAFFIC_DROPS_MASK 0xc0ULL +#define MLX4_STATS_ERROR_COUNTERS_MASK 0x1ffc30ULL +#define MLX4_STATS_PORT_COUNTERS_MASK 0x1fe00000ULL +#define MLX4_STATS_IF_RX_ERRORS_COUNTERS_MASK 0x8010ULL + void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) { int i; @@ -69,10 +76,36 @@ void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) table->entries[i] = 0; table->refs[i] = 0; } - table->max = 1 << dev->caps.log_num_vlans; + table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; table->total = 0; } +static int validate_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, int index) +{ + int err = 0; + + if (index < 0 || index >= table->max || !table->entries[index]) { + mlx4_warn(dev, "No valid Mac entry for the given index\n"); + err = -EINVAL; + } + return err; +} + +static int find_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, u64 mac) +{ + int i; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if ((mac & MLX4_MAC_MASK) == + (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) + return i; + } + /* Mac not found */ + return -EINVAL; +} + static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, __be64 *entries) { @@ -87,40 +120,39 @@ static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE); in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } -int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index) +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) { - struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table; + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; int i, err = 0; int free = -1; - mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac); + mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n", + (unsigned long long) mac, port); + mutex_lock(&table->mutex); - for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) { - if (free < 0 && !table->refs[i]) { + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (free < 0 && !table->entries[i]) { free = i; continue; } if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { - /* MAC already registered, increase refernce count */ - *index = i; + /* MAC already registered, Must not have duplicates */ + err = i; ++table->refs[i]; goto out; } } - if (free < 0) { - err = -ENOMEM; - goto out; - } - mlx4_dbg(dev, "Free MAC index is %d\n", free); if (table->total == table->max) { @@ -130,47 +162,128 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index) } /* Register new MAC */ - table->refs[free] = 1; table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); err = mlx4_set_port_mac_table(dev, port, table->entries); if (unlikely(err)) { - mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac); - table->refs[free] = 0; + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", + (unsigned long long) mac); table->entries[free] = 0; goto out; } + table->refs[free] = 1; - *index = free; + err = free; ++table->total; out: mutex_unlock(&table->mutex); return err; } +EXPORT_SYMBOL_GPL(__mlx4_register_mac); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + u64 out_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, mac, &out_param, + ((u32) port) << 8 | (u32) RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + + return get_param_l(&out_param); + } + return __mlx4_register_mac(dev, port, mac); +} EXPORT_SYMBOL_GPL(mlx4_register_mac); -void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index) +int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port) { - struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table; + return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + + (port - 1) * (1 << dev->caps.log_num_macs); +} +EXPORT_SYMBOL_GPL(mlx4_get_base_qpn); +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + struct mlx4_port_info *info; + struct mlx4_mac_table *table; + int index; + + if (port < 1 || port > dev->caps.num_ports) { + mlx4_warn(dev, "invalid port number (%d), aborting...\n", port); + return; + } + info = &mlx4_priv(dev)->port[port]; + table = &info->mac_table; mutex_lock(&table->mutex); - if (!table->refs[index]) { - mlx4_warn(dev, "No MAC entry for index %d\n", index); + + index = find_index(dev, table, mac); + + if (validate_index(dev, table, index)) goto out; - } + if (--table->refs[index]) { - mlx4_warn(dev, "Have more references for index %d," - "no need to modify MAC table\n", index); + mlx4_dbg(dev, "Have more references for index %d," + "no need to modify mac table\n", index); goto out; } + table->entries[index] = 0; mlx4_set_port_mac_table(dev, port, table->entries); --table->total; out: mutex_unlock(&table->mutex); } +EXPORT_SYMBOL_GPL(__mlx4_unregister_mac); + +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + u64 out_param = 0; + + if (mlx4_is_mfunc(dev)) { + (void) mlx4_cmd_imm(dev, mac, &out_param, + ((u32) port) << 8 | (u32) RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + return; + } + __mlx4_unregister_mac(dev, port, mac); + return; +} EXPORT_SYMBOL_GPL(mlx4_unregister_mac); +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int index = qpn - info->base_qpn; + int err = 0; + + /* CX1 doesn't support multi-functions */ + mutex_lock(&table->mutex); + + err = validate_index(dev, table, index); + if (err) + goto out; + + table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", + (unsigned long long) new_mac); + table->entries[index] = 0; + } +out: + mutex_unlock(&table->mutex); + return err; +} +EXPORT_SYMBOL_GPL(__mlx4_replace_mac); + static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, __be32 *entries) { @@ -185,7 +298,7 @@ static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE); in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); @@ -201,7 +314,7 @@ int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) if (table->refs[i] && (vid == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { - /* Vlan already registered, increase refernce count */ + /* VLAN already registered, increase reference count */ *idx = i; return 0; } @@ -211,13 +324,21 @@ int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) } EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan); -int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) +int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, + int *index) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int i, err = 0; int free = -1; mutex_lock(&table->mutex); + + if (table->total == table->max) { + /* No free vlan entries */ + err = -ENOSPC; + goto out; + } + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { if (free < 0 && (table->refs[i] == 0)) { free = i; @@ -227,7 +348,7 @@ int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) if (table->refs[i] && (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { - /* Vlan already registered, increase refernce count */ + /* Vlan already registered, increase references count */ *index = i; ++table->refs[i]; goto out; @@ -239,13 +360,7 @@ int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) goto out; } - if (table->total == table->max) { - /* No free vlan entries */ - err = -ENOSPC; - goto out; - } - - /* Register new MAC */ + /* Register new VLAN */ table->refs[free] = 1; table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); @@ -263,25 +378,49 @@ int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) mutex_unlock(&table->mutex); return err; } + +int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) +{ + u64 out_param = 0; + int err; + + if (vlan > 4095) + return -EINVAL; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, vlan, &out_param, + ((u32) port) << 8 | (u32) RES_VLAN, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *index = get_param_l(&out_param); + + return err; + } + return __mlx4_register_vlan(dev, port, vlan, index); +} EXPORT_SYMBOL_GPL(mlx4_register_vlan); -void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index) +void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int index; + + mutex_lock(&table->mutex); + if (mlx4_find_cached_vlan(dev, port, vlan, &index)) { + mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan); + goto out; + } if (index < MLX4_VLAN_REGULAR) { mlx4_warn(dev, "Trying to free special vlan index %d\n", index); - return; - } - - mutex_lock(&table->mutex); - if (!table->refs[index]) { - mlx4_warn(dev, "No vlan entry for index %d\n", index); goto out; } + if (--table->refs[index]) { - mlx4_dbg(dev, "Have more references for index %d," - "no need to modify vlan table\n", index); + mlx4_dbg(dev, "Have %d more references for index %d, " + "no need to modify vlan table\n", table->refs[index], + index); goto out; } table->entries[index] = 0; @@ -290,6 +429,21 @@ void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index) out: mutex_unlock(&table->mutex); } + +void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) +{ + u64 out_param = 0; + + if (mlx4_is_mfunc(dev)) { + (void) mlx4_cmd_imm(dev, vlan, &out_param, + ((u32) port) << 8 | (u32) RES_VLAN, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + return; + } + __mlx4_unregister_vlan(dev, port, vlan); +} EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) @@ -320,20 +474,275 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, - MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); if (!err) *caps = *(__be32 *) (outbuf + 84); mlx4_free_cmd_mailbox(dev, inmailbox); mlx4_free_cmd_mailbox(dev, outmailbox); return err; } +static struct mlx4_roce_gid_entry zgid_entry; -int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave) +{ + if (slave == 0) + return MLX4_ROCE_PF_GIDS; + if (slave <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % dev->num_vfs)) + return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs) + 1; + return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs; +} + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave) +{ + int gids; + int vfs; + + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = dev->num_vfs; + + if (slave == 0) + return 0; + if (slave <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); +} + +static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, + u8 op_mod, struct mlx4_cmd_mailbox *inbox) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_port_info *port_info; + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + struct mlx4_slave_state *slave_st = &master->slave_state[slave]; + struct mlx4_set_port_rqp_calc_context *qpn_context; + struct mlx4_set_port_general_context *gen_context; + struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1; + int reset_qkey_viols; + int port; + int is_eth; + int num_gids; + int base; + u32 in_modifier; + u32 promisc; + u16 mtu, prev_mtu; + int err; + int i, j; + int offset; + __be32 agg_cap_mask; + __be32 slave_cap_mask; + __be32 new_cap_mask; + + port = in_mod & 0xff; + in_modifier = in_mod >> 8; + is_eth = op_mod; + port_info = &priv->port[port]; + + /* Slaves cannot perform SET_PORT operations except changing MTU */ + if (is_eth) { + if (slave != dev->caps.function && + in_modifier != MLX4_SET_PORT_GENERAL && + in_modifier != MLX4_SET_PORT_GID_TABLE) { + mlx4_warn(dev, "denying SET_PORT for slave:%d\n", + slave); + return -EINVAL; + } + switch (in_modifier) { + case MLX4_SET_PORT_RQP_CALC: + qpn_context = inbox->buf; + qpn_context->base_qpn = + cpu_to_be32(port_info->base_qpn); + qpn_context->n_mac = 0x7; + promisc = be32_to_cpu(qpn_context->promisc) >> + SET_PORT_PROMISC_SHIFT; + qpn_context->promisc = cpu_to_be32( + promisc << SET_PORT_PROMISC_SHIFT | + port_info->base_qpn); + promisc = be32_to_cpu(qpn_context->mcast) >> + SET_PORT_MC_PROMISC_SHIFT; + qpn_context->mcast = cpu_to_be32( + promisc << SET_PORT_MC_PROMISC_SHIFT | + port_info->base_qpn); + break; + case MLX4_SET_PORT_GENERAL: + gen_context = inbox->buf; + /* Mtu is configured as the max MTU among all the + * the functions on the port. */ + mtu = be16_to_cpu(gen_context->mtu); + mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port]); + prev_mtu = slave_st->mtu[port]; + slave_st->mtu[port] = mtu; + if (mtu > master->max_mtu[port]) + master->max_mtu[port] = mtu; + if (mtu < prev_mtu && prev_mtu == + master->max_mtu[port]) { + slave_st->mtu[port] = mtu; + master->max_mtu[port] = mtu; + for (i = 0; i < dev->num_slaves; i++) { + master->max_mtu[port] = + max(master->max_mtu[port], + master->slave_state[i].mtu[port]); + } + } + + gen_context->mtu = cpu_to_be16(master->max_mtu[port]); + break; + case MLX4_SET_PORT_GID_TABLE: + /* change to MULTIPLE entries: number of guest's gids + * need a FOR-loop here over number of gids the guest has. + * 1. Check no duplicates in gids passed by slave + */ + num_gids = mlx4_get_slave_num_gids(dev, slave); + base = mlx4_get_base_gid_ix(dev, slave); + gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + for (i = 0; i < num_gids; gid_entry_mbox++, i++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + gid_entry_mb1 = gid_entry_mbox + 1; + for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) { + if (!memcmp(gid_entry_mb1->raw, + zgid_entry.raw, sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw, + sizeof(gid_entry_mbox->raw))) { + /* found duplicate */ + return -EINVAL; + } + } + } + + /* 2. Check that do not have duplicates in OTHER + * entries in the port GID table + */ + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (i >= base && i < base + num_gids) + continue; /* don't compare to slave's current gids */ + gid_entry_tbl = &priv->roce_gids[port - 1][i]; + if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry))) + continue; + gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + for (j = 0; j < num_gids; gid_entry_mbox++, j++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw, + sizeof(gid_entry_tbl->raw))) { + /* found duplicate */ + mlx4_warn(dev, "requested gid entry for slave:%d " + "is a duplicate of gid at index %d\n", + slave, i); + return -EINVAL; + } + } + } + + /* insert slave GIDs with memcpy, starting at slave's base index */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++) + memcpy(priv->roce_gids[port - 1][offset].raw, gid_entry_mbox->raw, 16); + + /* Now, copy roce port gids table to current mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, priv->roce_gids[port - 1][i].raw, 16); + + break; + } + return mlx4_cmd(dev, inbox->dma, in_mod, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + } + + /* For IB, we only consider: + * - The capability mask, which is set to the aggregate of all + * slave function capabilities + * - The QKey violatin counter - reset according to each request. + */ + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40; + new_cap_mask = ((__be32 *) inbox->buf)[2]; + } else { + reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1; + new_cap_mask = ((__be32 *) inbox->buf)[1]; + } + + /* slave may not set the IS_SM capability for the port */ + if (slave != mlx4_master_func_num(dev) && + (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM)) + return -EINVAL; + + /* No DEV_MGMT in multifunc mode */ + if (mlx4_is_mfunc(dev) && + (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP)) + return -EINVAL; + + agg_cap_mask = 0; + slave_cap_mask = + priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask; + for (i = 0; i < dev->num_slaves; i++) + agg_cap_mask |= + priv->mfunc.master.slave_state[i].ib_cap_mask[port]; + + /* only clear mailbox for guests. Master may be setting + * MTU or PKEY table size + */ + if (slave != dev->caps.function) + memset(inbox->buf, 0, 256); + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) inbox->buf |= !!reset_qkey_viols << 6; + ((__be32 *) inbox->buf)[2] = agg_cap_mask; + } else { + ((u8 *) inbox->buf)[3] |= !!reset_qkey_viols; + ((__be32 *) inbox->buf)[1] = agg_cap_mask; + } + + err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = + slave_cap_mask; + return err; +} + +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return mlx4_common_set_port(dev, slave, vhcr->in_modifier, + vhcr->op_modifier, inbox); +} + +/* bit locations for set port command with zero op modifier */ +enum { + MLX4_SET_PORT_VL_CAP = 4, /* bits 7:4 */ + MLX4_SET_PORT_MTU_CAP = 12, /* bits 15:12 */ + MLX4_CHANGE_PORT_PKEY_TBL_SZ = 20, + MLX4_CHANGE_PORT_VL_CAP = 21, + MLX4_CHANGE_PORT_MTU_CAP = 22, +}; + +#define CX3_PPF_DEV_ID 0x1003 +static int vl_cap_start(struct mlx4_dev *dev) +{ + /* for non CX3 devices, start with 4 VLs to avoid errors in syslog */ + if (dev->pdev->device != CX3_PPF_DEV_ID) + return 4; + return 8; +} + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz) { struct mlx4_cmd_mailbox *mailbox; - int err; + int err, vl_cap, pkey_tbl_flag = 0; + u32 in_mod; - if (dev->caps.port_type[port] != MLX4_PORT_TYPE_IB) + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_NONE) return 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -342,13 +751,295 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) memset(mailbox->buf, 0, 256); - if (mlx4_ib_set_4k_mtu) - ((__be32 *) mailbox->buf)[0] |= cpu_to_be32((1 << 22) | (1 << 21) | (5 << 12) | (2 << 4)); + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + } else { + ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; - ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; - err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) { + pkey_tbl_flag = 1; + ((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz); + } + + /* IB VL CAP enum isn't used by the firmware, just numerical values */ + for (vl_cap = vl_cap_start(dev); vl_cap >= 1; vl_cap >>= 1) { + ((__be32 *) mailbox->buf)[0] = cpu_to_be32( + (1 << MLX4_CHANGE_PORT_MTU_CAP) | + (1 << MLX4_CHANGE_PORT_VL_CAP) | + (pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) | + (dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) | + (vl_cap << MLX4_SET_PORT_VL_CAP)); + err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + if (err != -ENOMEM) + break; + } + } mlx4_free_cmd_mailbox(dev, mailbox); return err; } + +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + int err; + u32 in_mod; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof *context); + + context->flags = SET_PORT_GEN_ALL_VALID; + context->mtu = cpu_to_be16(mtu); + context->pptx = (pptx * (!pfctx)) << 7; + context->pfctx = pfctx; + context->pprx = (pprx * (!pfcrx)) << 7; + context->pfcrx = pfcrx; + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_general); + +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_rqp_calc_context *context; + int err; + u32 in_mod; + u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ? + MCAST_DIRECT : MCAST_DEFAULT; +/* + if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) + return 0; +*/ + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof *context); + + context->base_qpn = cpu_to_be32(base_qpn); + /* + * This assignment breaks vlan support - I don't know why. Probablya an A0 issue - shahar Klein + * context->n_mac = dev->caps.log_num_macs; + */ + context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | + base_qpn); + context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT | + base_qpn); + context->intra_no_vlan = 0; + context->no_vlan = MLX4_NO_VLAN_IDX; + context->intra_vlan_miss = 0; + context->vlan_miss = MLX4_VLAN_MISS_IDX; + + in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc); + +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_prio2tc_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof *context); + + for (i = 0; i < MLX4_NUM_UP; i += 2) + context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1]; + + in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC); + +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_scheduler_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof *context); + + for (i = 0; i < MLX4_NUM_TC; i++) { + struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i]; + u16 r; + if (ratelimit && ratelimit[i]) { + if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) { + r = ratelimit[i]; + tc->max_bw_units = + htons(MLX4_RATELIMIT_100M_UNITS); + } else { + r = ratelimit[i]/10; + tc->max_bw_units = + htons(MLX4_RATELIMIT_1G_UNITS); + } + tc->max_bw_value = htons(r); + } else { + tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT); + tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS); + } + + tc->pg = htons(pg[i]); + tc->bw_precentage = htons(tc_tx_bw[i]); + } + + in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER); + +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + + return err; +} + +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, + u64 mac, u64 clear, u8 mode) +{ + return mlx4_cmd(dev, (mac | (clear << 63)), port, mode, + MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} +EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR); + +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + + return err; +} + +int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, + u32 in_mod, struct mlx4_cmd_mailbox *outbox) +{ + return mlx4_cmd_box(dev, 0, outbox->dma, in_mod, 0, + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); +} + +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + if (slave != dev->caps.function) + return 0; + return mlx4_common_dump_eth_stats(dev, slave, + vhcr->in_modifier, outbox); +} + +void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap) +{ + if (!mlx4_is_mfunc(dev)) { + *stats_bitmap = 0; + return; + } + + *stats_bitmap = (MLX4_STATS_TRAFFIC_COUNTERS_MASK | + MLX4_STATS_TRAFFIC_DROPS_MASK | + MLX4_STATS_PORT_COUNTERS_MASK | + MLX4_STATS_IF_RX_ERRORS_COUNTERS_MASK); + + if (mlx4_is_master(dev)) + *stats_bitmap |= MLX4_STATS_ERROR_COUNTERS_MASK; +} +EXPORT_SYMBOL(mlx4_set_stats_bitmap); + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, found_ix = -1; + int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + + if (!mlx4_is_mfunc(dev)) + return -EINVAL; + + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (!memcmp(priv->roce_gids[port - 1][i].raw, gid, 16)) { + found_ix = i; + break; + } + } + + if (found_ix >= 0) { + if (found_ix < MLX4_ROCE_PF_GIDS) + *slave_id = 0; + else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % dev->num_vfs) * + (vf_gids / dev->num_vfs + 1)) + *slave_id = ((found_ix - MLX4_ROCE_PF_GIDS) / + (vf_gids / dev->num_vfs + 1)) + 1; + else + *slave_id = + ((found_ix - MLX4_ROCE_PF_GIDS - + ((vf_gids % dev->num_vfs) * ((vf_gids / dev->num_vfs + 1)))) / + (vf_gids / dev->num_vfs)) + vf_gids % dev->num_vfs + 1; + } + + return (found_ix >= 0) ? 0 : -EINVAL; +} +EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid); + +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev)) + return -EINVAL; + + memcpy(gid, priv->roce_gids[port - 1][slave_id].raw, 16); + return 0; +} +EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave); + diff --git a/sys/ofed/drivers/net/mlx4/profile.c b/sys/ofed/drivers/net/mlx4/profile.c index bd22df95adf9..d3042f012146 100644 --- a/sys/ofed/drivers/net/mlx4/profile.c +++ b/sys/ofed/drivers/net/mlx4/profile.c @@ -32,7 +32,7 @@ * SOFTWARE. */ -#include +#include #include "mlx4.h" #include "fw.h" @@ -76,7 +76,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, u64 size; u64 start; int type; - int num; + u32 num; int log_num; }; @@ -85,7 +85,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_resource tmp; int i, j; - profile = kzalloc(MLX4_RES_NUM * sizeof *profile, GFP_KERNEL); + profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL); if (!profile) return -ENOMEM; @@ -98,8 +98,8 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_EQ].size = dev_cap->eqc_entry_sz; profile[MLX4_RES_DMPT].size = dev_cap->dmpt_entry_sz; profile[MLX4_RES_CMPT].size = dev_cap->cmpt_entry_sz; - profile[MLX4_RES_MTT].size = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; - profile[MLX4_RES_MCG].size = MLX4_MGM_ENTRY_SIZE; + profile[MLX4_RES_MTT].size = dev_cap->mtt_entry_sz; + profile[MLX4_RES_MCG].size = mlx4_get_mgm_entry_size(dev); profile[MLX4_RES_QP].num = request->num_qp; profile[MLX4_RES_RDMARC].num = request->num_qp * request->rdmarc_per_qp; @@ -107,12 +107,12 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, - dev_cap->reserved_eqs + - num_possible_cpus() + 1); + profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? + dev->phys_caps.num_phys_eqs : + min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; - profile[MLX4_RES_MTT].num = request->num_mtt; + profile[MLX4_RES_MTT].num = request->num_mtt * (1 << log_mtts_per_seg); profile[MLX4_RES_MCG].num = request->num_mcg; for (i = 0; i < MLX4_RES_NUM; ++i) { @@ -198,9 +198,10 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, init_hca->log_num_cqs = profile[i].log_num; break; case MLX4_RES_EQ: - dev->caps.num_eqs = profile[i].num; + dev->caps.num_eqs = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs, + MAX_MSIX)); init_hca->eqc_base = profile[i].start; - init_hca->log_num_eqs = profile[i].log_num; + init_hca->log_num_eqs = ilog2(dev->caps.num_eqs); break; case MLX4_RES_DMPT: dev->caps.num_mpts = profile[i].num; @@ -212,17 +213,24 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, init_hca->cmpt_base = profile[i].start; break; case MLX4_RES_MTT: - dev->caps.num_mtt_segs = profile[i].num; + dev->caps.num_mtts = profile[i].num; priv->mr_table.mtt_base = profile[i].start; init_hca->mtt_base = profile[i].start; break; case MLX4_RES_MCG: - dev->caps.num_mgms = profile[i].num >> 1; - dev->caps.num_amgms = profile[i].num >> 1; init_hca->mc_base = profile[i].start; - init_hca->log_mc_entry_sz = ilog2(MLX4_MGM_ENTRY_SIZE); + init_hca->log_mc_entry_sz = + ilog2(mlx4_get_mgm_entry_size(dev)); init_hca->log_mc_table_sz = profile[i].log_num; - init_hca->log_mc_hash_sz = profile[i].log_num - 1; + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + dev->caps.num_mgms = profile[i].num; + } else { + init_hca->log_mc_hash_sz = + profile[i].log_num - 1; + dev->caps.num_mgms = profile[i].num >> 1; + dev->caps.num_amgms = profile[i].num >> 1; + } break; default: break; diff --git a/sys/ofed/drivers/net/mlx4/qp.c b/sys/ofed/drivers/net/mlx4/qp.c index bf1c117db4de..2386adc7263c 100644 --- a/sys/ofed/drivers/net/mlx4/qp.c +++ b/sys/ofed/drivers/net/mlx4/qp.c @@ -41,6 +41,12 @@ #include "mlx4.h" #include "icm.h" +/* + * QP to support BF should have bits 6,7 cleared + */ +#define MLX4_BF_QP_SKIP_MASK 0xc0 +#define MLX4_MAX_BF_QP_RANGE 0x40 + void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; @@ -55,7 +61,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) spin_unlock(&qp_table->lock); if (!qp) { - mlx4_warn(dev, "Async event for bogus QP %08x\n", qpn); + mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn); return; } @@ -65,10 +71,25 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) complete(&qp->free); } -int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, - struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, - int sqd_event, struct mlx4_qp *qp) +/* used for INIT/CLOSE port logic */ +static int is_master_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0) +{ + /* this procedure is called after we already know we are on the master */ + /* qp0 is either the proxy qp0, or the real qp0 */ + u32 pf_proxy_offset = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev); + *proxy_qp0 = qp->qpn >= pf_proxy_offset && qp->qpn <= pf_proxy_offset + 1; + + *real_qp0 = qp->qpn >= dev->phys_caps.base_sqpn && + qp->qpn <= dev->phys_caps.base_sqpn + 1; + + return *real_qp0 || *proxy_qp0; +} + +static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, + enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp, int native) { static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = { [MLX4_QP_STATE_RST] = { @@ -110,16 +131,31 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, } }; + struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; int ret = 0; + int real_qp0 = 0; + int proxy_qp0 = 0; + u8 port; if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE || !op[cur_state][new_state]) return -EINVAL; - if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) - return mlx4_cmd(dev, 0, qp->qpn, 2, - MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A); + if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) { + ret = mlx4_cmd(dev, 0, qp->qpn, 2, + MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native); + if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } + return ret; + } mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -138,41 +174,197 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, ((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn = cpu_to_be32(qp->qpn); - ret = mlx4_cmd(dev, mailbox->dma, qp->qpn | (!!sqd_event << 31), + ret = mlx4_cmd(dev, mailbox->dma, + qp->qpn | (!!sqd_event << 31), new_state == MLX4_QP_STATE_RST ? 2 : 0, - op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C); + op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native); + + if (mlx4_is_master(dev) && is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + new_state == MLX4_QP_STATE_ERR) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } else if (new_state == MLX4_QP_STATE_RTR) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1; + else + priv->mfunc.master.qp0_state[port].qp0_active = 1; + } + } mlx4_free_cmd_mailbox(dev, mailbox); return ret; } + +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, + enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp) +{ + return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context, + optpar, sqd_event, qp, 0); +} EXPORT_SYMBOL_GPL(mlx4_qp_modify); -int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base) +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 bf_qp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; - int qpn; - qpn = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align); - if (qpn == -1) + if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp) + return -ENOMEM; + + *base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align, + bf_qp ? MLX4_BF_QP_SKIP_MASK : 0); + if (*base == -1) return -ENOMEM; - *base = qpn; return 0; } + +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 bf_qp) +{ + u64 in_param = 0; + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, (((!!bf_qp) << 31) | (u32)cnt)); + set_param_h(&in_param, align); + err = mlx4_cmd_imm(dev, in_param, &out_param, + RES_QP, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + + *base = get_param_l(&out_param); + return 0; + } + return __mlx4_qp_reserve_range(dev, cnt, align, base, bf_qp); +} EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range); -void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; - if (base_qpn < dev->caps.sqp_start + 8) - return; + if (mlx4_is_qp_reserved(dev, (u32) base_qpn)) + return; mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt); } + +void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, base_qpn); + set_param_h(&in_param, cnt); + err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) { + mlx4_warn(dev, "Failed to release qp range" + " base:%d cnt:%d\n", base_qpn, cnt); + } + } else + __mlx4_qp_release_range(dev, base_qpn, cnt); +} EXPORT_SYMBOL_GPL(mlx4_qp_release_range); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + int err; + + err = mlx4_table_get(dev, &qp_table->qp_table, qpn); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn); + if (err) + goto err_put_qp; + + err = mlx4_table_get(dev, &qp_table->altc_table, qpn); + if (err) + goto err_put_auxc; + + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn); + if (err) + goto err_put_altc; + + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn); + if (err) + goto err_put_rdmarc; + + return 0; + +err_put_rdmarc: + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn); + +err_put_altc: + mlx4_table_put(dev, &qp_table->altc_table, qpn); + +err_put_auxc: + mlx4_table_put(dev, &qp_table->auxc_table, qpn); + +err_put_qp: + mlx4_table_put(dev, &qp_table->qp_table, qpn); + +err_out: + return err; +} + +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +{ + u64 param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, qpn); + return mlx4_cmd_imm(dev, param, ¶m, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + } + return __mlx4_qp_alloc_icm(dev, qpn); +} + +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + mlx4_table_put(dev, &qp_table->cmpt_table, qpn); + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn); + mlx4_table_put(dev, &qp_table->altc_table, qpn); + mlx4_table_put(dev, &qp_table->auxc_table, qpn); + mlx4_table_put(dev, &qp_table->qp_table, qpn); +} + +static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, qpn); + if (mlx4_cmd(dev, in_param, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn); + } else + __mlx4_qp_free_icm(dev, qpn); +} + int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -184,70 +376,29 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) qp->qpn = qpn; - err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn); + err = mlx4_qp_alloc_icm(dev, qpn); if (err) - goto err_out; - - err = mlx4_table_get(dev, &qp_table->auxc_table, qp->qpn); - if (err) - goto err_put_qp; - - err = mlx4_table_get(dev, &qp_table->altc_table, qp->qpn); - if (err) - goto err_put_auxc; - - err = mlx4_table_get(dev, &qp_table->rdmarc_table, qp->qpn); - if (err) - goto err_put_altc; - - err = mlx4_table_get(dev, &qp_table->cmpt_table, qp->qpn); - if (err) - goto err_put_rdmarc; + return err; spin_lock_irq(&qp_table->lock); - err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1), qp); + err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & + (dev->caps.num_qps - 1), qp); spin_unlock_irq(&qp_table->lock); if (err) - goto err_put_cmpt; + goto err_icm; atomic_set(&qp->refcount, 1); init_completion(&qp->free); return 0; -err_put_cmpt: - mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn); - -err_put_rdmarc: - mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn); - -err_put_altc: - mlx4_table_put(dev, &qp_table->altc_table, qp->qpn); - -err_put_auxc: - mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn); - -err_put_qp: - mlx4_table_put(dev, &qp_table->qp_table, qp->qpn); - -err_out: +err_icm: + mlx4_qp_free_icm(dev, qpn); return err; } + EXPORT_SYMBOL_GPL(mlx4_qp_alloc); -struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn) -{ - struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; - unsigned long flags; - struct mlx4_qp *qp; - - spin_lock_irqsave(&qp_table->lock, flags); - qp = radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1)); - spin_unlock_irqrestore(&qp_table->lock, flags); - return qp; -} -EXPORT_SYMBOL_GPL(mlx4_qp_lookup_lock); - void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; @@ -261,25 +412,18 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) { - struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; - if (atomic_dec_and_test(&qp->refcount)) complete(&qp->free); wait_for_completion(&qp->free); - mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn); - mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn); - mlx4_table_put(dev, &qp_table->altc_table, qp->qpn); - mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn); - mlx4_table_put(dev, &qp_table->qp_table, qp->qpn); + mlx4_qp_free_icm(dev, qp->qpn); } EXPORT_SYMBOL_GPL(mlx4_qp_free); static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn) { - return mlx4_cmd(dev, 0, base_qpn, - (dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY) ? 4 : 0, - MLX4_CMD_CONF_SPECIAL_QP, MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } int mlx4_init_qp_table(struct mlx4_dev *dev) @@ -287,18 +431,23 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; int err; int reserved_from_top = 0; + int reserved_from_bot; + int k; spin_lock_init(&qp_table->lock); INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; /* * We reserve 2 extra QPs per port for the special QPs. The * block of special QPs must be aligned to a multiple of 8, so * round up. + * * We also reserve the MSB of the 24-bit QP number to indicate - * an XRC qp. + * that a QP is an XRC QP. */ - dev->caps.sqp_start = + dev->phys_caps.base_sqpn = ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8); { @@ -329,34 +478,82 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) } + /* Reserve 8 real SQPs in both native and SRIOV modes. + * In addition, in SRIOV mode, reserve 8 proxy SQPs per function + * (for all PFs and VFs), and 8 corresponding tunnel QPs. + * Each proxy SQP works opposite its own tunnel QP. + * + * The QPs are arranged as follows: + * a. 8 real SQPs + * b. All the proxy SQPs (8 per function) + * c. All the tunnel QPs (8 per function) + */ + reserved_from_bot = mlx4_num_reserved_sqps(dev); + if (reserved_from_bot + reserved_from_top > dev->caps.num_qps) { + mlx4_err(dev, "Number of reserved QPs is higher than number " + "of QPs, increase the value of log_num_qp\n"); + return -EINVAL; + } + err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps, - (1 << 23) - 1, dev->caps.sqp_start + 8, + (1 << 23) - 1, reserved_from_bot, reserved_from_top); if (err) return err; - return mlx4_CONF_SPECIAL_QP(dev, dev->caps.sqp_start); + if (mlx4_is_mfunc(dev)) { + /* for PPF use */ + dev->phys_caps.base_proxy_sqpn = dev->phys_caps.base_sqpn + 8; + dev->phys_caps.base_tunnel_sqpn = dev->phys_caps.base_sqpn + 8 + 8 * MLX4_MFUNC_MAX; + + /* In mfunc, calculate proxy and tunnel qp offsets for the PF here, + * since the PF does not call mlx4_slave_caps */ + dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + + if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + err = -ENOMEM; + goto err_mem; + } + + for (k = 0; k < dev->caps.num_ports; k++) { + dev->caps.qp0_proxy[k] = dev->phys_caps.base_proxy_sqpn + + 8 * mlx4_master_func_num(dev) + k; + dev->caps.qp0_tunnel[k] = dev->caps.qp0_proxy[k] + 8 * MLX4_MFUNC_MAX; + dev->caps.qp1_proxy[k] = dev->phys_caps.base_proxy_sqpn + + 8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k; + dev->caps.qp1_tunnel[k] = dev->caps.qp1_proxy[k] + 8 * MLX4_MFUNC_MAX; + } + } + + + err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn); + if (err) + goto err_mem; + return 0; + +err_mem: + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + dev->caps.qp0_tunnel = dev->caps.qp0_proxy = + dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + return err; } void mlx4_cleanup_qp_table(struct mlx4_dev *dev) { + if (mlx4_is_slave(dev)) + return; + mlx4_CONF_SPECIAL_QP(dev, 0); mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap); } -int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region, - int *base_qpn, int *cnt) -{ - if ((region < 0) || (region >= MLX4_NUM_QP_REGION)) - return -EINVAL; - - *base_qpn = dev->caps.reserved_qps_base[region]; - *cnt = dev->caps.reserved_qps_cnt[region]; - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_qp_get_region); - int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, struct mlx4_qp_context *context) { @@ -368,7 +565,8 @@ int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, return PTR_ERR(mailbox); err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0, - MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); if (!err) memcpy(context, mailbox->buf + 8, sizeof *context); diff --git a/sys/ofed/drivers/net/mlx4/reset.c b/sys/ofed/drivers/net/mlx4/reset.c index 3951b884c0fb..d8d796a9ca63 100644 --- a/sys/ofed/drivers/net/mlx4/reset.c +++ b/sys/ofed/drivers/net/mlx4/reset.c @@ -121,7 +121,7 @@ int mlx4_reset(struct mlx4_dev *dev) iounmap(reset); /* Docs say to wait one second before accessing device */ - msleep(1000); + msleep(2000); end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES; do { @@ -139,11 +139,12 @@ int mlx4_reset(struct mlx4_dev *dev) goto out; } + /* Now restore the PCI headers */ if (pcie_cap) { devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4]; if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_DEVCTL, - devctl)) { + devctl)) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA PCI Express " "Device Control register, aborting.\n"); @@ -151,7 +152,7 @@ int mlx4_reset(struct mlx4_dev *dev) } linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4]; if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_LNKCTL, - linkctl)) { + linkctl)) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA PCI Express " "Link control register, aborting.\n"); diff --git a/sys/ofed/drivers/net/mlx4/resource_tracker.c b/sys/ofed/drivers/net/mlx4/resource_tracker.c new file mode 100644 index 000000000000..aa101cdd6e99 --- /dev/null +++ b/sys/ofed/drivers/net/mlx4/resource_tracker.c @@ -0,0 +1,4315 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. + * All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4.h" +#include "fw.h" + +#define MLX4_MAC_VALID (1ull << 63) + +struct mac_res { + struct list_head list; + u64 mac; + int ref_count; + u8 smac_index; + u8 port; +}; + +struct vlan_res { + struct list_head list; + u16 vlan; + int ref_count; + int vlan_index; + u8 port; +}; + +struct res_common { + struct list_head list; + struct rb_node node; + u64 res_id; + int owner; + int state; + int from_state; + int to_state; + int removing; +}; + +enum { + RES_ANY_BUSY = 1 +}; + +struct res_gid { + struct list_head list; + u8 gid[16]; + enum mlx4_protocol prot; + enum mlx4_steer_type steer; +}; + +enum res_qp_states { + RES_QP_BUSY = RES_ANY_BUSY, + + /* QP number was allocated */ + RES_QP_RESERVED, + + /* ICM memory for QP context was mapped */ + RES_QP_MAPPED, + + /* QP is in hw ownership */ + RES_QP_HW +}; + +struct res_qp { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *rcq; + struct res_cq *scq; + struct res_srq *srq; + struct list_head mcg_list; + spinlock_t mcg_spl; + int local_qpn; +}; + +enum res_mtt_states { + RES_MTT_BUSY = RES_ANY_BUSY, + RES_MTT_ALLOCATED, +}; + +static inline const char *mtt_states_str(enum res_mtt_states state) +{ + switch (state) { + case RES_MTT_BUSY: return "RES_MTT_BUSY"; + case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED"; + default: return "Unknown"; + } +} + +struct res_mtt { + struct res_common com; + int order; + atomic_t ref_count; +}; + +enum res_mpt_states { + RES_MPT_BUSY = RES_ANY_BUSY, + RES_MPT_RESERVED, + RES_MPT_MAPPED, + RES_MPT_HW, +}; + +struct res_mpt { + struct res_common com; + struct res_mtt *mtt; + int key; +}; + +enum res_eq_states { + RES_EQ_BUSY = RES_ANY_BUSY, + RES_EQ_RESERVED, + RES_EQ_HW, +}; + +struct res_eq { + struct res_common com; + struct res_mtt *mtt; +}; + +enum res_cq_states { + RES_CQ_BUSY = RES_ANY_BUSY, + RES_CQ_ALLOCATED, + RES_CQ_HW, +}; + +struct res_cq { + struct res_common com; + struct res_mtt *mtt; + atomic_t ref_count; +}; + +enum res_srq_states { + RES_SRQ_BUSY = RES_ANY_BUSY, + RES_SRQ_ALLOCATED, + RES_SRQ_HW, +}; + +struct res_srq { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *cq; + atomic_t ref_count; +}; + +enum res_counter_states { + RES_COUNTER_BUSY = RES_ANY_BUSY, + RES_COUNTER_ALLOCATED, +}; + +struct res_counter { + struct res_common com; + int port; +}; + +enum res_xrcdn_states { + RES_XRCD_BUSY = RES_ANY_BUSY, + RES_XRCD_ALLOCATED, +}; + +struct res_xrcdn { + struct res_common com; + int port; +}; + +enum res_fs_rule_states { + RES_FS_RULE_BUSY = RES_ANY_BUSY, + RES_FS_RULE_ALLOCATED, +}; + +struct res_fs_rule { + struct res_common com; +}; + +static int mlx4_is_eth(struct mlx4_dev *dev, int port) +{ + return dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB ? 0 : 1; +} + +static void *res_tracker_lookup(struct rb_root *root, u64 res_id) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct res_common *res = container_of(node, struct res_common, + node); + + if (res_id < res->res_id) + node = node->rb_left; + else if (res_id > res->res_id) + node = node->rb_right; + else + return res; + } + return NULL; +} + +static int res_tracker_insert(struct rb_root *root, struct res_common *res) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct res_common *this = container_of(*new, struct res_common, + node); + + parent = *new; + if (res->res_id < this->res_id) + new = &((*new)->rb_left); + else if (res->res_id > this->res_id) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&res->node, parent, new); + rb_insert_color(&res->node, root); + + return 0; +} + +enum qp_transition { + QP_TRANS_INIT2RTR, + QP_TRANS_RTR2RTS, + QP_TRANS_RTS2RTS, + QP_TRANS_SQERR2RTS, + QP_TRANS_SQD2SQD, + QP_TRANS_SQD2RTS +}; + +/* For Debug uses */ +static const char *ResourceType(enum mlx4_resource rt) +{ + switch (rt) { + case RES_QP: return "RES_QP"; + case RES_CQ: return "RES_CQ"; + case RES_SRQ: return "RES_SRQ"; + case RES_MPT: return "RES_MPT"; + case RES_MTT: return "RES_MTT"; + case RES_MAC: return "RES_MAC"; + case RES_VLAN: return "RES_VLAN"; + case RES_EQ: return "RES_EQ"; + case RES_COUNTER: return "RES_COUNTER"; + case RES_FS_RULE: return "RES_FS_RULE"; + case RES_XRCD: return "RES_XRCD"; + default: return "Unknown resource type !!!"; + }; +} + +static void rem_slave_vlans(struct mlx4_dev *dev, int slave); +static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, + enum mlx4_resource res_type, int count, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[res_type]; + int err = -EINVAL; + int allocated, free, reserved, guaranteed, from_free; + + spin_lock(&res_alloc->alloc_lock); + allocated = (port > 0) ? + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[slave]; + free = (port > 0) ? res_alloc->res_port_free[port - 1] : + res_alloc->res_free; + reserved = (port > 0) ? res_alloc->res_port_rsvd[port - 1] : + res_alloc->res_reserved; + guaranteed = res_alloc->guaranteed[slave]; + + if (allocated + count > res_alloc->quota[slave]) + goto out; + + if (allocated + count <= guaranteed) { + err = 0; + } else { + /* portion may need to be obtained from free area */ + if (guaranteed - allocated > 0) + from_free = count - (guaranteed - allocated); + else + from_free = count; + + if (free - from_free > reserved) + err = 0; + } + + if (!err) { + /* grant the request */ + if (port > 0) { + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count; + res_alloc->res_port_free[port - 1] -= count; + } else { + res_alloc->allocated[slave] += count; + res_alloc->res_free -= count; + } + } + +out: + spin_unlock(&res_alloc->alloc_lock); + return err; + +} + +static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, + enum mlx4_resource res_type, int count, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[res_type]; + + spin_lock(&res_alloc->alloc_lock); + if (port > 0) { + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count; + res_alloc->res_port_free[port - 1] += count; + } else { + res_alloc->allocated[slave] -= count; + res_alloc->res_free += count; + } + + spin_unlock(&res_alloc->alloc_lock); + return; +} + +static inline void initialize_res_quotas(struct mlx4_dev *dev, + struct resource_allocator *res_alloc, + enum mlx4_resource res_type, + int vf, int num_instances) +{ + res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1)); + res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf]; + if (vf == mlx4_master_func_num(dev)) { + res_alloc->res_free = num_instances; + if (res_type == RES_MTT) { + /* reserved mtts will be taken out of the PF allocation */ + res_alloc->res_free += dev->caps.reserved_mtts; + res_alloc->guaranteed[vf] += dev->caps.reserved_mtts; + res_alloc->quota[vf] += dev->caps.reserved_mtts; + } + } +} + +void mlx4_init_quotas(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int pf; + + /* quotas for VFs are initialized in mlx4_slave_cap */ + if (mlx4_is_slave(dev)) + return; + + if (!mlx4_is_mfunc(dev)) { + dev->quotas.qp = dev->caps.num_qps - dev->caps.reserved_qps - + mlx4_num_reserved_sqps(dev); + dev->quotas.cq = dev->caps.num_cqs - dev->caps.reserved_cqs; + dev->quotas.srq = dev->caps.num_srqs - dev->caps.reserved_srqs; + dev->quotas.mtt = dev->caps.num_mtts - dev->caps.reserved_mtts; + dev->quotas.mpt = dev->caps.num_mpts - dev->caps.reserved_mrws; + return; + } + + pf = mlx4_master_func_num(dev); + dev->quotas.qp = + priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[pf]; + dev->quotas.cq = + priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[pf]; + dev->quotas.srq = + priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[pf]; + dev->quotas.mtt = + priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[pf]; + dev->quotas.mpt = + priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf]; +} +int mlx4_init_resource_tracker(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, j; + int t; + + priv->mfunc.master.res_tracker.slave_list = + kzalloc(dev->num_slaves * sizeof(struct slave_list), + GFP_KERNEL); + if (!priv->mfunc.master.res_tracker.slave_list) + return -ENOMEM; + + for (i = 0 ; i < dev->num_slaves; i++) { + for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t) + INIT_LIST_HEAD(&priv->mfunc.master.res_tracker. + slave_list[i].res_list[t]); + mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n", + dev->num_slaves); + for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) + priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT; + + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[i]; + res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + if (i == RES_MAC || i == RES_VLAN) + res_alloc->allocated = kzalloc(MLX4_MAX_PORTS * + (dev->num_vfs + 1) * sizeof(int), + GFP_KERNEL); + else + res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + + if (!res_alloc->quota || !res_alloc->guaranteed || + !res_alloc->allocated) + goto no_mem_err; + + spin_lock_init(&res_alloc->alloc_lock); + for (t = 0; t < dev->num_vfs + 1; t++) { + switch (i) { + case RES_QP: + initialize_res_quotas(dev, res_alloc, RES_QP, + t, dev->caps.num_qps - + dev->caps.reserved_qps - + mlx4_num_reserved_sqps(dev)); + break; + case RES_CQ: + initialize_res_quotas(dev, res_alloc, RES_CQ, + t, dev->caps.num_cqs - + dev->caps.reserved_cqs); + break; + case RES_SRQ: + initialize_res_quotas(dev, res_alloc, RES_SRQ, + t, dev->caps.num_srqs - + dev->caps.reserved_srqs); + break; + case RES_MPT: + initialize_res_quotas(dev, res_alloc, RES_MPT, + t, dev->caps.num_mpts - + dev->caps.reserved_mrws); + break; + case RES_MTT: + initialize_res_quotas(dev, res_alloc, RES_MTT, + t, dev->caps.num_mtts - + dev->caps.reserved_mtts); + break; + case RES_MAC: + if (t == mlx4_master_func_num(dev)) { + res_alloc->quota[t] = + MLX4_MAX_MAC_NUM - 2 * dev->num_vfs; + res_alloc->guaranteed[t] = res_alloc->quota[t]; + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_free[j] = MLX4_MAX_MAC_NUM; + } else { + res_alloc->quota[t] = 2; + res_alloc->guaranteed[t] = 2; + } + break; + case RES_VLAN: + if (t == mlx4_master_func_num(dev)) { + res_alloc->quota[t] = MLX4_MAX_VLAN_NUM; + res_alloc->guaranteed[t] = MLX4_MAX_VLAN_NUM / 2; + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_free[j] = + res_alloc->quota[t]; + } else { + res_alloc->quota[t] = MLX4_MAX_VLAN_NUM / 2; + res_alloc->guaranteed[t] = 0; + } + break; + case RES_COUNTER: + res_alloc->quota[t] = dev->caps.max_counters; + res_alloc->guaranteed[t] = 0; + if (t == mlx4_master_func_num(dev)) + res_alloc->res_free = res_alloc->quota[t]; + break; + default: + break; + } + if (i == RES_MAC || i == RES_VLAN) { + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_rsvd[j] += + res_alloc->guaranteed[t]; + } else { + res_alloc->res_reserved += res_alloc->guaranteed[t]; + } + } + } + spin_lock_init(&priv->mfunc.master.res_tracker.lock); + return 0; + +no_mem_err: + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated); + priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed); + priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota); + priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL; + } + return -ENOMEM; +} + +void mlx4_free_resource_tracker(struct mlx4_dev *dev, + enum mlx4_res_tracker_free_type type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + if (priv->mfunc.master.res_tracker.slave_list) { + if (type != RES_TR_FREE_STRUCTS_ONLY) { + for (i = 0; i < dev->num_slaves; i++) { + if (type == RES_TR_FREE_ALL || + dev->caps.function != i) + mlx4_delete_all_resources_for_slave(dev, i); + } + /* free master's vlans */ + i = dev->caps.function; + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + rem_slave_vlans(dev, i); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + if (type != RES_TR_FREE_SLAVES_ONLY) { + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated); + priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed); + priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota); + priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL; + } + kfree(priv->mfunc.master.res_tracker.slave_list); + priv->mfunc.master.res_tracker.slave_list = NULL; + } + } +} + +static void update_pkey_index(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox) +{ + u8 sched = *(u8 *)(inbox->buf + 64); + u8 orig_index = *(u8 *)(inbox->buf + 35); + u8 new_index; + struct mlx4_priv *priv = mlx4_priv(dev); + int port; + + port = (sched >> 6 & 1) + 1; + + new_index = priv->virt2phys_pkey[slave][port - 1][orig_index]; + *(u8 *)(inbox->buf + 35) = new_index; +} + +static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, + u8 slave) +{ + struct mlx4_qp_context *qp_ctx = inbox->buf + 8; + enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *) inbox->buf); + u32 ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + int port; + + if (MLX4_QP_ST_UD == ts) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) + qp_ctx->pri_path.mgid_index = mlx4_get_base_gid_ix(dev, slave) | 0x80; + else + qp_ctx->pri_path.mgid_index = 0x80 | slave; + + } else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_UC == ts) { + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->pri_path.mgid_index += mlx4_get_base_gid_ix(dev, slave); + qp_ctx->pri_path.mgid_index &= 0x7f; + } else { + qp_ctx->pri_path.mgid_index = slave & 0x7F; + } + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->alt_path.mgid_index += mlx4_get_base_gid_ix(dev, slave); + qp_ctx->alt_path.mgid_index &= 0x7f; + } else { + qp_ctx->alt_path.mgid_index = slave & 0x7F; + } + } + } +} + +static int update_vport_qp_param(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *inbox, + u8 slave) +{ + struct mlx4_qp_context *qpc = inbox->buf + 8; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_priv *priv; + u32 qp_type; + int port; + + port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1; + priv = mlx4_priv(dev); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + + if (MLX4_VGT != vp_oper->state.default_vlan) { + qp_type = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + if (MLX4_QP_ST_RC == qp_type) + return -EINVAL; + + qpc->srqn |= cpu_to_be32(1 << 25); /*set cqe vlan mask */ + qpc->pri_path.vlan_index = vp_oper->vlan_idx; + qpc->pri_path.fl = 1 << 6; /* set cv bit*/ + qpc->pri_path.feup |= 1 << 3; /* set fvl bit */ + qpc->pri_path.sched_queue &= 0xC7; + qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3; + mlx4_dbg(dev, "qp %d port %d Q 0x%x set vlan to %d vidx %d feup %x fl %x\n", + be32_to_cpu(qpc->local_qpn) & 0xffffff, port, + (int)(qpc->pri_path.sched_queue), vp_oper->state.default_vlan, + vp_oper->vlan_idx, (int)(qpc->pri_path.feup), + (int)(qpc->pri_path.fl)); + } + if (vp_oper->state.spoofchk) { + qpc->pri_path.feup |= 1 << 5; /* set fsm bit */; + qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx; + mlx4_dbg(dev, "spoof qp %d port %d feup 0x%x, myLmc 0x%x mindx %d\n", + be32_to_cpu(qpc->local_qpn) & 0xffffff, port, + (int)qpc->pri_path.feup, (int)qpc->pri_path.grh_mylmc, + vp_oper->mac_idx); + } + return 0; +} + +static int mpt_mask(struct mlx4_dev *dev) +{ + return dev->caps.num_mpts - 1; +} + +static void *find_res(struct mlx4_dev *dev, int res_id, + enum mlx4_resource type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type], + res_id); +} + +static int get_res(struct mlx4_dev *dev, int slave, u64 res_id, + enum mlx4_resource type, + void *res) +{ + struct res_common *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + if (!r) { + err = -ENOENT; + goto exit; + } + + if (r->state == RES_ANY_BUSY) { + err = -EBUSY; + goto exit; + } + + if (r->owner != slave) { + err = -EPERM; + goto exit; + } + + r->from_state = r->state; + r->state = RES_ANY_BUSY; + + if (res) + *((struct res_common **)res) = r; + +exit: + spin_unlock_irq(mlx4_tlock(dev)); + return err; +} + +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource type, + u64 res_id, int *slave) +{ + + struct res_common *r; + int err = -ENOENT; + int id = res_id; + + if (type == RES_QP) + id &= 0x7fffff; + spin_lock(mlx4_tlock(dev)); + + r = find_res(dev, id, type); + if (r) { + *slave = r->owner; + err = 0; + } + spin_unlock(mlx4_tlock(dev)); + + return err; +} + +static void put_res(struct mlx4_dev *dev, int slave, u64 res_id, + enum mlx4_resource type) +{ + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + if (r) + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static struct res_common *alloc_qp_tr(int id) +{ + struct res_qp *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_QP_RESERVED; + ret->local_qpn = id; + INIT_LIST_HEAD(&ret->mcg_list); + spin_lock_init(&ret->mcg_spl); + + return &ret->com; +} + +static struct res_common *alloc_mtt_tr(int id, int order) +{ + struct res_mtt *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->order = order; + ret->com.state = RES_MTT_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_mpt_tr(int id, int key) +{ + struct res_mpt *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_MPT_RESERVED; + ret->key = key; + + return &ret->com; +} + +static struct res_common *alloc_eq_tr(int id) +{ + struct res_eq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_EQ_RESERVED; + + return &ret->com; +} + +static struct res_common *alloc_cq_tr(int id) +{ + struct res_cq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_CQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_srq_tr(int id) +{ + struct res_srq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_SRQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_counter_tr(int id) +{ + struct res_counter *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_COUNTER_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_xrcdn_tr(int id) +{ + struct res_xrcdn *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_XRCD_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_fs_rule_tr(u64 id) +{ + struct res_fs_rule *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_FS_RULE_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave, + int extra) +{ + struct res_common *ret; + + switch (type) { + case RES_QP: + ret = alloc_qp_tr(id); + break; + case RES_MPT: + ret = alloc_mpt_tr(id, extra); + break; + case RES_MTT: + ret = alloc_mtt_tr(id, extra); + break; + case RES_EQ: + ret = alloc_eq_tr(id); + break; + case RES_CQ: + ret = alloc_cq_tr(id); + break; + case RES_SRQ: + ret = alloc_srq_tr(id); + break; + case RES_MAC: + printk(KERN_ERR "implementation missing\n"); + return NULL; + case RES_COUNTER: + ret = alloc_counter_tr(id); + break; + case RES_XRCD: + ret = alloc_xrcdn_tr(id); + break; + case RES_FS_RULE: + ret = alloc_fs_rule_tr(id); + break; + default: + return NULL; + } + if (ret) + ret->owner = slave; + + return ret; +} + +static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, + enum mlx4_resource type, int extra) +{ + int i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct res_common **res_arr; + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct rb_root *root = &tracker->res_tree[type]; + + res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL); + if (!res_arr) + return -ENOMEM; + + for (i = 0; i < count; ++i) { + res_arr[i] = alloc_tr(base + i, type, slave, extra); + if (!res_arr[i]) { + for (--i; i >= 0; --i) + kfree(res_arr[i]); + + kfree(res_arr); + return -ENOMEM; + } + } + + spin_lock_irq(mlx4_tlock(dev)); + for (i = 0; i < count; ++i) { + if (find_res(dev, base + i, type)) { + err = -EEXIST; + goto undo; + } + err = res_tracker_insert(root, res_arr[i]); + if (err) + goto undo; + list_add_tail(&res_arr[i]->list, + &tracker->slave_list[slave].res_list[type]); + } + spin_unlock_irq(mlx4_tlock(dev)); + kfree(res_arr); + + return 0; + +undo: + for (--i; i >= base; --i) + rb_erase(&res_arr[i]->node, root); + + spin_unlock_irq(mlx4_tlock(dev)); + + for (i = 0; i < count; ++i) + kfree(res_arr[i]); + + kfree(res_arr); + + return err; +} + +static int remove_qp_ok(struct res_qp *res) +{ + if (res->com.state == RES_QP_BUSY) + return -EBUSY; + else if (res->com.state != RES_QP_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_mtt_ok(struct res_mtt *res, int order) +{ + if (res->com.state == RES_MTT_BUSY || + atomic_read(&res->ref_count)) { + printk(KERN_DEBUG "%s-%d: state %s, ref_count %d\n", + __func__, __LINE__, + mtt_states_str(res->com.state), + atomic_read(&res->ref_count)); + return -EBUSY; + } else if (res->com.state != RES_MTT_ALLOCATED) + return -EPERM; + else if (res->order != order) + return -EINVAL; + + return 0; +} + +static int remove_mpt_ok(struct res_mpt *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_eq_ok(struct res_eq *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_counter_ok(struct res_counter *res) +{ + if (res->com.state == RES_COUNTER_BUSY) + return -EBUSY; + else if (res->com.state != RES_COUNTER_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_xrcdn_ok(struct res_xrcdn *res) +{ + if (res->com.state == RES_XRCD_BUSY) + return -EBUSY; + else if (res->com.state != RES_XRCD_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_fs_rule_ok(struct res_fs_rule *res) +{ + if (res->com.state == RES_FS_RULE_BUSY) + return -EBUSY; + else if (res->com.state != RES_FS_RULE_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_cq_ok(struct res_cq *res) +{ + if (res->com.state == RES_CQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_CQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_srq_ok(struct res_srq *res) +{ + if (res->com.state == RES_SRQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_SRQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra) +{ + switch (type) { + case RES_QP: + return remove_qp_ok((struct res_qp *)res); + case RES_CQ: + return remove_cq_ok((struct res_cq *)res); + case RES_SRQ: + return remove_srq_ok((struct res_srq *)res); + case RES_MPT: + return remove_mpt_ok((struct res_mpt *)res); + case RES_MTT: + return remove_mtt_ok((struct res_mtt *)res, extra); + case RES_MAC: + return -ENOSYS; + case RES_EQ: + return remove_eq_ok((struct res_eq *)res); + case RES_COUNTER: + return remove_counter_ok((struct res_counter *)res); + case RES_XRCD: + return remove_xrcdn_ok((struct res_xrcdn *)res); + case RES_FS_RULE: + return remove_fs_rule_ok((struct res_fs_rule *)res); + default: + return -EINVAL; + } +} + +static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, + enum mlx4_resource type, int extra) +{ + u64 i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + for (i = base; i < base + count; ++i) { + r = res_tracker_lookup(&tracker->res_tree[type], i); + if (!r) { + err = -ENOENT; + goto out; + } + if (r->owner != slave) { + err = -EPERM; + goto out; + } + err = remove_ok(r, type, extra); + if (err) + goto out; + } + + for (i = base; i < base + count; ++i) { + r = res_tracker_lookup(&tracker->res_tree[type], i); + rb_erase(&r->node, &tracker->res_tree[type]); + list_del(&r->list); + kfree(r); + } + err = 0; + +out: + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn, + enum res_qp_states state, struct res_qp **qp, + int alloc) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_qp *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_QP], qpn); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_QP_BUSY: + mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n", + __func__, r->com.res_id); + err = -EBUSY; + break; + + case RES_QP_RESERVED: + if (r->com.state == RES_QP_MAPPED && !alloc) + break; + + mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id); + err = -EINVAL; + break; + + case RES_QP_MAPPED: + if ((r->com.state == RES_QP_RESERVED && alloc) || + r->com.state == RES_QP_HW) + break; + else { + mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", + r->com.res_id); + err = -EINVAL; + } + + break; + + case RES_QP_HW: + if (r->com.state != RES_QP_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_QP_BUSY; + if (qp) + *qp = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_mpt_states state, struct res_mpt **mpt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mpt *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_MPT], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_MPT_BUSY: + err = -EINVAL; + break; + + case RES_MPT_RESERVED: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + + case RES_MPT_MAPPED: + if (r->com.state != RES_MPT_RESERVED && + r->com.state != RES_MPT_HW) + err = -EINVAL; + break; + + case RES_MPT_HW: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_MPT_BUSY; + if (mpt) + *mpt = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_eq_states state, struct res_eq **eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_eq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_EQ], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_EQ_BUSY: + err = -EINVAL; + break; + + case RES_EQ_RESERVED: + if (r->com.state != RES_EQ_HW) + err = -EINVAL; + break; + + case RES_EQ_HW: + if (r->com.state != RES_EQ_RESERVED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_EQ_BUSY; + if (eq) + *eq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn, + enum res_cq_states state, struct res_cq **cq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_cq *r; + int err; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_CQ_BUSY: + err = -EBUSY; + break; + + case RES_CQ_ALLOCATED: + if (r->com.state != RES_CQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + else + err = 0; + break; + + case RES_CQ_HW: + if (r->com.state != RES_CQ_ALLOCATED) + err = -EINVAL; + else + err = 0; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_CQ_BUSY; + if (cq) + *cq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_srq_states state, struct res_srq **srq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_srq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_SRQ_BUSY: + err = -EINVAL; + break; + + case RES_SRQ_ALLOCATED: + if (r->com.state != RES_SRQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + break; + + case RES_SRQ_HW: + if (r->com.state != RES_SRQ_ALLOCATED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_SRQ_BUSY; + if (srq) + *srq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static void res_abort_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[type], id); + if (r && (r->owner == slave)) + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void res_end_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[type], id); + if (r && (r->owner == slave)) + r->state = r->to_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static int valid_reserved(struct mlx4_dev *dev, int slave, int qpn) +{ + return mlx4_is_qp_reserved(dev, qpn) && + (mlx4_is_master(dev) || mlx4_is_guest_proxy(dev, slave, qpn)); +} + +static int fw_reserved(struct mlx4_dev *dev, int qpn) +{ + return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; +} + +static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err; + int count; + int align; + int base; + int qpn; + u8 bf_qp; + + switch (op) { + case RES_OP_RESERVE: + count = get_param_l(&in_param) & 0xffffff; + bf_qp = get_param_l(&in_param) >> 31; + align = get_param_h(&in_param); + err = mlx4_grant_resource(dev, slave, RES_QP, count, 0); + if (err) + return err; + + err = __mlx4_qp_reserve_range(dev, count, align, &base, bf_qp); + if (err) { + mlx4_release_resource(dev, slave, RES_QP, count, 0); + return err; + } + + err = add_res_range(dev, slave, base, count, RES_QP, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_QP, count, 0); + __mlx4_qp_release_range(dev, base, count); + return err; + } + set_param_l(out_param, base); + break; + case RES_OP_MAP_ICM: + qpn = get_param_l(&in_param) & 0x7fffff; + if (valid_reserved(dev, slave, qpn)) { + err = add_res_range(dev, slave, qpn, 1, RES_QP, 0); + if (err) + return err; + } + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, + NULL, 1); + if (err) + return err; + + if (!fw_reserved(dev, qpn)) { + err = __mlx4_qp_alloc_icm(dev, qpn); + if (err) { + res_abort_move(dev, slave, RES_QP, qpn); + return err; + } + } + + res_end_move(dev, slave, RES_QP, qpn); + break; + + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + order = get_param_l(&in_param); + + err = mlx4_grant_resource(dev, slave, RES_MTT, 1 << order, 0); + if (err) + return err; + + base = __mlx4_alloc_mtt_range(dev, order); + if (base == -1) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + return -ENOMEM; + } + + err = add_res_range(dev, slave, base, 1, RES_MTT, order); + if (err) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + __mlx4_free_mtt_range(dev, base, order); + } else + set_param_l(out_param, base); + + return err; +} + +static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + + switch (op) { + case RES_OP_RESERVE: + err = mlx4_grant_resource(dev, slave, RES_MPT, 1, 0); + if (err) + break; + + index = __mlx4_mr_reserve(dev); + if (index == -1) { + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + break; + } + id = index & mpt_mask(dev); + + err = add_res_range(dev, slave, id, 1, RES_MPT, index); + if (err) { + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + __mlx4_mr_release(dev, index); + break; + } + set_param_l(out_param, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_MAPPED, &mpt); + if (err) + return err; + + err = __mlx4_mr_alloc_icm(dev, mpt->key); + if (err) { + res_abort_move(dev, slave, RES_MPT, id); + return err; + } + + res_end_move(dev, slave, RES_MPT, id); + break; + } + return err; +} + +static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = mlx4_grant_resource(dev, slave, RES_CQ, 1, 0); + if (err) + break; + + err = __mlx4_cq_alloc_icm(dev, &cqn); + if (err) { + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + break; + } + + err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + __mlx4_cq_free_icm(dev, cqn); + break; + } + + set_param_l(out_param, cqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = mlx4_grant_resource(dev, slave, RES_SRQ, 1, 0); + if (err) + break; + + err = __mlx4_srq_alloc_icm(dev, &srqn); + if (err) { + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + break; + } + + err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + __mlx4_srq_free_icm(dev, srqn); + break; + } + + set_param_l(out_param, srqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port, + u8 smac_index, u64 *mac) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->smac_index == smac_index && res->port == (u8) port) { + *mac = res->mac; + return 0; + } + } + return -ENOENT; +} + +static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + /* mac found. update ref count */ + ++res->ref_count; + return 0; + } + } + + if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port)) + return -EINVAL; + res = kzalloc(sizeof *res, GFP_KERNEL); + if (!res) { + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + return -ENOMEM; + } + res->mac = mac; + res->port = (u8) port; + res->smac_index = smac_index; + res->ref_count = 1; + list_add_tail(&res->list, + &tracker->slave_list[slave].res_list[RES_MAC]); + return 0; +} + + +static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + kfree(res); + } + break; + } + } +} + +static void rem_slave_macs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + int i; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + list_del(&res->list); + /* dereference the mac the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_mac(dev, res->port, res->mac); + mlx4_release_resource(dev, slave, RES_MAC, 1, res->port); + kfree(res); + } +} + +static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + int err = -EINVAL; + int port; + u64 mac; + u8 smac_index = 0; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + port = !in_port ? get_param_l(out_param) : in_port; + mac = in_param; + + err = __mlx4_register_mac(dev, port, mac); + if (err >= 0) { + smac_index = err; + set_param_l(out_param, err); + err = 0; + } + + if (!err) { + err = mac_add_to_slave(dev, slave, mac, port, smac_index); + if (err) + __mlx4_unregister_mac(dev, port, mac); + } + return err; +} + +static int vlan_add_to_slave(struct mlx4_dev *dev, int slave, u16 vlan, + int port, int vlan_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + if (res->vlan == vlan && res->port == (u8) port) { + /* vlan found. update ref count */ + ++res->ref_count; + return 0; + } + } + + if (mlx4_grant_resource(dev, slave, RES_VLAN, 1, port)) + return -EINVAL; + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + mlx4_release_resource(dev, slave, RES_VLAN, 1, port); + return -ENOMEM; + } + res->vlan = vlan; + res->port = (u8) port; + res->vlan_index = vlan_index; + res->ref_count = 1; + list_add_tail(&res->list, + &tracker->slave_list[slave].res_list[RES_VLAN]); + return 0; +} + + +static void vlan_del_from_slave(struct mlx4_dev *dev, int slave, u16 vlan, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + if (res->vlan == vlan && res->port == (u8) port) { + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_VLAN, + 1, port); + kfree(res); + } + break; + } + } +} + +static void rem_slave_vlans(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + int i; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + list_del(&res->list); + /* dereference the vlan the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_vlan(dev, res->port, res->vlan); + mlx4_release_resource(dev, slave, RES_VLAN, 1, res->port); + kfree(res); + } +} + +static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port) +{ + int err = -EINVAL; + u16 vlan; + int vlan_index; + + if (!port) + return err; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + vlan = (u16) in_param; + + err = __mlx4_register_vlan(dev, port, vlan, &vlan_index); + if (!err) { + set_param_l(out_param, (u32) vlan_index); + err = vlan_add_to_slave(dev, slave, vlan, port, vlan_index); + if (err) + __mlx4_unregister_vlan(dev, port, vlan); + } + return err; +} + +static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + u32 index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = mlx4_grant_resource(dev, slave, RES_COUNTER, 1, 0); + if (err) + return err; + + err = __mlx4_counter_alloc(dev, &index); + if (err) { + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + return err; + } + + err = add_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) { + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } else { + set_param_l(out_param, index); + } + + return err; +} + +static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + u32 xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = __mlx4_xrcd_alloc(dev, &xrcdn); + if (err) + return err; + + err = add_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0); + if (err) + __mlx4_xrcd_free(dev, xrcdn); + else + set_param_l(out_param, xrcdn); + + return err; +} + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier & 0xFF) { + case RES_QP: + err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MTT: + err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_CQ: + err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MAC: + err = mac_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_VLAN: + err = vlan_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_COUNTER: + err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCD: + err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err; + int count; + int base; + int qpn; + + switch (op) { + case RES_OP_RESERVE: + base = get_param_l(&in_param) & 0x7fffff; + count = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, count, RES_QP, 0); + if (err) + break; + mlx4_release_resource(dev, slave, RES_QP, count, 0); + __mlx4_qp_release_range(dev, base, count); + break; + case RES_OP_MAP_ICM: + qpn = get_param_l(&in_param) & 0x7fffff; + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_RESERVED, + NULL, 0); + if (err) + return err; + + if (!fw_reserved(dev, qpn)) + __mlx4_qp_free_icm(dev, qpn); + + res_end_move(dev, slave, RES_QP, qpn); + + if (valid_reserved(dev, slave, qpn)) + err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0); + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + base = get_param_l(&in_param); + order = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, 1, RES_MTT, order); + if (!err) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + __mlx4_free_mtt_range(dev, base, order); + } + return err; +} + +static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + + switch (op) { + case RES_OP_RESERVE: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + break; + index = mpt->key; + put_res(dev, slave, id, RES_MPT); + + err = rem_res_range(dev, slave, id, 1, RES_MPT, 0); + if (err) + break; + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + __mlx4_mr_release(dev, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_RESERVED, &mpt); + if (err) + return err; + + __mlx4_mr_free_icm(dev, mpt->key); + res_end_move(dev, slave, RES_MPT, id); + return err; + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + cqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) + break; + + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + __mlx4_cq_free_icm(dev, cqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + srqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) + break; + + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + __mlx4_srq_free_icm(dev, srqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + int port; + int err = 0; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + port = !in_port ? get_param_l(out_param) : in_port; + mac_del_from_slave(dev, slave, in_param, port); + __mlx4_unregister_mac(dev, port, in_param); + break; + default: + err = -EINVAL; + break; + } + + return err; + +} + +static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port) +{ + int err = 0; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + if (!port) + return -EINVAL; + vlan_del_from_slave(dev, slave, in_param, port); + __mlx4_unregister_vlan(dev, port, in_param); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + index = get_param_l(&in_param); + err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) + return err; + + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + + return err; +} + +static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + xrcdn = get_param_l(&in_param); + err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0); + if (err) + return err; + + __mlx4_xrcd_free(dev, xrcdn); + + return err; +} + +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = -EINVAL; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier & 0xFF) { + case RES_QP: + err = qp_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_MTT: + err = mtt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_CQ: + err = cq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MAC: + err = mac_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_VLAN: + err = vlan_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_COUNTER: + err = counter_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCD: + err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + + default: + break; + } + return err; +} + +/* ugly but other choices are uglier */ +static int mr_phys_mpt(struct mlx4_mpt_entry *mpt) +{ + return (be32_to_cpu(mpt->flags) >> 9) & 1; +} + +static int mr_get_mtt_addr(struct mlx4_mpt_entry *mpt) +{ + return (int)be64_to_cpu(mpt->mtt_addr) & 0xfffffff8; +} + +static int mr_get_mtt_size(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->mtt_sz); +} + +static int qp_get_mtt_addr(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8; +} + +static int srq_get_mtt_addr(struct mlx4_srq_context *srqc) +{ + return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int qp_get_mtt_size(struct mlx4_qp_context *qpc) +{ + int page_shift = (qpc->log_page_size & 0x3f) + 12; + int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf; + int log_sq_sride = qpc->sq_size_stride & 7; + int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf; + int log_rq_stride = qpc->rq_size_stride & 7; + int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1; + int rss = (be32_to_cpu(qpc->flags) >> 13) & 1; + int xrc = (be32_to_cpu(qpc->local_qpn) >> 23) & 1; + int sq_size; + int rq_size; + int total_pages; + int total_mem; + int page_offset = (be32_to_cpu(qpc->params2) >> 6) & 0x3f; + + sq_size = 1 << (log_sq_size + log_sq_sride + 4); + rq_size = (srq|rss|xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4)); + total_mem = sq_size + rq_size; + total_pages = + roundup_pow_of_two((total_mem + (page_offset << 6)) >> + page_shift); + + return total_pages; +} + +static int check_mtt_range(struct mlx4_dev *dev, int slave, int start, + int size, struct res_mtt *mtt) +{ + int res_start = mtt->com.res_id; + int res_size = (1 << mtt->order); + + if (start < res_start || start + size > res_start + res_size) + return -EPERM; + return 0; +} + +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_mpt *mpt; + int mtt_base = mr_get_mtt_addr(inbox->buf) / dev->caps.mtt_entry_sz; + int phys; + int id; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt); + if (err) + return err; + + phys = mr_phys_mpt(inbox->buf); + if (!phys) { + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + + err = check_mtt_range(dev, slave, mtt_base, + mr_get_mtt_size(inbox->buf), mtt); + if (err) + goto ex_put; + + mpt->mtt = mtt; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + if (!phys) { + atomic_inc(&mtt->ref_count); + put_res(dev, slave, mtt->com.res_id, RES_MTT); + } + + res_end_move(dev, slave, RES_MPT, id); + return 0; + +ex_put: + if (!phys) + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + + res_end_move(dev, slave, RES_MPT, id); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + return err; + + if (mpt->com.from_state != RES_MPT_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + +out: + put_res(dev, slave, id, RES_MPT); + return err; +} + +static int qp_get_rcqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_recv) & 0xffffff; +} + +static int qp_get_scqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_send) & 0xffffff; +} + +static u32 qp_get_srqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->srqn) & 0x1ffffff; +} + +static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr, + struct mlx4_qp_context *context) +{ + u32 qpn = vhcr->in_modifier & 0xffffff; + u32 qkey = 0; + + if (mlx4_get_parav_qkey(dev, qpn, &qkey)) + return; + + /* adjust qkey in qp context */ + context->qkey = cpu_to_be32(qkey); +} + +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_mtt *mtt; + struct res_qp *qp; + struct mlx4_qp_context *qpc = inbox->buf + 8; + int mtt_base = qp_get_mtt_addr(qpc) / dev->caps.mtt_entry_sz; + int mtt_size = qp_get_mtt_size(qpc); + struct res_cq *rcq; + struct res_cq *scq; + int rcqn = qp_get_rcqn(qpc); + int scqn = qp_get_scqn(qpc); + u32 srqn = qp_get_srqn(qpc) & 0xffffff; + int use_srq = (qp_get_srqn(qpc) >> 24) & 1; + struct res_srq *srq; + int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0); + if (err) + return err; + qp->local_qpn = local_qpn; + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) + goto ex_put_mtt; + + err = get_res(dev, slave, rcqn, RES_CQ, &rcq); + if (err) + goto ex_put_mtt; + + if (scqn != rcqn) { + err = get_res(dev, slave, scqn, RES_CQ, &scq); + if (err) + goto ex_put_rcq; + } else + scq = rcq; + + if (use_srq) { + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + goto ex_put_scq; + } + + adjust_proxy_tun_qkey(dev, vhcr, qpc); + update_pkey_index(dev, slave, inbox); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put_srq; + atomic_inc(&mtt->ref_count); + qp->mtt = mtt; + atomic_inc(&rcq->ref_count); + qp->rcq = rcq; + atomic_inc(&scq->ref_count); + qp->scq = scq; + + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); + + if (use_srq) { + atomic_inc(&srq->ref_count); + put_res(dev, slave, srqn, RES_SRQ); + qp->srq = srq; + } + put_res(dev, slave, rcqn, RES_CQ); + put_res(dev, slave, mtt_base, RES_MTT); + res_end_move(dev, slave, RES_QP, qpn); + + return 0; + +ex_put_srq: + if (use_srq) + put_res(dev, slave, srqn, RES_SRQ); +ex_put_scq: + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); +ex_put_rcq: + put_res(dev, slave, rcqn, RES_CQ); +ex_put_mtt: + put_res(dev, slave, mtt_base, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static int eq_get_mtt_addr(struct mlx4_eq_context *eqc) +{ + return be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int eq_get_mtt_size(struct mlx4_eq_context *eqc) +{ + int log_eq_size = eqc->log_eq_size & 0x1f; + int page_shift = (eqc->log_page_size & 0x3f) + 12; + + if (log_eq_size + 5 < page_shift) + return 1; + + return 1 << (log_eq_size + 5 - page_shift); +} + +static int cq_get_mtt_addr(struct mlx4_cq_context *cqc) +{ + return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int cq_get_mtt_size(struct mlx4_cq_context *cqc) +{ + int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f; + int page_shift = (cqc->log_page_size & 0x3f) + 12; + + if (log_cq_size + 5 < page_shift) + return 1; + + return 1 << (log_cq_size + 5 - page_shift); +} + +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int eqn = vhcr->in_modifier; + int res_id = (slave << 8) | eqn; + struct mlx4_eq_context *eqc = inbox->buf; + int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz; + int mtt_size = eq_get_mtt_size(eqc); + struct res_eq *eq; + struct res_mtt *mtt; + + err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0); + if (err) + return err; + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq); + if (err) + goto out_add; + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto out_move; + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) + goto out_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_put; + + atomic_inc(&mtt->ref_count); + eq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_EQ, res_id); +out_add: + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + return err; +} + +static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, + int len, struct res_mtt **res) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mtt *mtt; + int err = -EINVAL; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT], + com.list) { + if (!check_mtt_range(dev, slave, start, len, mtt)) { + *res = mtt; + mtt->com.from_state = mtt->com.state; + mtt->com.state = RES_MTT_BUSY; + err = 0; + break; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int verify_qp_parameters(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *inbox, + enum qp_transition transition, u8 slave) +{ + u32 qp_type; + struct mlx4_qp_context *qp_ctx; + enum mlx4_qp_optpar optpar; + int port; + int num_gids; + + qp_ctx = inbox->buf + 8; + qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + optpar = be32_to_cpu(*(__be32 *) inbox->buf); + + switch (qp_type) { + case MLX4_QP_ST_RC: + case MLX4_QP_ST_UC: + switch (transition) { + case QP_TRANS_INIT2RTR: + case QP_TRANS_RTR2RTS: + case QP_TRANS_RTS2RTS: + case QP_TRANS_SQD2SQD: + case QP_TRANS_SQD2RTS: + if (slave != mlx4_master_func_num(dev)) + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave); + else + num_gids = 1; + if (qp_ctx->pri_path.mgid_index >= num_gids) + return -EINVAL; + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave); + else + num_gids = 1; + if (qp_ctx->alt_path.mgid_index >= num_gids) + return -EINVAL; + } + break; + default: + break; + } + + break; + default: + break; + } + + return 0; +} + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_mtt mtt; + __be64 *page_list = inbox->buf; + u64 *pg_list = (u64 *)page_list; + int i; + struct res_mtt *rmtt = NULL; + int start = be64_to_cpu(page_list[0]); + int npages = vhcr->in_modifier; + int err; + + err = get_containing_mtt(dev, slave, start, npages, &rmtt); + if (err) + return err; + + /* Call the SW implementation of write_mtt: + * - Prepare a dummy mtt struct + * - Translate inbox contents to simple addresses in host endianess */ + mtt.offset = 0; /* TBD this is broken but I don't handle it since + we don't really use it */ + mtt.order = 0; + mtt.page_shift = 0; + for (i = 0; i < npages; ++i) + pg_list[i + 2] = (be64_to_cpu(page_list[i + 2]) & ~1ULL); + + err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages, + ((u64 *)page_list + 2)); + + if (rmtt) + put_res(dev, slave, rmtt->com.res_id, RES_MTT); + + return err; +} + +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 8); + struct res_eq *eq; + int err; + + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq); + if (err) + return err; + + err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL); + if (err) + goto ex_abort; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + atomic_dec(&eq->mtt->ref_count); + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + + return 0; + +ex_put: + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_EQ, res_id); + + return err; +} + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq; + struct mlx4_cmd_mailbox *mailbox; + u32 in_modifier = 0; + int err; + int res_id; + struct res_eq *req; + + if (!priv->mfunc.master.slave_state) + return -EINVAL; + + event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type]; + + /* Create the event only if the slave is registered */ + if (event_eq->eqn < 0) + return 0; + + mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]); + res_id = (slave << 8) | event_eq->eqn; + err = get_res(dev, slave, res_id, RES_EQ, &req); + if (err) + goto unlock; + + if (req->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto put; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto put; + } + + if (eqe->type == MLX4_EVENT_TYPE_CMD) { + ++event_eq->token; + eqe->event.cmd.token = cpu_to_be16(event_eq->token); + } + + memcpy(mailbox->buf, (u8 *) eqe, 28); + + in_modifier = (slave & 0xff) | ((event_eq->eqn & 0xff) << 16); + + err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0, + MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + put_res(dev, slave, res_id, RES_EQ); + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + +put: + put_res(dev, slave, res_id, RES_EQ); + +unlock: + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + return err; +} + +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 8); + struct res_eq *eq; + int err; + + err = get_res(dev, slave, res_id, RES_EQ, &eq); + if (err) + return err; + + if (eq->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + +ex_put: + put_res(dev, slave, res_id, RES_EQ); + return err; +} + +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; + struct res_cq *cq; + struct res_mtt *mtt; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq); + if (err) + return err; + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto out_move; + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) + goto out_put; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_put; + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct res_cq *cq; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_move; + atomic_dec(&cq->mtt->ref_count); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int handle_resize(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd, + struct res_cq *cq) +{ + int err; + struct res_mtt *orig_mtt; + struct res_mtt *mtt; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; + + err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt); + if (err) + return err; + + if (orig_mtt != cq->mtt) { + err = -EINVAL; + goto ex_put; + } + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_put; + + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) + goto ex_put1; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put1; + atomic_dec(&orig_mtt->ref_count); + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + return 0; + +ex_put1: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_put: + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + + return err; + +} + +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + if (vhcr->op_modifier == 0) { + err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq); + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int srq_get_mtt_size(struct mlx4_srq_context *srqc) +{ + int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf; + int log_rq_stride = srqc->logstride & 7; + int page_shift = (srqc->log_page_size & 0x3f) + 12; + + if (log_srq_size + log_rq_stride + 4 < page_shift) + return 1; + + return 1 << (log_srq_size + log_rq_stride + 4 - page_shift); +} + +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_srq *srq; + struct mlx4_srq_context *srqc = inbox->buf; + int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz; + + if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff)) + return -EINVAL; + + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq); + if (err) + return err; + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc), + mtt); + if (err) + goto ex_put_mtt; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put_mtt; + + atomic_inc(&mtt->ref_count); + srq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_SRQ, srqn); + return 0; + +ex_put_mtt: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) + atomic_dec(&srq->cq->ref_count); + res_end_move(dev, slave, RES_SRQ, srqn); + + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + return err; + if (srq->com.from_state != RES_SRQ_HW) { + err = -EBUSY; + goto out; + } + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + return err; + + if (srq->com.from_state != RES_SRQ_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + err = get_res(dev, slave, qpn, RES_QP, &qp); + if (err) + return err; + if (qp->com.from_state != RES_QP_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +static int roce_verify_mac(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + u64 mac; + int port; + u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + u8 sched = *(u8 *)(inbox->buf + 64); + u8 smac_ix; + + port = (sched >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) { + smac_ix = qpc->pri_path.grh_mylmc & 0x7f; + if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac)) + return -ENOENT; + } + return 0; +} + +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *qpc = inbox->buf + 8; + + err = verify_qp_parameters(dev, inbox, QP_TRANS_INIT2RTR, slave); + if (err) + return err; + + if (roce_verify_mac(dev, slave, qpc, inbox)) + return -EINVAL; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, qpc); + err = update_vport_qp_param(dev, inbox, slave); + if (err) + return err; + + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = verify_qp_parameters(dev, inbox, QP_TRANS_RTR2RTS, slave); + if (err) + return err; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = verify_qp_parameters(dev, inbox, QP_TRANS_RTS2RTS, slave); + if (err) + return err; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + + +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2SQD, slave); + if (err) + return err; + + adjust_proxy_tun_qkey(dev, vhcr, context); + update_gid(dev, inbox, (u8)slave); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2RTS, slave); + if (err) + return err; + + adjust_proxy_tun_qkey(dev, vhcr, context); + update_gid(dev, inbox, (u8)slave); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + atomic_dec(&qp->mtt->ref_count); + atomic_dec(&qp->rcq->ref_count); + atomic_dec(&qp->scq->ref_count); + if (qp->srq) + atomic_dec(&qp->srq->ref_count); + res_end_move(dev, slave, RES_QP, qpn); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static struct res_gid *find_gid(struct mlx4_dev *dev, int slave, + struct res_qp *rqp, u8 *gid) +{ + struct res_gid *res; + + list_for_each_entry(res, &rqp->mcg_list, list) { + if (!memcmp(res->gid, gid, 16)) + return res; + } + return NULL; +} + +static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer) +{ + struct res_gid *res; + int err; + + res = kzalloc(sizeof *res, GFP_KERNEL); + if (!res) + return -ENOMEM; + + spin_lock_irq(&rqp->mcg_spl); + if (find_gid(dev, slave, rqp, gid)) { + kfree(res); + err = -EEXIST; + } else { + memcpy(res->gid, gid, 16); + res->prot = prot; + res->steer = steer; + list_add_tail(&res->list, &rqp->mcg_list); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer) +{ + struct res_gid *res; + int err; + + spin_lock_irq(&rqp->mcg_spl); + res = find_gid(dev, slave, rqp, gid); + if (!res || res->prot != prot || res->steer != steer) + err = -EINVAL; + else { + list_del(&res->list); + kfree(res); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp qp; /* dummy for calling attach/detach */ + u8 *gid = inbox->buf; + enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7; + int err; + int qpn; + struct res_qp *rqp; + int attach = vhcr->op_modifier; + int block_loopback = vhcr->in_modifier >> 31; + u8 steer_type_mask = 2; + enum mlx4_steer_type type = (gid[7] & steer_type_mask) >> 1; + + qpn = vhcr->in_modifier & 0xffffff; + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) + return err; + + qp.qpn = qpn; + if (attach) { + err = add_mcg_res(dev, slave, rqp, gid, prot, type); + if (err) + goto ex_put; + + err = mlx4_qp_attach_common(dev, &qp, gid, + block_loopback, prot, type); + if (err) + goto ex_rem; + } else { + err = rem_mcg_res(dev, slave, rqp, gid, prot, type); + if (err) + goto ex_put; + err = mlx4_qp_detach_common(dev, &qp, gid, prot, type); + } + + put_res(dev, slave, qpn, RES_QP); + return 0; + +ex_rem: + /* ignore error return below, already in error */ + (void) rem_mcg_res(dev, slave, rqp, gid, prot, type); +ex_put: + put_res(dev, slave, qpn, RES_QP); + + return err; +} + +/* + * MAC validation for Flow Steering rules. + * VF can attach rules only with a mac address which is assigned to it. + */ + +static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header, + struct list_head *rlist) +{ + struct mac_res *res, *tmp; + __be64 be_mac; + + /* make sure it isn't multicast or broadcast mac*/ + if (!is_multicast_ether_addr(eth_header->eth.dst_mac) && + !is_broadcast_ether_addr(eth_header->eth.dst_mac)) { + list_for_each_entry_safe(res, tmp, rlist, list) { + be_mac = cpu_to_be64(res->mac << 16); + if (!memcmp(&be_mac, eth_header->eth.dst_mac, ETH_ALEN)) + return 0; + } + pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n", + eth_header->eth.dst_mac, slave); + return -EINVAL; + } + return 0; +} + +/* + * In case of missing eth header, append eth header with a MAC address + * assigned to the VF. + */ +static int add_eth_header(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox, + struct list_head *rlist, int header_id) +{ + struct mac_res *res, *tmp; + u8 port; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct mlx4_net_trans_rule_hw_eth *eth_header; + struct mlx4_net_trans_rule_hw_ipv4 *ip_header; + struct mlx4_net_trans_rule_hw_tcp_udp *l4_header; + __be64 be_mac = 0; + __be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + port = ctrl->port; + eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1); + + /* Clear a space in the inbox for eth header */ + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_IPV4: + ip_header = + (struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1); + memmove(ip_header, eth_header, + sizeof(*ip_header) + sizeof(*l4_header)); + break; + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *) + (eth_header + 1); + memmove(l4_header, eth_header, sizeof(*l4_header)); + break; + default: + return -EINVAL; + } + list_for_each_entry_safe(res, tmp, rlist, list) { + if (port == res->port) { + be_mac = cpu_to_be64(res->mac << 16); + break; + } + } + if (!be_mac) { + pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d .\n", + port); + return -EINVAL; + } + + memset(eth_header, 0, sizeof(*eth_header)); + eth_header->size = sizeof(*eth_header) >> 2; + eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]); + memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN); + memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN); + + return 0; + +} + +int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC]; + int err; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct _rule_hw *rule_header; + int header_id; + + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + return -EOPNOTSUPP; + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + rule_header = (struct _rule_hw *)(ctrl + 1); + header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id)); + + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + if (validate_eth_header_mac(slave, rule_header, rlist)) + return -EINVAL; + break; + case MLX4_NET_TRANS_RULE_ID_IB: + break; + case MLX4_NET_TRANS_RULE_ID_IPV4: + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + pr_warn("Can't attach FS rule without L2 headers, adding L2 header.\n"); + if (add_eth_header(dev, slave, inbox, rlist, header_id)) + return -EINVAL; + vhcr->in_modifier += + sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2; + break; + default: + pr_err("Corrupted mailbox.\n"); + return -EINVAL; + } + + err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param, + vhcr->in_modifier, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + return err; + + err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to add flow steering resources.\n "); + /* detach rule*/ + mlx4_cmd(dev, vhcr->out_param, 0, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + } + return err; +} + +int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + return -EOPNOTSUPP; + + err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to remove flow steering resources.\n "); + return err; + } + + err = mlx4_cmd(dev, vhcr->in_param, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + return err; +} + +enum { + BUSY_MAX_RETRIES = 10 +}; + +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier & 0xffff; + + err = get_res(dev, slave, index, RES_COUNTER, NULL); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + put_res(dev, slave, index, RES_COUNTER); + return err; +} + +static void detach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp) +{ + struct res_gid *rgid; + struct res_gid *tmp; + struct mlx4_qp qp; /* dummy for calling attach/detach */ + + list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) { + qp.qpn = rqp->local_qpn; + (void) mlx4_qp_detach_common(dev, &qp, rgid->gid, rgid->prot, + rgid->steer); + list_del(&rgid->list); + kfree(rgid); + } +} + +static int _move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int print) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[type]; + struct res_common *r; + struct res_common *tmp; + int busy; + + busy = 0; + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(r, tmp, rlist, list) { + if (r->owner == slave) { + if (!r->removing) { + if (r->state == RES_ANY_BUSY) { + if (print) + mlx4_dbg(dev, + "%s id 0x%llx is busy\n", + ResourceType(type), + r->res_id); + ++busy; + } else { + r->from_state = r->state; + r->state = RES_ANY_BUSY; + r->removing = 1; + } + } + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return busy; +} + +static int move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource type) +{ + unsigned long begin; + int busy; + + begin = jiffies; + do { + busy = _move_all_busy(dev, slave, type, 0); + if (time_after(jiffies, begin + 5 * HZ)) + break; + if (busy) + cond_resched(); + } while (busy); + + if (busy) + busy = _move_all_busy(dev, slave, type, 1); + + return busy; +} +static void rem_slave_qps(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *qp_list = + &tracker->slave_list[slave].res_list[RES_QP]; + struct res_qp *qp; + struct res_qp *tmp; + int state; + u64 in_param; + int qpn; + int err; + + err = move_all_busy(dev, slave, RES_QP); + if (err) + mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy" + "for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(qp, tmp, qp_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (qp->com.owner == slave) { + qpn = qp->com.res_id; + detach_qp(dev, slave, qp); + state = qp->com.from_state; + while (state != 0) { + switch (state) { + case RES_QP_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&qp->com.node, + &tracker->res_tree[RES_QP]); + list_del(&qp->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(qp); + state = 0; + break; + case RES_QP_MAPPED: + if (!valid_reserved(dev, slave, qpn)) + __mlx4_qp_free_icm(dev, qpn); + state = RES_QP_RESERVED; + break; + case RES_QP_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, + qp->local_qpn, 2, + MLX4_CMD_2RST_QP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_qps: failed" + " to move slave %d qpn %d to" + " reset\n", slave, + qp->local_qpn); + atomic_dec(&qp->rcq->ref_count); + atomic_dec(&qp->scq->ref_count); + atomic_dec(&qp->mtt->ref_count); + if (qp->srq) + atomic_dec(&qp->srq->ref_count); + state = RES_QP_MAPPED; + break; + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_srqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *srq_list = + &tracker->slave_list[slave].res_list[RES_SRQ]; + struct res_srq *srq; + struct res_srq *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int srqn; + int err; + + err = move_all_busy(dev, slave, RES_SRQ); + if (err) + mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(srq, tmp, srq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (srq->com.owner == slave) { + srqn = srq->com.res_id; + state = srq->com.from_state; + while (state != 0) { + switch (state) { + case RES_SRQ_ALLOCATED: + __mlx4_srq_free_icm(dev, srqn); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&srq->com.node, + &tracker->res_tree[RES_SRQ]); + list_del(&srq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(srq); + state = 0; + break; + + case RES_SRQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, srqn, 1, + MLX4_CMD_HW2SW_SRQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_srqs: failed" + " to move slave %d srq %d to" + " SW ownership\n", + slave, srqn); + + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) + atomic_dec(&srq->cq->ref_count); + state = RES_SRQ_ALLOCATED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_cqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *cq_list = + &tracker->slave_list[slave].res_list[RES_CQ]; + struct res_cq *cq; + struct res_cq *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int cqn; + int err; + + err = move_all_busy(dev, slave, RES_CQ); + if (err) + mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(cq, tmp, cq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (cq->com.owner == slave && !atomic_read(&cq->ref_count)) { + cqn = cq->com.res_id; + state = cq->com.from_state; + while (state != 0) { + switch (state) { + case RES_CQ_ALLOCATED: + __mlx4_cq_free_icm(dev, cqn); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&cq->com.node, + &tracker->res_tree[RES_CQ]); + list_del(&cq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(cq); + state = 0; + break; + + case RES_CQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, cqn, 1, + MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_cqs: failed" + " to move slave %d cq %d to" + " SW ownership\n", + slave, cqn); + atomic_dec(&cq->mtt->ref_count); + state = RES_CQ_ALLOCATED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mrs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mpt_list = + &tracker->slave_list[slave].res_list[RES_MPT]; + struct res_mpt *mpt; + struct res_mpt *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int mptn; + int err; + + err = move_all_busy(dev, slave, RES_MPT); + if (err) + mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mpt->com.owner == slave) { + mptn = mpt->com.res_id; + state = mpt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MPT_RESERVED: + __mlx4_mr_release(dev, mpt->key); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&mpt->com.node, + &tracker->res_tree[RES_MPT]); + list_del(&mpt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(mpt); + state = 0; + break; + + case RES_MPT_MAPPED: + __mlx4_mr_free_icm(dev, mpt->key); + state = RES_MPT_RESERVED; + break; + + case RES_MPT_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, mptn, 0, + MLX4_CMD_HW2SW_MPT, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_mrs: failed" + " to move slave %d mpt %d to" + " SW ownership\n", + slave, mptn); + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + state = RES_MPT_MAPPED; + break; + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mtts(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *mtt_list = + &tracker->slave_list[slave].res_list[RES_MTT]; + struct res_mtt *mtt; + struct res_mtt *tmp; + int state; + LIST_HEAD(tlist); + int base; + int err; + + err = move_all_busy(dev, slave, RES_MTT); + if (err) + mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mtt->com.owner == slave) { + base = mtt->com.res_id; + state = mtt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MTT_ALLOCATED: + __mlx4_free_mtt_range(dev, base, + mtt->order); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&mtt->com.node, + &tracker->res_tree[RES_MTT]); + list_del(&mtt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(mtt); + state = 0; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *fs_rule_list = + &tracker->slave_list[slave].res_list[RES_FS_RULE]; + struct res_fs_rule *fs_rule; + struct res_fs_rule *tmp; + int state; + u64 base; + int err; + + err = move_all_busy(dev, slave, RES_FS_RULE); + if (err) + mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (fs_rule->com.owner == slave) { + base = fs_rule->com.res_id; + state = fs_rule->com.from_state; + while (state != 0) { + switch (state) { + case RES_FS_RULE_ALLOCATED: + /* detach rule */ + err = mlx4_cmd(dev, base, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&fs_rule->com.node, + &tracker->res_tree[RES_FS_RULE]); + list_del(&fs_rule->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(fs_rule); + state = 0; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_eqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *eq_list = + &tracker->slave_list[slave].res_list[RES_EQ]; + struct res_eq *eq; + struct res_eq *tmp; + int err; + int state; + LIST_HEAD(tlist); + int eqn; + struct mlx4_cmd_mailbox *mailbox; + + err = move_all_busy(dev, slave, RES_EQ); + if (err) + mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(eq, tmp, eq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (eq->com.owner == slave) { + eqn = eq->com.res_id; + state = eq->com.from_state; + while (state != 0) { + switch (state) { + case RES_EQ_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&eq->com.node, + &tracker->res_tree[RES_EQ]); + list_del(&eq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(eq); + state = 0; + break; + + case RES_EQ_HW: + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + cond_resched(); + continue; + } + err = mlx4_cmd_box(dev, slave, 0, + eqn & 0xff, 0, + MLX4_CMD_HW2SW_EQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_eqs: failed" + " to move slave %d eqs %d to" + " SW ownership\n", slave, eqn); + mlx4_free_cmd_mailbox(dev, mailbox); + atomic_dec(&eq->mtt->ref_count); + state = RES_EQ_RESERVED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_counters(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *counter_list = + &tracker->slave_list[slave].res_list[RES_COUNTER]; + struct res_counter *counter; + struct res_counter *tmp; + int err; + int index; + + err = move_all_busy(dev, slave, RES_COUNTER); + if (err) + mlx4_warn(dev, "rem_slave_counters: Could not move all counters to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(counter, tmp, counter_list, com.list) { + if (counter->com.owner == slave) { + index = counter->com.res_id; + rb_erase(&counter->com.node, + &tracker->res_tree[RES_COUNTER]); + list_del(&counter->com.list); + kfree(counter); + __mlx4_counter_free(dev, index); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *xrcdn_list = + &tracker->slave_list[slave].res_list[RES_XRCD]; + struct res_xrcdn *xrcd; + struct res_xrcdn *tmp; + int err; + int xrcdn; + + err = move_all_busy(dev, slave, RES_XRCD); + if (err) + mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns to " + "busy for slave %d\n", slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) { + if (xrcd->com.owner == slave) { + xrcdn = xrcd->com.res_id; + rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]); + list_del(&xrcd->com.list); + kfree(xrcd); + __mlx4_xrcd_free(dev, xrcdn); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); + rem_slave_macs(dev, slave); + rem_slave_vlans(dev, slave); + rem_slave_qps(dev, slave); + rem_slave_srqs(dev, slave); + rem_slave_cqs(dev, slave); + rem_slave_mrs(dev, slave); + rem_slave_eqs(dev, slave); + rem_slave_mtts(dev, slave); + rem_slave_counters(dev, slave); + rem_slave_xrcdns(dev, slave); + rem_slave_fs_rule(dev, slave); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); +} diff --git a/sys/ofed/drivers/net/mlx4/sense.c b/sys/ofed/drivers/net/mlx4/sense.c index 0fcf025e36b5..ba1fb4340e38 100644 --- a/sys/ofed/drivers/net/mlx4/sense.c +++ b/sys/ofed/drivers/net/mlx4/sense.c @@ -38,14 +38,15 @@ #include "mlx4.h" -static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, - enum mlx4_port_type *type) +int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type) { u64 out_param; int err = 0; err = mlx4_cmd_imm(dev, 0, &out_param, port, 0, - MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); if (err) { mlx4_err(dev, "Sense command failed for port: %d\n", port); return err; @@ -53,7 +54,7 @@ static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, if (out_param > 2) { mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param); - return EINVAL; + return -EINVAL; } *type = out_param; @@ -79,20 +80,6 @@ void mlx4_do_sense_ports(struct mlx4_dev *dev, stype[i - 1] = defaults[i - 1]; } - /* - * Adjust port configuration: - * If port 1 sensed nothing and port 2 is IB, set both as IB - * If port 2 sensed nothing and port 1 is Eth, set both as Eth - */ - if (stype[0] == MLX4_PORT_TYPE_ETH) { - for (i = 1; i < dev->caps.num_ports; i++) - stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_ETH; - } - if (stype[dev->caps.num_ports - 1] == MLX4_PORT_TYPE_IB) { - for (i = 0; i < dev->caps.num_ports - 1; i++) - stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_IB; - } - /* * If sensed nothing, remain in current configuration. */ @@ -139,18 +126,26 @@ void mlx4_start_sense(struct mlx4_dev *dev) round_jiffies(MLX4_SENSE_RANGE)); } - void mlx4_stop_sense(struct mlx4_dev *dev) { mlx4_priv(dev)->sense.resched = 0; } -int mlx4_sense_init(struct mlx4_dev *dev) +void mlx4_sense_cleanup(struct mlx4_dev *dev) +{ + mlx4_stop_sense(dev); + cancel_delayed_work(&mlx4_priv(dev)->sense.sense_poll); + destroy_workqueue(mlx4_priv(dev)->sense.sense_wq); +} + + +int mlx4_sense_init(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_sense *sense = &priv->sense; int port; + sense->dev = dev; sense->sense_wq = create_singlethread_workqueue("mlx4_sense"); if (!sense->sense_wq) @@ -159,14 +154,7 @@ int mlx4_sense_init(struct mlx4_dev *dev) for (port = 1; port <= dev->caps.num_ports; port++) sense->do_sense_port[port] = 1; - INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port); - return 0; -} + INIT_DEFERRABLE_WORK(&sense->sense_poll, mlx4_sense_port); -void mlx4_sense_cleanup(struct mlx4_dev *dev) -{ - mlx4_stop_sense(dev); - cancel_delayed_work(&mlx4_priv(dev)->sense.sense_poll); - destroy_workqueue(mlx4_priv(dev)->sense.sense_wq); + return 0; } - diff --git a/sys/ofed/drivers/net/mlx4/srq.c b/sys/ofed/drivers/net/mlx4/srq.c index f856b8ddb66e..321c2388a782 100644 --- a/sys/ofed/drivers/net/mlx4/srq.c +++ b/sys/ofed/drivers/net/mlx4/srq.c @@ -34,31 +34,11 @@ #include #include -#include +#include #include "mlx4.h" #include "icm.h" -struct mlx4_srq_context { - __be32 state_logsize_srqn; - u8 logstride; - u8 reserved1; - __be16 xrc_domain; - __be32 pg_offset_cqn; - u32 reserved2; - u8 log_page_size; - u8 reserved3[2]; - u8 mtt_base_addr_h; - __be32 mtt_base_addr_l; - __be32 pd; - __be16 limit_watermark; - __be16 wqe_cnt; - u16 reserved4; - __be16 wqe_counter; - u32 reserved5; - __be64 db_rec_addr; -}; - void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; @@ -66,8 +46,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) spin_lock(&srq_table->lock); - srq = radix_tree_lookup(&dev->srq_table_tree, - srqn & (dev->caps.num_srqs - 1)); + srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); if (srq) atomic_inc(&srq->refcount); @@ -87,8 +66,9 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int srq_num) { - return mlx4_cmd(dev, mailbox->dma, srq_num, 0, MLX4_CMD_SW2HW_SRQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma, srq_num, 0, + MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); } static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, @@ -96,20 +76,89 @@ static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox { return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark) { return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); } static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int srq_num) { return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + + *srqn = mlx4_bitmap_alloc(&srq_table->bitmap); + if (*srqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &srq_table->table, *srqn); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &srq_table->table, *srqn); + +err_out: + mlx4_bitmap_free(&srq_table->bitmap, *srqn); + return err; +} + +static int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *srqn = get_param_l(&out_param); + + return err; + } + return __mlx4_srq_alloc_icm(dev, srqn); +} + +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + + mlx4_table_put(dev, &srq_table->cmpt_table, srqn); + mlx4_table_put(dev, &srq_table->table, srqn); + mlx4_bitmap_free(&srq_table->bitmap, srqn); +} + +static void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, srqn); + if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed freeing cq:%d\n", srqn); + return; + } + __mlx4_srq_free_icm(dev, srqn); } int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, @@ -121,23 +170,15 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, u64 mtt_addr; int err; - srq->srqn = mlx4_bitmap_alloc(&srq_table->bitmap); - if (srq->srqn == -1) - return -ENOMEM; - - err = mlx4_table_get(dev, &srq_table->table, srq->srqn); + err = mlx4_srq_alloc_icm(dev, &srq->srqn); if (err) - goto err_out; - - err = mlx4_table_get(dev, &srq_table->cmpt_table, srq->srqn); - if (err) - goto err_put; + return err; spin_lock_irq(&srq_table->lock); - err = radix_tree_insert(&dev->srq_table_tree, srq->srqn, srq); + err = radix_tree_insert(&srq_table->tree, srq->srqn, srq); spin_unlock_irq(&srq_table->lock); if (err) - goto err_cmpt_put; + goto err_icm; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { @@ -151,7 +192,7 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) | srq->srqn); srq_context->logstride = srq->wqe_shift - 4; - srq_context->xrc_domain = cpu_to_be16(xrcd); + srq_context->xrcd = cpu_to_be16(xrcd); srq_context->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff); srq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; @@ -173,52 +214,33 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, err_radix: spin_lock_irq(&srq_table->lock); - radix_tree_delete(&dev->srq_table_tree, srq->srqn); + radix_tree_delete(&srq_table->tree, srq->srqn); spin_unlock_irq(&srq_table->lock); -err_cmpt_put: - mlx4_table_put(dev, &srq_table->cmpt_table, srq->srqn); - -err_put: - mlx4_table_put(dev, &srq_table->table, srq->srqn); - -err_out: - mlx4_bitmap_free(&srq_table->bitmap, srq->srqn); - +err_icm: + mlx4_srq_free_icm(dev, srq->srqn); return err; } EXPORT_SYMBOL_GPL(mlx4_srq_alloc); -void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq) +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) { + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; int err; err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn); if (err) mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn); -} -EXPORT_SYMBOL_GPL(mlx4_srq_invalidate); - -void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq) -{ - struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; spin_lock_irq(&srq_table->lock); - radix_tree_delete(&dev->srq_table_tree, srq->srqn); + radix_tree_delete(&srq_table->tree, srq->srqn); spin_unlock_irq(&srq_table->lock); -} -EXPORT_SYMBOL_GPL(mlx4_srq_remove); - -void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) -{ - struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; if (atomic_dec_and_test(&srq->refcount)) complete(&srq->free); wait_for_completion(&srq->free); - mlx4_table_put(dev, &srq_table->table, srq->srqn); - mlx4_bitmap_free(&srq_table->bitmap, srq->srqn); + mlx4_srq_free_icm(dev, srq->srqn); } EXPORT_SYMBOL_GPL(mlx4_srq_free); @@ -257,7 +279,9 @@ int mlx4_init_srq_table(struct mlx4_dev *dev) int err; spin_lock_init(&srq_table->lock); - INIT_RADIX_TREE(&dev->srq_table_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs, dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0); @@ -269,5 +293,7 @@ int mlx4_init_srq_table(struct mlx4_dev *dev) void mlx4_cleanup_srq_table(struct mlx4_dev *dev) { + if (mlx4_is_slave(dev)) + return; mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap); } diff --git a/sys/ofed/drivers/net/mlx4/sys_tune.c b/sys/ofed/drivers/net/mlx4/sys_tune.c new file mode 100644 index 000000000000..0675e90eacf5 --- /dev/null +++ b/sys/ofed/drivers/net/mlx4/sys_tune.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2010 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx4.h" + +#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) + + + +/* Each CPU is put into a group. In most cases, the group number is + * equal to the CPU number of one of the CPUs in the group. The + * exception is group NR_CPUS which is the default group. This is + * protected by sys_tune_startup_mutex. */ +DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS; + +/* For each group, a count of the number of CPUs in the group which + * are known to be busy. A busy CPU might be running the busy loop + * below or general kernel code. The count is decremented on entry to + * the old pm_idle handler and incremented on exit. The aim is to + * avoid the count going to zero or negative. This situation can + * occur temporarily during module unload or CPU hot-plug but + * normality will be restored when the affected CPUs next exit the + * idle loop. */ +static atomic_t busy_cpu_count[NR_CPUS+1]; + +/* A workqueue item to be executed to cause the CPU to exit from the + * idle loop. */ +DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work); + +#define sys_tune_set_state(CPU,STATE) \ + do { } while(0) + + +/* A mutex to protect most of the module datastructures. */ +static DEFINE_MUTEX(sys_tune_startup_mutex); + +/* The old pm_idle handler. */ +static void (*old_pm_idle)(void) = NULL; + +static void sys_tune_pm_idle(void) +{ + atomic_t *busy_cpus_ptr; + int busy_cpus; + int cpu = smp_processor_id(); + + busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]); + + sys_tune_set_state(cpu, 2); + + local_irq_enable(); + while (!need_resched()) { + busy_cpus = atomic_read(busy_cpus_ptr); + + /* If other CPUs in this group are busy then let this + * CPU go idle. We mustn't let the number of busy + * CPUs drop below 1. */ + if ( busy_cpus > 1 && + old_pm_idle != NULL && + ( atomic_cmpxchg(busy_cpus_ptr, busy_cpus, + busy_cpus-1) == busy_cpus ) ) { + local_irq_disable(); + sys_tune_set_state(cpu, 3); + /* This check might not be necessary, but it + * seems safest to include it because there + * might be a kernel version which requires + * it. */ + if (need_resched()) + local_irq_enable(); + else + old_pm_idle(); + /* This CPU is busy again. */ + sys_tune_set_state(cpu, 1); + atomic_add(1, busy_cpus_ptr); + return; + } + + cpu_relax(); + } + sys_tune_set_state(cpu, 0); +} + + +void sys_tune_work_func(struct work_struct *work) +{ + /* Do nothing. Since this function is running in process + * context, the idle thread isn't running on this CPU. */ +} + + +#ifdef CONFIG_SMP +static void sys_tune_smp_call(void *info) +{ + schedule_work(&get_cpu_var(sys_tune_cpu_work)); + put_cpu_var(sys_tune_cpu_work); +} +#endif + + +#ifdef CONFIG_SMP +static void sys_tune_refresh(void) +{ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) + on_each_cpu(&sys_tune_smp_call, NULL, 0, 1); +#else + on_each_cpu(&sys_tune_smp_call, NULL, 1); +#endif +} +#else +static void sys_tune_refresh(void) +{ + /* The current thread is executing on the one and only CPU so + * the idle thread isn't running. */ +} +#endif + + + +static int sys_tune_cpu_group(int cpu) +{ +#ifdef CONFIG_SMP + const cpumask_t *mask; + int other_cpu; + int group; + +#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP) + /* Keep one hyperthread busy per core. */ + mask = topology_thread_cpumask(cpu); +#else + return cpu; +#endif + for_each_cpu_mask(cpu, *(mask)) { + group = per_cpu(idle_cpu_group, other_cpu); + if (group != NR_CPUS) + return group; + } +#endif + + return cpu; +} + + +static void sys_tune_add_cpu(int cpu) +{ + int group; + + /* Do nothing if this CPU has already been added. */ + if (per_cpu(idle_cpu_group, cpu) != NR_CPUS) + return; + + group = sys_tune_cpu_group(cpu); + per_cpu(idle_cpu_group, cpu) = group; + atomic_inc(&(busy_cpu_count[group])); + +} + +static void sys_tune_del_cpu(int cpu) +{ + + int group; + + if (per_cpu(idle_cpu_group, cpu) == NR_CPUS) + return; + + group = per_cpu(idle_cpu_group, cpu); + /* If the CPU was busy, this can cause the count to drop to + * zero. To rectify this, we need to cause one of the other + * CPUs in the group to exit the idle loop. If the CPU was + * not busy then this causes the contribution for this CPU to + * go to -1 which can cause the overall count to drop to zero + * or go negative. To rectify this situation we need to cause + * this CPU to exit the idle loop. */ + atomic_dec(&(busy_cpu_count[group])); + per_cpu(idle_cpu_group, cpu) = NR_CPUS; + +} + + +static int sys_tune_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + + switch(action) { +#ifdef CPU_ONLINE_FROZEN + case CPU_ONLINE_FROZEN: +#endif + case CPU_ONLINE: + mutex_lock(&sys_tune_startup_mutex); + sys_tune_add_cpu(cpu); + mutex_unlock(&sys_tune_startup_mutex); + /* The CPU might have already entered the idle loop in + * the wrong group. Make sure it exits the idle loop + * so that it picks up the correct group. */ + sys_tune_refresh(); + break; + +#ifdef CPU_DEAD_FROZEN + case CPU_DEAD_FROZEN: +#endif + case CPU_DEAD: + mutex_lock(&sys_tune_startup_mutex); + sys_tune_del_cpu(cpu); + mutex_unlock(&sys_tune_startup_mutex); + /* The deleted CPU may have been the only busy CPU in + * the group. Make sure one of the other CPUs in the + * group exits the idle loop. */ + sys_tune_refresh(); + break; + } + return NOTIFY_OK; +} + + +static struct notifier_block sys_tune_cpu_nb = { + .notifier_call = sys_tune_cpu_notify, +}; + + +static void sys_tune_ensure_init(void) +{ + BUG_ON (old_pm_idle != NULL); + + /* Atomically update pm_idle to &sys_tune_pm_idle. The old value + * is stored in old_pm_idle before installing the new + * handler. */ + do { + old_pm_idle = pm_idle; + } while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) != + old_pm_idle); +} +#endif + +void sys_tune_fini(void) +{ +#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) + void (*old)(void); + int cpu; + + unregister_cpu_notifier(&sys_tune_cpu_nb); + + mutex_lock(&sys_tune_startup_mutex); + + + old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle); + + for_each_online_cpu(cpu) + sys_tune_del_cpu(cpu); + + mutex_unlock(&sys_tune_startup_mutex); + + /* Our handler may still be executing on other CPUs. + * Schedule this thread on all CPUs to make sure all + * idle threads get interrupted. */ + sys_tune_refresh(); + + /* Make sure the work item has finished executing on all CPUs. + * This in turn ensures that all idle threads have been + * interrupted. */ + flush_scheduled_work(); +#endif /* CONFIG_X86 */ +} + +void sys_tune_init(void) +{ +#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) + int cpu; + + for_each_possible_cpu(cpu) { + INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu), + sys_tune_work_func); + } + + /* Start by registering the handler to ensure we don't miss + * any updates. */ + register_cpu_notifier(&sys_tune_cpu_nb); + + mutex_lock(&sys_tune_startup_mutex); + + for_each_online_cpu(cpu) + sys_tune_add_cpu(cpu); + + sys_tune_ensure_init(); + + + mutex_unlock(&sys_tune_startup_mutex); + + /* Ensure our idle handler starts to run. */ + sys_tune_refresh(); +#endif +} + diff --git a/sys/ofed/include/asm/atomic.h b/sys/ofed/include/asm/atomic.h index 5c5caa07d3e6..46e03705dae0 100644 --- a/sys/ofed/include/asm/atomic.h +++ b/sys/ofed/include/asm/atomic.h @@ -45,6 +45,7 @@ typedef struct { #define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) #define atomic_inc_and_test(v) (atomic_add_return(1, (v)) == 0) +#define atomic_dec_return(v) atomic_sub_return(1, (v)) static inline int atomic_add_return(int i, atomic_t *v) @@ -82,4 +83,25 @@ atomic_dec(atomic_t *v) return atomic_fetchadd_int(&v->counter, -1) - 1; } +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + // old = atomic_cmpxchg((v), c, c + (a)); /*Linux*/ + old = atomic_cmpset_int(&v->counter, c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + + + + #endif /* _ASM_ATOMIC_H_ */ diff --git a/sys/ofed/include/asm/byteorder.h b/sys/ofed/include/asm/byteorder.h index 341c548f71dd..b59e973bc5ea 100644 --- a/sys/ofed/include/asm/byteorder.h +++ b/sys/ofed/include/asm/byteorder.h @@ -30,6 +30,7 @@ #include #include +#include #if BYTE_ORDER == LITTLE_ENDIAN #define __LITTLE_ENDIAN diff --git a/sys/ofed/include/linux/atomic.h b/sys/ofed/include/linux/atomic.h new file mode 100644 index 000000000000..0d689c1d4b72 --- /dev/null +++ b/sys/ofed/include/linux/atomic.h @@ -0,0 +1,53 @@ +#ifndef _COMPAT_LINUX_ATOMIC_H +#define _COMPAT_LINUX_ATOMIC_H 1 + +/* +#include + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,36)) +#include_next +#else +*/ + +#include + +/* Shahar Klein: atomic_inc_not_zero_hint do we need it? */ +#if 0 + +/** + * atomic_inc_not_zero_hint - increment if not null + * @v: pointer of type atomic_t + * @hint: probable value of the atomic before the increment + * + * This version of atomic_inc_not_zero() gives a hint of probable + * value of the atomic. This helps processor to not read the memory + * before doing the atomic read/modify/write cycle, lowering + * number of bus transactions on some arches. + * + * Returns: 0 if increment was not done, 1 otherwise. + */ + +#ifndef atomic_inc_not_zero_hint +static inline int atomic_inc_not_zero_hint(atomic_t *v, int hint) +{ + int val, c = hint; + + /* sanity test, should be removed by compiler if hint is a constant */ + if (!hint) + return atomic_inc_not_zero(v); + + do { + val = atomic_cmpxchg(v, c, c + 1); + if (val == c) + return 1; + c = val; + } while (c); + + return 0; +} +#endif +#endif + +//#endif /* (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,36)) */ + +#endif /* _COMPAT_LINUX_ATOMIC_H */ diff --git a/sys/ofed/include/linux/bitops.h b/sys/ofed/include/linux/bitops.h index 658c32e46743..4ada7089b989 100644 --- a/sys/ofed/include/linux/bitops.h +++ b/sys/ofed/include/linux/bitops.h @@ -35,6 +35,7 @@ #endif #define BIT_MASK(n) (~0UL >> (BITS_PER_LONG - (n))) #define BITS_TO_LONGS(n) howmany((n), BITS_PER_LONG) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) static inline int __ffs(int mask) @@ -63,6 +64,16 @@ __flsl(long mask) #define ffz(mask) __ffs(~(mask)) +static inline int get_count_order(unsigned int count) +{ + int order; + + order = fls(count) - 1; + if (count & (count - 1)) + order++; + return order; +} + static inline unsigned long find_first_bit(unsigned long *addr, unsigned long size) { @@ -314,4 +325,159 @@ test_and_set_bit(long bit, long *var) return !!(val & bit); } + +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) +#define BITMAP_LAST_WORD_MASK(nbits) \ +( \ + ((nbits) % BITS_PER_LONG) ? \ + (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ +) + + +static inline void +bitmap_set(unsigned long *map, int start, int nr) +{ + unsigned long *p = map + BIT_WORD(start); + const int size = start + nr; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (nr - bits_to_set >= 0) { + *p |= mask_to_set; + nr -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (nr) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +static inline void +bitmap_clear(unsigned long *map, int start, int nr) +{ + unsigned long *p = map + BIT_WORD(start); + const int size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (nr - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} + +enum { + REG_OP_ISFREE, /* true if region is all zero bits */ + REG_OP_ALLOC, /* set all bits in region */ + REG_OP_RELEASE, /* clear all bits in region */ +}; + +static int __reg_op(unsigned long *bitmap, int pos, int order, int reg_op) +{ + int nbits_reg; /* number of bits in region */ + int index; /* index first long of region in bitmap */ + int offset; /* bit offset region in bitmap[index] */ + int nlongs_reg; /* num longs spanned by region in bitmap */ + int nbitsinlong; /* num bits of region in each spanned long */ + unsigned long mask; /* bitmask for one long of region */ + int i; /* scans bitmap by longs */ + int ret = 0; /* return value */ + + /* + * Either nlongs_reg == 1 (for small orders that fit in one long) + * or (offset == 0 && mask == ~0UL) (for larger multiword orders.) + */ + nbits_reg = 1 << order; + index = pos / BITS_PER_LONG; + offset = pos - (index * BITS_PER_LONG); + nlongs_reg = BITS_TO_LONGS(nbits_reg); + nbitsinlong = min(nbits_reg, BITS_PER_LONG); + + /* + * Can't do "mask = (1UL << nbitsinlong) - 1", as that + * overflows if nbitsinlong == BITS_PER_LONG. + */ + mask = (1UL << (nbitsinlong - 1)); + mask += mask - 1; + mask <<= offset; + + switch (reg_op) { + case REG_OP_ISFREE: + for (i = 0; i < nlongs_reg; i++) { + if (bitmap[index + i] & mask) + goto done; + } + ret = 1; /* all bits in region free (zero) */ + break; + + case REG_OP_ALLOC: + for (i = 0; i < nlongs_reg; i++) + bitmap[index + i] |= mask; + break; + + case REG_OP_RELEASE: + for (i = 0; i < nlongs_reg; i++) + bitmap[index + i] &= ~mask; + break; + } +done: + return ret; +} + +/** + * bitmap_find_free_region - find a contiguous aligned mem region + * @bitmap: array of unsigned longs corresponding to the bitmap + * @bits: number of bits in the bitmap + * @order: region size (log base 2 of number of bits) to find + * + * Find a region of free (zero) bits in a @bitmap of @bits bits and + * allocate them (set them to one). Only consider regions of length + * a power (@order) of two, aligned to that power of two, which + * makes the search algorithm much faster. + * + * Return the bit offset in bitmap of the allocated region, + * or -errno on failure. + */ +static inline int +bitmap_find_free_region(unsigned long *bitmap, int bits, int order) +{ + int pos, end; /* scans bitmap by regions of size order */ + + for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) { + if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) + continue; + __reg_op(bitmap, pos, order, REG_OP_ALLOC); + return pos; + } + return -ENOMEM; +} + +/** + * bitmap_release_region - release allocated bitmap region + * @bitmap: array of unsigned longs corresponding to the bitmap + * @pos: beginning of bit region to release + * @order: region size (log base 2 of number of bits) to release + * + * This is the complement to __bitmap_find_free_region() and releases + * the found region (by clearing it in the bitmap). + * + * No return value. + */ +static inline void +bitmap_release_region(unsigned long *bitmap, int pos, int order) +{ + __reg_op(bitmap, pos, order, REG_OP_RELEASE); +} + + #endif /* _LINUX_BITOPS_H_ */ diff --git a/sys/ofed/include/linux/clocksource.h b/sys/ofed/include/linux/clocksource.h new file mode 100644 index 000000000000..e74cc62e78ec --- /dev/null +++ b/sys/ofed/include/linux/clocksource.h @@ -0,0 +1,17 @@ +/* linux/include/linux/clocksource.h + * + * MLX4_CORE_PORT + * + * This file contains the structure definitions for clocksources. + * + * If you are not a clocksource, or timekeeping code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKSOURCE_H +#define _LINUX_CLOCKSOURCE_H + +/* clocksource cycle base type */ +typedef u64 cycle_t; + + +#endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/sys/ofed/include/linux/compat.h b/sys/ofed/include/linux/compat.h index cfb167112f8e..7af826c717bd 100644 --- a/sys/ofed/include/linux/compat.h +++ b/sys/ofed/include/linux/compat.h @@ -29,5 +29,8 @@ #ifndef _LINUX_COMPAT_H_ #define _LINUX_COMPAT_H_ +#define is_multicast_ether_addr(x) 0 +#define is_broadcast_ether_addr(x) 0 + #endif /* _LINUX_COMPAT_H_ */ diff --git a/sys/ofed/include/linux/device.h b/sys/ofed/include/linux/device.h index cce46ca4b2c9..37a772065baa 100644 --- a/sys/ofed/include/linux/device.h +++ b/sys/ofed/include/linux/device.h @@ -385,4 +385,10 @@ class_remove_file(struct class *class, const struct class_attribute *attr) sysfs_remove_file(&class->kobj, &attr->attr); } +static inline int dev_to_node(struct device *dev) +{ + return -1; +} + + #endif /* _LINUX_DEVICE_H_ */ diff --git a/sys/ofed/include/linux/dma-mapping.h b/sys/ofed/include/linux/dma-mapping.h index 0f0ad9d279a4..065745cbfa80 100644 --- a/sys/ofed/include/linux/dma-mapping.h +++ b/sys/ofed/include/linux/dma-mapping.h @@ -245,6 +245,13 @@ dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return (0); } +static inline unsigned int dma_set_max_seg_size(struct device *dev, + unsigned int size) +{ + return (0); +} + + #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) diff --git a/sys/ofed/include/linux/gfp.h b/sys/ofed/include/linux/gfp.h index 8d2b228c6a96..e88df78d68c5 100644 --- a/sys/ofed/include/linux/gfp.h +++ b/sys/ofed/include/linux/gfp.h @@ -121,4 +121,8 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) return (virt_to_page(page)); } +#define alloc_pages_node(node, mask, order) alloc_pages(mask, order) + +#define kmalloc_node(chunk, mask, node) kmalloc(chunk, mask) + #endif /* _LINUX_GFP_H_ */ diff --git a/sys/ofed/include/linux/idr.h b/sys/ofed/include/linux/idr.h index 40b25b663eb7..b778e64ccfc9 100644 --- a/sys/ofed/include/linux/idr.h +++ b/sys/ofed/include/linux/idr.h @@ -40,6 +40,10 @@ #define MAX_ID_MASK (MAX_ID_BIT - 1) #define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS +#define MAX_IDR_SHIFT (sizeof(int)*8 - 1) +#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT) +#define MAX_IDR_MASK (MAX_IDR_BIT - 1) + struct idr_layer { unsigned long bitmap; struct idr_layer *ary[IDR_SIZE]; diff --git a/sys/ofed/include/linux/if_ether.h b/sys/ofed/include/linux/if_ether.h index 960865781dd4..f10df2edcb23 100644 --- a/sys/ofed/include/linux/if_ether.h +++ b/sys/ofed/include/linux/if_ether.h @@ -34,4 +34,9 @@ #define ETH_P_8021Q ETHERTYPE_VLAN +/* + * defined Ethernet Protocol ID's. + */ +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ + #endif /* _LINUX_IF_ETHER_H_ */ diff --git a/sys/ofed/include/linux/in6.h b/sys/ofed/include/linux/in6.h index 925c7ed3a386..2032b6179594 100644 --- a/sys/ofed/include/linux/in6.h +++ b/sys/ofed/include/linux/in6.h @@ -29,6 +29,8 @@ #ifndef _LINUX_IN6_H_ #define _LINUX_IN6_H_ +#ifndef KLD_MODULE #include "opt_inet6.h" +#endif #endif /* _LINUX_IN6_H_ */ diff --git a/sys/ofed/include/linux/kernel.h b/sys/ofed/include/linux/kernel.h index f49036e72414..55b71f61fe0d 100644 --- a/sys/ofed/include/linux/kernel.h +++ b/sys/ofed/include/linux/kernel.h @@ -47,6 +47,7 @@ #include #include +#define KERN_CONT "" #define KERN_EMERG "<0>" #define KERN_ALERT "<1>" #define KERN_CRIT "<2>" @@ -68,6 +69,60 @@ #define pr_debug(fmt, ...) printk(KERN_DEBUG # fmt, ##__VA_ARGS__) #define udelay(t) DELAY(t) +#ifndef pr_fmt +#define pr_fmt(fmt) fmt +#endif + +/* + * Print a one-time message (analogous to WARN_ONCE() et al): + */ +#define printk_once(x...) ({ \ + static bool __print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + printk(x); \ + } \ +}) + + + +#define pr_emerg(fmt, ...) \ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) \ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) \ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) \ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning(fmt, ...) \ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn pr_warning +#define pr_notice(fmt, ...) \ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +#define pr_cont(fmt, ...) \ + printk(KERN_CONT fmt, ##__VA_ARGS__) + +/* pr_devel() should produce zero code unless DEBUG is defined */ +#ifdef DEBUG +#define pr_devel(fmt, ...) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#else +#define pr_devel(fmt, ...) \ + ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) +#endif + +#ifndef WARN +#define WARN(condition, format...) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + pr_warning(format); \ + unlikely(__ret_warn_on); \ +}) +#endif + #define container_of(ptr, type, member) \ ({ \ __typeof(((type *)0)->member) *_p = (ptr); \ @@ -77,12 +132,27 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define simple_strtoul strtoul +#define simple_strtol strtol #define min(x, y) (x < y ? x : y) #define max(x, y) (x > y ? x : y) #define min_t(type, _x, _y) (type)(_x) < (type)(_y) ? (type)(_x) : (_y) #define max_t(type, _x, _y) (type)(_x) > (type)(_y) ? (type)(_x) : (_y) +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) + #define num_possible_cpus() mp_ncpus +typedef struct pm_message { + int event; +} pm_message_t; + #endif /* _LINUX_KERNEL_H_ */ diff --git a/sys/ofed/include/linux/linux_compat.c b/sys/ofed/include/linux/linux_compat.c index 4dbdad92665d..80d1e1e8cf1f 100644 --- a/sys/ofed/include/linux/linux_compat.c +++ b/sys/ofed/include/linux/linux_compat.c @@ -267,6 +267,8 @@ linux_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td) if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; + devfs_clear_cdevpriv(); + return (0); } diff --git a/sys/ofed/include/linux/list.h b/sys/ofed/include/linux/list.h index 61b42d242a43..4b5454a517b1 100644 --- a/sys/ofed/include/linux/list.h +++ b/sys/ofed/include/linux/list.h @@ -111,6 +111,9 @@ list_del_init(struct list_head *entry) #define list_entry(ptr, type, field) container_of(ptr, type, field) +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + #define list_for_each(p, head) \ for (p = (head)->next; p != (head); p = p->next) diff --git a/sys/ofed/include/linux/log2.h b/sys/ofed/include/linux/log2.h index 0a8315a40b1c..8c2a05b64b96 100644 --- a/sys/ofed/include/linux/log2.h +++ b/sys/ofed/include/linux/log2.h @@ -51,10 +51,119 @@ rounddown_pow_of_two(unsigned long x) return (1UL << (flsl(x) - 1)); } -static inline unsigned long -ilog2(unsigned long x) + +/* + * deal with unrepresentable constant logarithms + */ +extern __attribute__((const, noreturn)) +int ____ilog2_NaN(void); + +/* + * non-constant log of base 2 calculators + * - the arch may override these in asm/bitops.h if they can be implemented + * more efficiently than using fls() and fls64() + * - the arch is not required to handle n==0 if implementing the fallback + */ +#ifndef CONFIG_ARCH_HAS_ILOG2_U32 +static inline __attribute__((const)) +int __ilog2_u32(u32 n) { - return (flsl(x) - 1); + return flsl(n) - 1; } +#endif + +#ifndef CONFIG_ARCH_HAS_ILOG2_U64 +static inline __attribute__((const)) +int __ilog2_u64(u64 n) +{ + return flsl(n) - 1; +} +#endif + + +/** + * ilog2 - log of base 2 of 32-bit or a 64-bit unsigned value + * @n - parameter + * + * constant-capable log of base 2 calculation + * - this can be used to initialise global variables from constant data, hence + * the massive ternary operator construction + * + * selects the appropriately-sized optimised version depending on sizeof(n) + */ +#define ilog2(n) \ +( \ + __builtin_constant_p(n) ? ( \ + (n) < 1 ? ____ilog2_NaN() : \ + (n) & (1ULL << 63) ? 63 : \ + (n) & (1ULL << 62) ? 62 : \ + (n) & (1ULL << 61) ? 61 : \ + (n) & (1ULL << 60) ? 60 : \ + (n) & (1ULL << 59) ? 59 : \ + (n) & (1ULL << 58) ? 58 : \ + (n) & (1ULL << 57) ? 57 : \ + (n) & (1ULL << 56) ? 56 : \ + (n) & (1ULL << 55) ? 55 : \ + (n) & (1ULL << 54) ? 54 : \ + (n) & (1ULL << 53) ? 53 : \ + (n) & (1ULL << 52) ? 52 : \ + (n) & (1ULL << 51) ? 51 : \ + (n) & (1ULL << 50) ? 50 : \ + (n) & (1ULL << 49) ? 49 : \ + (n) & (1ULL << 48) ? 48 : \ + (n) & (1ULL << 47) ? 47 : \ + (n) & (1ULL << 46) ? 46 : \ + (n) & (1ULL << 45) ? 45 : \ + (n) & (1ULL << 44) ? 44 : \ + (n) & (1ULL << 43) ? 43 : \ + (n) & (1ULL << 42) ? 42 : \ + (n) & (1ULL << 41) ? 41 : \ + (n) & (1ULL << 40) ? 40 : \ + (n) & (1ULL << 39) ? 39 : \ + (n) & (1ULL << 38) ? 38 : \ + (n) & (1ULL << 37) ? 37 : \ + (n) & (1ULL << 36) ? 36 : \ + (n) & (1ULL << 35) ? 35 : \ + (n) & (1ULL << 34) ? 34 : \ + (n) & (1ULL << 33) ? 33 : \ + (n) & (1ULL << 32) ? 32 : \ + (n) & (1ULL << 31) ? 31 : \ + (n) & (1ULL << 30) ? 30 : \ + (n) & (1ULL << 29) ? 29 : \ + (n) & (1ULL << 28) ? 28 : \ + (n) & (1ULL << 27) ? 27 : \ + (n) & (1ULL << 26) ? 26 : \ + (n) & (1ULL << 25) ? 25 : \ + (n) & (1ULL << 24) ? 24 : \ + (n) & (1ULL << 23) ? 23 : \ + (n) & (1ULL << 22) ? 22 : \ + (n) & (1ULL << 21) ? 21 : \ + (n) & (1ULL << 20) ? 20 : \ + (n) & (1ULL << 19) ? 19 : \ + (n) & (1ULL << 18) ? 18 : \ + (n) & (1ULL << 17) ? 17 : \ + (n) & (1ULL << 16) ? 16 : \ + (n) & (1ULL << 15) ? 15 : \ + (n) & (1ULL << 14) ? 14 : \ + (n) & (1ULL << 13) ? 13 : \ + (n) & (1ULL << 12) ? 12 : \ + (n) & (1ULL << 11) ? 11 : \ + (n) & (1ULL << 10) ? 10 : \ + (n) & (1ULL << 9) ? 9 : \ + (n) & (1ULL << 8) ? 8 : \ + (n) & (1ULL << 7) ? 7 : \ + (n) & (1ULL << 6) ? 6 : \ + (n) & (1ULL << 5) ? 5 : \ + (n) & (1ULL << 4) ? 4 : \ + (n) & (1ULL << 3) ? 3 : \ + (n) & (1ULL << 2) ? 2 : \ + (n) & (1ULL << 1) ? 1 : \ + (n) & (1ULL << 0) ? 0 : \ + ____ilog2_NaN() \ + ) : \ + (sizeof(n) <= 4) ? \ + __ilog2_u32(n) : \ + __ilog2_u64(n) \ + ) #endif /* _LINUX_LOG2_H_ */ diff --git a/sys/ofed/include/linux/mlx4/cmd.h b/sys/ofed/include/linux/mlx4/cmd.h index 60d3036ef78c..d83ee3a53453 100644 --- a/sys/ofed/include/linux/mlx4/cmd.h +++ b/sys/ofed/include/linux/mlx4/cmd.h @@ -59,12 +59,16 @@ enum { MLX4_CMD_HW_HEALTH_CHECK = 0x50, MLX4_CMD_SET_PORT = 0xc, MLX4_CMD_SET_NODE = 0x5a, + MLX4_CMD_QUERY_FUNC = 0x56, MLX4_CMD_ACCESS_DDR = 0x2e, MLX4_CMD_MAP_ICM = 0xffa, MLX4_CMD_UNMAP_ICM = 0xff9, MLX4_CMD_MAP_ICM_AUX = 0xffc, MLX4_CMD_UNMAP_ICM_AUX = 0xffb, MLX4_CMD_SET_ICM_SIZE = 0xffd, + /*master notify fw on finish for slave's flr*/ + MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_GET_OP_REQ = 0x59, /* TPT commands */ MLX4_CMD_SW2HW_MPT = 0xd, @@ -119,6 +123,26 @@ enum { /* miscellaneous commands */ MLX4_CMD_DIAG_RPRT = 0x30, MLX4_CMD_NOP = 0x31, + MLX4_CMD_ACCESS_MEM = 0x2e, + MLX4_CMD_SET_VEP = 0x52, + + /* Ethernet specific commands */ + MLX4_CMD_SET_VLAN_FLTR = 0x47, + MLX4_CMD_SET_MCAST_FLTR = 0x48, + MLX4_CMD_DUMP_ETH_STATS = 0x49, + + /* Communication channel commands */ + MLX4_CMD_ARM_COMM_CHANNEL = 0x57, + MLX4_CMD_GEN_EQE = 0x58, + + /* virtual commands */ + MLX4_CMD_ALLOC_RES = 0xf00, + MLX4_CMD_FREE_RES = 0xf01, + MLX4_CMD_MCAST_ATTACH = 0xf05, + MLX4_CMD_UCAST_ATTACH = 0xf06, + MLX4_CMD_PROMISC = 0xf08, + MLX4_CMD_QUERY_FUNC_CAP = 0xf0a, + MLX4_CMD_QP_ATTACH = 0xf0b, /* debug commands */ MLX4_CMD_QUERY_DEBUG_MSG = 0x2a, @@ -127,16 +151,26 @@ enum { /* statistics commands */ MLX4_CMD_QUERY_IF_STAT = 0X54, MLX4_CMD_SET_IF_STAT = 0X55, + + /* set port opcode modifiers */ + MLX4_SET_PORT_PRIO2TC = 0x8, + MLX4_SET_PORT_SCHEDULER = 0x9, + + /* register/delete flow steering network rules */ + MLX4_QP_FLOW_STEERING_ATTACH = 0x65, + MLX4_QP_FLOW_STEERING_DETACH = 0x66, + MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64, }; enum { - MLX4_CMD_TIME_CLASS_A = 10000, - MLX4_CMD_TIME_CLASS_B = 10000, - MLX4_CMD_TIME_CLASS_C = 10000, + MLX4_CMD_TIME_CLASS_A = 60000, + MLX4_CMD_TIME_CLASS_B = 60000, + MLX4_CMD_TIME_CLASS_C = 60000, }; enum { - MLX4_MAILBOX_SIZE = 4096 + MLX4_MAILBOX_SIZE = 4096, + MLX4_ACCESS_MEM_ALIGN = 256, }; enum { @@ -149,6 +183,11 @@ enum { MLX4_SET_PORT_GID_TABLE = 0x5, }; +enum { + MLX4_CMD_WRAPPED, + MLX4_CMD_NATIVE +}; + struct mlx4_dev; struct mlx4_cmd_mailbox { @@ -158,23 +197,24 @@ struct mlx4_cmd_mailbox { int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, - u16 op, unsigned long timeout); + u16 op, unsigned long timeout, int native); /* Invoke a command with no output parameter */ static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier, - u8 op_modifier, u16 op, unsigned long timeout) + u8 op_modifier, u16 op, unsigned long timeout, + int native) { return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier, - op_modifier, op, timeout); + op_modifier, op, timeout, native); } /* Invoke a command with an output mailbox */ static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, - unsigned long timeout) + unsigned long timeout, int native) { return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier, - op_modifier, op, timeout); + op_modifier, op, timeout, native); } /* @@ -184,13 +224,21 @@ static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param */ static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param, u32 in_modifier, u8 op_modifier, u16 op, - unsigned long timeout) + unsigned long timeout, int native) { return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier, - op_modifier, op, timeout); + op_modifier, op, timeout, native); } struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); +u32 mlx4_comm_get_version(void); +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); +int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); + + +#define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) + #endif /* MLX4_CMD_H */ diff --git a/sys/ofed/include/linux/mlx4/cq.h b/sys/ofed/include/linux/mlx4/cq.h index 6f65b2c8bb89..08216692452e 100644 --- a/sys/ofed/include/linux/mlx4/cq.h +++ b/sys/ofed/include/linux/mlx4/cq.h @@ -64,6 +64,22 @@ struct mlx4_err_cqe { u8 owner_sr_opcode; }; +struct mlx4_ts_cqe { + __be32 vlan_my_qpn; + __be32 immed_rss_invalid; + __be32 g_mlpath_rqpn; + __be32 timestamp_hi; + __be16 status; + u8 ipv6_ext_mask; + u8 badfcs_enc; + __be32 byte_cnt; + __be16 wqe_index; + __be16 checksum; + u8 reserved; + __be16 timestamp_lo; + u8 owner_sr_opcode; +} __packed; + enum { MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, MLX4_CQE_QPN_MASK = 0xffffff, @@ -146,5 +162,5 @@ int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, u16 count, u16 period); int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, int entries, struct mlx4_mtt *mtt); - +int mlx4_cq_ignore_overrun(struct mlx4_dev *dev, struct mlx4_cq *cq); #endif /* MLX4_CQ_H */ diff --git a/sys/ofed/include/linux/mlx4/device.h b/sys/ofed/include/linux/mlx4/device.h index 167c7e5414b0..2828ef073120 100644 --- a/sys/ofed/include/linux/mlx4/device.h +++ b/sys/ofed/include/linux/mlx4/device.h @@ -36,46 +36,175 @@ #include #include #include +//#include /* XXX SK Probably not needed in freeBSD XXX */ #include -#include +#include /* XXX SK ported to freeBSD */ + +#define MAX_MSIX_P_PORT 17 +#define MAX_MSIX 64 +#define MSIX_LEGACY_SZ 4 +#define MIN_MSIX_P_PORT 5 + +#define MLX4_ROCE_MAX_GIDS 128 +#define MLX4_ROCE_PF_GIDS 16 + +#define MLX4_NUM_UP 8 +#define MLX4_NUM_TC 8 +#define MLX4_MAX_100M_UNITS_VAL 255 /* + * work around: can't set values + * greater then this value when + * using 100 Mbps units. + */ +#define MLX4_RATELIMIT_100M_UNITS 3 /* 100 Mbps */ +#define MLX4_RATELIMIT_1G_UNITS 4 /* 1 Gbps */ +#define MLX4_RATELIMIT_DEFAULT 0x00ff + + + +#define MLX4_LEAST_ATTACHED_VECTOR 0xffffffff enum { MLX4_FLAG_MSI_X = 1 << 0, MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, + MLX4_FLAG_MASTER = 1 << 2, + MLX4_FLAG_SLAVE = 1 << 3, + MLX4_FLAG_SRIOV = 1 << 4, }; enum { - MLX4_MAX_PORTS = 2 + MLX4_PORT_CAP_IS_SM = 1 << 1, + MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19, }; +enum { + MLX4_MAX_PORTS = 2, + MLX4_MAX_PORT_PKEYS = 128 +}; + +/* base qkey for use in sriov tunnel-qp/proxy-qp communication. + * These qkeys must not be allowed for general use. This is a 64k range, + * and to test for violation, we use the mask (protect against future chg). + */ +#define MLX4_RESERVED_QKEY_BASE (0xFFFF0000) +#define MLX4_RESERVED_QKEY_MASK (0xFFFF0000) + enum { MLX4_BOARD_ID_LEN = 64 }; enum { - MLX4_DEV_CAP_FLAG_RC = 1 << 0, - MLX4_DEV_CAP_FLAG_UC = 1 << 1, - MLX4_DEV_CAP_FLAG_UD = 1 << 2, - MLX4_DEV_CAP_FLAG_XRC = 1 << 3, - MLX4_DEV_CAP_FLAG_SRQ = 1 << 6, - MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1 << 7, - MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 << 8, - MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 << 9, - MLX4_DEV_CAP_FLAG_DPDP = 1 << 12, - MLX4_DEV_CAP_FLAG_RAW_ETY = 1 << 13, - MLX4_DEV_CAP_FLAG_BLH = 1 << 15, - MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1 << 16, - MLX4_DEV_CAP_FLAG_APM = 1 << 17, - MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18, - MLX4_DEV_CAP_FLAG_RAW_MCAST = 1 << 19, - MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1 << 20, - MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21, - MLX4_DEV_CAP_FLAG_IBOE = 1 << 30, - MLX4_DEV_CAP_FLAG_FC_T11 = 1 << 31 + MLX4_MAX_NUM_PF = 16, + MLX4_MAX_NUM_VF = 64, + MLX4_MFUNC_MAX = 80, + MLX4_MAX_EQ_NUM = 1024, + MLX4_MFUNC_EQ_NUM = 4, + MLX4_MFUNC_MAX_EQES = 8, + MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) }; +/* Driver supports 3 diffrent device methods to manage traffic steering: + * -device managed - High level API for ib and eth flow steering. FW is + * managing flow steering tables. + * - B0 steering mode - Common low level API for ib and (if supported) eth. + * - A0 steering mode - Limited low level API for eth. In case of IB, + * B0 mode is in use. + */ +enum { + MLX4_STEERING_MODE_A0, + MLX4_STEERING_MODE_B0, + MLX4_STEERING_MODE_DEVICE_MANAGED +}; + +static inline const char *mlx4_steering_mode_str(int steering_mode) +{ + switch (steering_mode) { + case MLX4_STEERING_MODE_A0: + return "A0 steering"; + + case MLX4_STEERING_MODE_B0: + return "B0 steering"; + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return "Device managed flow steering"; + + default: + return "Unrecognize steering mode"; + } +} + +enum { + MLX4_DEV_CAP_FLAG_RC = 1LL << 0, + MLX4_DEV_CAP_FLAG_UC = 1LL << 1, + MLX4_DEV_CAP_FLAG_UD = 1LL << 2, + MLX4_DEV_CAP_FLAG_XRC = 1LL << 3, + MLX4_DEV_CAP_FLAG_SRQ = 1LL << 6, + MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1LL << 7, + MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, + MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, + MLX4_DEV_CAP_FLAG_DPDP = 1LL << 12, + MLX4_DEV_CAP_FLAG_BLH = 1LL << 15, + MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1LL << 16, + MLX4_DEV_CAP_FLAG_APM = 1LL << 17, + MLX4_DEV_CAP_FLAG_ATOMIC = 1LL << 18, + MLX4_DEV_CAP_FLAG_RAW_MCAST = 1LL << 19, + MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1LL << 20, + MLX4_DEV_CAP_FLAG_UD_MCAST = 1LL << 21, + MLX4_DEV_CAP_FLAG_IBOE = 1LL << 30, + MLX4_DEV_CAP_FLAG_UC_LOOPBACK = 1LL << 32, + MLX4_DEV_CAP_FLAG_FCS_KEEP = 1LL << 34, + MLX4_DEV_CAP_FLAG_WOL_PORT1 = 1LL << 37, + MLX4_DEV_CAP_FLAG_WOL_PORT2 = 1LL << 38, + MLX4_DEV_CAP_FLAG_UDP_RSS = 1LL << 40, + MLX4_DEV_CAP_FLAG_VEP_UC_STEER = 1LL << 41, + MLX4_DEV_CAP_FLAG_VEP_MC_STEER = 1LL << 42, + MLX4_DEV_CAP_FLAG_COUNTERS = 1LL << 48, + MLX4_DEV_CAP_FLAG_COUNTERS_EXT = 1LL << 49, + MLX4_DEV_CAP_FLAG_SET_PORT_ETH_SCHED = 1LL << 53, + MLX4_DEV_CAP_FLAG_SENSE_SUPPORT = 1LL << 55, + MLX4_DEV_CAP_FLAG_FAST_DROP = 1LL << 57, + MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59, + MLX4_DEV_CAP_FLAG_ESWITCH_SUPPORT = 1LL << 60, + MLX4_DEV_CAP_FLAG_64B_EQE = 1LL << 61, + MLX4_DEV_CAP_FLAG_64B_CQE = 1LL << 62 +}; + +enum { + MLX4_DEV_CAP_FLAG2_RSS = 1LL << 0, + MLX4_DEV_CAP_FLAG2_RSS_TOP = 1LL << 1, + MLX4_DEV_CAP_FLAG2_RSS_XOR = 1LL << 2, + MLX4_DEV_CAP_FLAG2_FS_EN = 1LL << 3 +}; + +enum { + MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1 +}; + +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 +}; + +enum { + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0 +}; + +/* bit enums for an 8-bit flags field indicating special use + * QPs which require special handling in qp_reserve_range. + * Currently, this only includes QPs used by the ETH interface, + * where we expect to use blueflame. These QPs must not have + * bits 6 and 7 set in their qp number. + * + * This enum may use only bits 0..7. + */ +enum { + MLX4_RESERVE_BF_QP = 1 << 7, +}; + + +#define MLX4_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) + enum { MLX4_BMME_FLAG_LOCAL_INV = 1 << 6, MLX4_BMME_FLAG_REMOTE_INV = 1 << 7, @@ -102,7 +231,14 @@ enum mlx4_event { MLX4_EVENT_TYPE_PORT_CHANGE = 0x09, MLX4_EVENT_TYPE_EQ_OVERFLOW = 0x0f, MLX4_EVENT_TYPE_ECC_DETECT = 0x0e, - MLX4_EVENT_TYPE_CMD = 0x0a + MLX4_EVENT_TYPE_CMD = 0x0a, + MLX4_EVENT_TYPE_VEP_UPDATE = 0x19, + MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18, + MLX4_EVENT_TYPE_OP_REQUIRED = 0x1a, + MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, + MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, + MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, + MLX4_EVENT_TYPE_NONE = 0xff, }; enum { @@ -110,6 +246,29 @@ enum { MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 }; +enum { + MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, +}; + +enum slave_port_state { + SLAVE_PORT_DOWN = 0, + SLAVE_PENDING_UP, + SLAVE_PORT_UP, +}; + +enum slave_port_gen_event { + SLAVE_PORT_GEN_EVENT_DOWN = 0, + SLAVE_PORT_GEN_EVENT_UP, + SLAVE_PORT_GEN_EVENT_NONE, +}; + +enum slave_port_state_event { + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, +}; + enum { MLX4_PERM_LOCAL_READ = 1 << 10, MLX4_PERM_LOCAL_WRITE = 1 << 11, @@ -126,7 +285,6 @@ enum { MLX4_OPCODE_SEND = 0x0a, MLX4_OPCODE_SEND_IMM = 0x0b, MLX4_OPCODE_LSO = 0x0e, - MLX4_OPCODE_BIG_LSO = 0x2e, MLX4_OPCODE_RDMA_READ = 0x10, MLX4_OPCODE_ATOMIC_CS = 0x11, MLX4_OPCODE_ATOMIC_FA = 0x12, @@ -150,14 +308,26 @@ enum { MLX4_STAT_RATE_OFFSET = 5 }; +enum mlx4_protocol { + MLX4_PROT_IB_IPV6 = 0, + MLX4_PROT_ETH, + MLX4_PROT_IB_IPV4, + MLX4_PROT_FCOE +}; + enum { MLX4_MTT_FLAG_PRESENT = 1 }; +enum { + MLX4_MAX_MTT_SHIFT = 31 +}; + enum mlx4_qp_region { MLX4_QP_REGION_FW = 0, MLX4_QP_REGION_ETH_ADDR, MLX4_QP_REGION_FC_ADDR, + MLX4_QP_REGION_FC_EXCH, MLX4_NUM_QP_REGION }; @@ -173,25 +343,56 @@ enum mlx4_special_vlan_idx { MLX4_VLAN_MISS_IDX, MLX4_VLAN_REGULAR }; -#define MLX4_LEAST_ATTACHED_VECTOR 0xffffffff -enum { - MLX4_CUNTERS_DISABLED, - MLX4_CUNTERS_BASIC, - MLX4_CUNTERS_EXT +enum mlx4_steer_type { + MLX4_MC_STEER = 0, + MLX4_UC_STEER, + MLX4_NUM_STEERS }; enum { - MAX_FAST_REG_PAGES = 511, + MLX4_NUM_FEXCH = 64 * 1024, }; +enum { + MLX4_MAX_FAST_REG_PAGES = 511, +}; + +enum { + MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, + MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, + MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE = 0x16, +}; + +/* Port mgmt change event handling */ +enum { + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK = 1 << 0, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK = 1 << 1, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK = 1 << 2, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK = 1 << 3, + MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, +}; + +#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) + static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) { return (major << 32) | (minor << 16) | subminor; } +struct mlx4_phys_caps { + u32 gid_phys_table_len[MLX4_MAX_PORTS + 1]; + u32 pkey_phys_table_len[MLX4_MAX_PORTS + 1]; + u32 num_phys_eqs; + u32 base_sqpn; + u32 base_proxy_sqpn; + u32 base_tunnel_sqpn; +}; + struct mlx4_caps { u64 fw_ver; + u32 function; int num_ports; int vl_cap[MLX4_MAX_PORTS + 1]; int ib_mtu_cap[MLX4_MAX_PORTS + 1]; @@ -206,6 +407,7 @@ struct mlx4_caps { u64 trans_code[MLX4_MAX_PORTS + 1]; int local_ca_ack_delay; int num_uars; + u32 uar_page_size; int bf_reg_size; int bf_regs_per_page; int max_sq_sg; @@ -216,7 +418,10 @@ struct mlx4_caps { int max_rq_desc_sz; int max_qp_init_rdma; int max_qp_dest_rdma; - int sqp_start; + u32 *qp0_proxy; + u32 *qp1_proxy; + u32 *qp0_tunnel; + u32 *qp1_tunnel; int num_srqs; int max_srq_wqes; int max_srq_sge; @@ -227,9 +432,10 @@ struct mlx4_caps { int num_eqs; int reserved_eqs; int num_comp_vectors; + int comp_pool; int num_mpts; - int num_mtt_segs; - int mtts_per_seg; + int max_fmr_maps; + int num_mtts; int fmr_reserved_mtts; int reserved_mtts; int reserved_mrws; @@ -238,36 +444,47 @@ struct mlx4_caps { int num_amgms; int reserved_mcgs; int num_qp_per_mgm; + int steering_mode; int num_pds; int reserved_pds; - int mtt_entry_sz; - int reserved_xrcds; int max_xrcds; + int reserved_xrcds; + int mtt_entry_sz; u32 max_msg_sz; u32 page_size_cap; u64 flags; + u64 flags2; u32 bmme_flags; u32 reserved_lkey; u16 stat_rate_support; - int udp_rss; - int loopback_support; - int wol; + u8 cq_timestamp; u8 port_width_cap[MLX4_MAX_PORTS + 1]; int max_gso_sz; + int max_rss_tbl_sz; int reserved_qps_cnt[MLX4_NUM_QP_REGION]; int reserved_qps; int reserved_qps_base[MLX4_NUM_QP_REGION]; int log_num_macs; int log_num_vlans; - int log_num_prios; enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; u8 supported_type[MLX4_MAX_PORTS + 1]; - enum mlx4_port_type port_mask[MLX4_MAX_PORTS + 1]; + u8 suggested_type[MLX4_MAX_PORTS + 1]; + u8 default_sense[MLX4_MAX_PORTS + 1]; + u32 port_mask[MLX4_MAX_PORTS + 1]; enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1]; - u8 counters_mode; + u32 max_counters; + u8 port_ib_mtu[MLX4_MAX_PORTS + 1]; + u16 sqp_demux; + u32 sync_qp; + u32 eqe_size; + u32 cqe_size; + u8 eqe_factor; + u32 userspace_caps; /* userspace must be aware to */ + u32 function_caps; /* functions must be aware to */ + u8 fast_drop; + u16 hca_core_clock; u32 max_basic_counters; - u32 max_ext_counters; - u32 mc_promisc_mode; + u32 max_extended_counters; }; struct mlx4_buf_list { @@ -284,7 +501,7 @@ struct mlx4_buf { }; struct mlx4_mtt { - u32 first_seg; + u32 offset; int order; int page_shift; }; @@ -375,6 +592,8 @@ struct mlx4_cq { atomic_t refcount; struct completion free; + int eqn; + u16 irq; }; struct mlx4_qp { @@ -432,52 +651,168 @@ union mlx4_ext_av { struct mlx4_eth_av eth; }; -struct mlx4_counters { - __be32 counter_mode; - __be32 num_ifc; - u32 reserved[2]; - __be64 rx_frames; - __be64 rx_bytes; - __be64 tx_frames; - __be64 tx_bytes; +struct mlx4_if_stat_control { + u8 reserved1[3]; + /* Extended counters enabled */ + u8 cnt_mode; + /* Number of interfaces */ + __be32 num_of_if; + __be32 reserved[2]; }; -struct mlx4_counters_ext { - __be32 counter_mode; - __be32 num_ifc; - u32 reserved[2]; - __be64 rx_uni_frames; - __be64 rx_uni_bytes; - __be64 rx_mcast_frames; - __be64 rx_mcast_bytes; - __be64 rx_bcast_frames; - __be64 rx_bcast_bytes; - __be64 rx_nobuf_frames; - __be64 rx_nobuf_bytes; - __be64 rx_err_frames; - __be64 rx_err_bytes; - __be64 tx_uni_frames; - __be64 tx_uni_bytes; - __be64 tx_mcast_frames; - __be64 tx_mcast_bytes; - __be64 tx_bcast_frames; - __be64 tx_bcast_bytes; - __be64 tx_nobuf_frames; - __be64 tx_nobuf_bytes; - __be64 tx_err_frames; - __be64 tx_err_bytes; +struct mlx4_if_stat_basic { + struct mlx4_if_stat_control control; + struct { + __be64 IfRxFrames; + __be64 IfRxOctets; + __be64 IfTxFrames; + __be64 IfTxOctets; + } counters[]; +}; +#define MLX4_IF_STAT_BSC_SZ(ports)(sizeof(struct mlx4_if_stat_extended) +\ + sizeof(((struct mlx4_if_stat_extended *)0)->\ + counters[0]) * ports) + +struct mlx4_if_stat_extended { + struct mlx4_if_stat_control control; + struct { + __be64 IfRxUnicastFrames; + __be64 IfRxUnicastOctets; + __be64 IfRxMulticastFrames; + __be64 IfRxMulticastOctets; + __be64 IfRxBroadcastFrames; + __be64 IfRxBroadcastOctets; + __be64 IfRxNoBufferFrames; + __be64 IfRxNoBufferOctets; + __be64 IfRxErrorFrames; + __be64 IfRxErrorOctets; + __be32 reserved[39]; + __be64 IfTxUnicastFrames; + __be64 IfTxUnicastOctets; + __be64 IfTxMulticastFrames; + __be64 IfTxMulticastOctets; + __be64 IfTxBroadcastFrames; + __be64 IfTxBroadcastOctets; + __be64 IfTxDroppedFrames; + __be64 IfTxDroppedOctets; + __be64 IfTxRequestedFramesSent; + __be64 IfTxGeneratedFramesSent; + __be64 IfTxTsoOctets; + } __packed counters[]; +}; +#define MLX4_IF_STAT_EXT_SZ(ports) (sizeof(struct mlx4_if_stat_extended) +\ + sizeof(((struct mlx4_if_stat_extended *)\ + 0)->counters[0]) * ports) + +union mlx4_counter { + struct mlx4_if_stat_control control; + struct mlx4_if_stat_basic basic; + struct mlx4_if_stat_extended ext; +}; +#define MLX4_IF_STAT_SZ(ports) MLX4_IF_STAT_EXT_SZ(ports) + +struct mlx4_quotas { + int qp; + int cq; + int srq; + int mpt; + int mtt; + int counter; + int xrcd; }; struct mlx4_dev { struct pci_dev *pdev; unsigned long flags; + unsigned long num_slaves; struct mlx4_caps caps; + struct mlx4_phys_caps phys_caps; + struct mlx4_quotas quotas; struct radix_tree_root qp_table_tree; - struct radix_tree_root srq_table_tree; - u32 rev_id; + u8 rev_id; char board_id[MLX4_BOARD_ID_LEN]; + int num_vfs; + int numa_node; + int oper_log_mgm_entry_size; + u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; + u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; }; +struct mlx4_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __packed comp; + struct { + u16 reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __packed cmd; + struct { + __be32 qpn; + } __packed qp; + struct { + __be32 srqn; + } __packed srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __packed cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __packed port_change; + struct { + #define COMM_CHANNEL_BIT_ARRAY_SIZE 4 + u32 reserved; + u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE]; + } __packed comm_channel_arm; + struct { + u8 port; + u8 reserved[3]; + __be64 mac; + } __packed mac_update; + struct { + __be32 slave_id; + } __packed flr_event; + struct { + __be16 current_temperature; + __be16 warning_threshold; + } __packed warming; + struct { + u8 reserved[3]; + u8 port; + union { + struct { + __be16 mstr_sm_lid; + __be16 port_lid; + __be32 changed_attr; + u8 reserved[3]; + u8 mstr_sm_sl; + __be64 gid_prefix; + } __packed port_info; + struct { + __be32 block_ptr; + __be32 tbl_entries_mask; + } __packed tbl_change_info; + } params; + } __packed port_mgmt_change; + } event; + u8 slave_id; + u8 reserved3[2]; + u8 owner; +} __packed; + struct mlx4_init_port_param { int set_guid0; int set_node_guid; @@ -492,29 +827,71 @@ struct mlx4_init_port_param { u64 si_guid; }; -static inline void mlx4_query_steer_cap(struct mlx4_dev *dev, int *log_mac, - int *log_vlan, int *log_prio) -{ - *log_mac = dev->caps.log_num_macs; - *log_vlan = dev->caps.log_num_vlans; - *log_prio = dev->caps.log_num_prios; -} - #define mlx4_foreach_port(port, dev, type) \ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if ((type) == (dev)->caps.port_mask[(port)]) +#define mlx4_foreach_non_ib_transport_port(port, dev) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB)) + #define mlx4_foreach_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) +#define MLX4_INVALID_SLAVE_ID 0xFF + +void handle_port_mgmt_change_event(struct work_struct *work); + +static inline int mlx4_master_func_num(struct mlx4_dev *dev) +{ + return dev->caps.function; +} + +static inline int mlx4_is_master(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_MASTER; +} + +static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev) +{ + return dev->phys_caps.base_sqpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev); +} + +static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn) +{ + return (qpn < dev->phys_caps.base_sqpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev)); +} + +static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn) +{ + int guest_proxy_base = dev->phys_caps.base_proxy_sqpn + slave * 8; + + if (qpn >= guest_proxy_base && qpn < guest_proxy_base + 8) + return 1; + + return 0; +} + +static inline int mlx4_is_mfunc(struct mlx4_dev *dev) +{ + return dev->flags & (MLX4_FLAG_SLAVE | MLX4_FLAG_MASTER); +} + +static inline int mlx4_is_slave(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_SLAVE; +} + int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, struct mlx4_buf *buf); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { - if (buf->direct.buf != NULL) + if (BITS_PER_LONG == 64 || buf->nbufs == 1) return buf->direct.buf + offset; else return buf->page_list[offset >> PAGE_SHIFT].buf + @@ -523,31 +900,21 @@ static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); - int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); -int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf); +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node); void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf); int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, struct mlx4_mtt *mtt); void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt); u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt); -int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); -int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); - -int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx); -void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt); -int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, - u64 iova, u64 size, u32 access, int npages, - int page_shift, struct mlx4_mr *mr); int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr); -void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr); void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr); int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr); int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, @@ -565,16 +932,17 @@ void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres, int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, - unsigned vector, int collapsed); + unsigned vector, int collapsed, int timestamp_en); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); -int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 bf_qp); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); -int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq); void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq); int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark); @@ -583,41 +951,185 @@ int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_waterm int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); +int mlx4_unicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot); +int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot); int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_mcast_loopback, enum mlx4_mcast_prot prot); + u8 port, int block_mcast_loopback, + enum mlx4_protocol protocol, u64 *reg_id); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - enum mlx4_mcast_prot prot); + enum mlx4_protocol protocol, u64 reg_id); -int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index); -void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index); +enum { + MLX4_DOMAIN_UVERBS = 0x1000, + MLX4_DOMAIN_ETHTOOL = 0x2000, + MLX4_DOMAIN_RFS = 0x3000, + MLX4_DOMAIN_NIC = 0x5000, +}; +enum mlx4_net_trans_rule_id { + MLX4_NET_TRANS_RULE_ID_ETH = 0, + MLX4_NET_TRANS_RULE_ID_IB, + MLX4_NET_TRANS_RULE_ID_IPV6, + MLX4_NET_TRANS_RULE_ID_IPV4, + MLX4_NET_TRANS_RULE_ID_TCP, + MLX4_NET_TRANS_RULE_ID_UDP, + MLX4_NET_TRANS_RULE_NUM, /* should be last */ +}; + +extern const u16 __sw_id_hw[]; + +static inline int map_hw_to_sw_id(u16 header_id) +{ + + int i; + for (i = 0; i < MLX4_NET_TRANS_RULE_NUM; i++) { + if (header_id == __sw_id_hw[i]) + return i; + } + return -EINVAL; +} +enum mlx4_net_trans_promisc_mode { + MLX4_FS_REGULAR = 0, + MLX4_FS_ALL_DEFAULT = 1, + MLX4_FS_MC_DEFAULT = 3, + MLX4_FS_UC_SNIFFER = 4, + MLX4_FS_MC_SNIFFER = 5, +}; + +struct mlx4_spec_eth { + u8 dst_mac[6]; + u8 dst_mac_msk[6]; + u8 src_mac[6]; + u8 src_mac_msk[6]; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_id_msk; + __be16 vlan_id; +}; + +struct mlx4_spec_tcp_udp { + __be16 dst_port; + __be16 dst_port_msk; + __be16 src_port; + __be16 src_port_msk; +}; + +struct mlx4_spec_ipv4 { + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +}; + +struct mlx4_spec_ib { + __be32 r_u_qpn; + __be32 qpn_msk; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +}; + +struct mlx4_spec_list { + struct list_head list; + enum mlx4_net_trans_rule_id id; + union { + struct mlx4_spec_eth eth; + struct mlx4_spec_ib ib; + struct mlx4_spec_ipv4 ipv4; + struct mlx4_spec_tcp_udp tcp_udp; + }; +}; + +enum mlx4_net_trans_hw_rule_queue { + MLX4_NET_TRANS_Q_FIFO, + MLX4_NET_TRANS_Q_LIFO, +}; + +struct mlx4_net_trans_rule { + struct list_head list; + enum mlx4_net_trans_hw_rule_queue queue_mode; + bool exclusive; + bool allow_loopback; + enum mlx4_net_trans_promisc_mode promisc_mode; + u8 port; + u16 priority; + u32 qpn; +}; + +int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, + enum mlx4_net_trans_promisc_mode mode); +int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, + enum mlx4_net_trans_promisc_mode mode); +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); +int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port); +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac); +void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap); +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc); +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc); +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); -void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index); +void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); -int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, - u64 *page_list, int npages, u64 iova, u32 fbo, - u32 len, u32 *lkey, u32 *rkey, int same_key); int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, int npages, u64 iova, u32 *lkey, u32 *rkey); -int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, - u32 access, int max_pages, int max_maps, - u8 page_shift, struct mlx4_fmr *fmr); int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, int max_maps, u8 page_shift, struct mlx4_fmr *fmr); int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey); -int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_query_diag_counters(struct mlx4_dev *mlx4_dev, int array_length, - u8 op_modifier, u32 in_offset[], u32 counter_out[]); -int mlx4_test_interrupts(struct mlx4_dev *dev); + u8 op_modifier, u32 in_offset[], + u32 counter_out[]); -void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported); +int mlx4_test_interrupts(struct mlx4_dev *dev); +int mlx4_assign_eq(struct mlx4_dev *dev, char *name, int *vector); +void mlx4_release_eq(struct mlx4_dev *dev, int vec); + +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int mlx4_flow_attach(struct mlx4_dev *dev, + struct mlx4_net_trans_rule *rule, u64 *reg_id); +int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id); + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, + int i, int val); + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave); +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr); +int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change); +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port); +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event *gen_event); + +void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid); +__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave); +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id); +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn); + +cycle_t mlx4_read_clock(struct mlx4_dev *dev); + #endif /* MLX4_DEVICE_H */ diff --git a/sys/ofed/include/linux/mlx4/driver.h b/sys/ofed/include/linux/mlx4/driver.h index 15c8319a5ca1..8235a97d0244 100644 --- a/sys/ofed/include/linux/mlx4/driver.h +++ b/sys/ofed/include/linux/mlx4/driver.h @@ -33,15 +33,22 @@ #ifndef MLX4_DRIVER_H #define MLX4_DRIVER_H -#include +#include struct mlx4_dev; +#define MLX4_MAC_MASK 0xffffffffffffULL +#define MLX4_BE_SHORT_MASK cpu_to_be16(0xffff) +#define MLX4_BE_WORD_MASK cpu_to_be32(0xffffffff) + enum mlx4_dev_event { MLX4_DEV_EVENT_CATASTROPHIC_ERROR, MLX4_DEV_EVENT_PORT_UP, MLX4_DEV_EVENT_PORT_DOWN, MLX4_DEV_EVENT_PORT_REINIT, + MLX4_DEV_EVENT_PORT_MGMT_CHANGE, + MLX4_DEV_EVENT_SLAVE_INIT, + MLX4_DEV_EVENT_SLAVE_SHUTDOWN, }; enum mlx4_query_reply { @@ -49,11 +56,6 @@ enum mlx4_query_reply { MLX4_QUERY_MINE_NOPORT = 0 }; -enum mlx4_prot { - MLX4_PROT_IB, - MLX4_PROT_EN, -}; - enum mlx4_mcast_prot { MLX4_MCAST_PROT_IB = 0, MLX4_MCAST_PROT_EN = 1, @@ -63,20 +65,32 @@ struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); void (*event) (struct mlx4_dev *dev, void *context, - enum mlx4_dev_event event, int port); - void * (*get_prot_dev) (struct mlx4_dev *dev, void *context, u8 port); - enum mlx4_prot protocol; + enum mlx4_dev_event event, unsigned long param); + void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); enum mlx4_query_reply (*query) (void *context, void *); struct list_head list; + enum mlx4_protocol protocol; }; int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); -void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port); -struct mlx4_dev *mlx4_query_interface(void *, int *port); -void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port); -int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port); +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); + +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} #endif /* MLX4_DRIVER_H */ diff --git a/sys/ofed/include/linux/mlx4/qp.h b/sys/ofed/include/linux/mlx4/qp.h index 3fe2bc508275..2d45a9d624fe 100644 --- a/sys/ofed/include/linux/mlx4/qp.h +++ b/sys/ofed/include/linux/mlx4/qp.h @@ -39,6 +39,15 @@ #define MLX4_INVALID_LKEY 0x100 +enum ib_m_qp_attr_mask { + IB_M_EXT_CLASS_1 = 1 << 28, + IB_M_EXT_CLASS_2 = 1 << 29, + IB_M_EXT_CLASS_3 = 1 << 30, + + IB_M_QP_MOD_VEND_MASK = (IB_M_EXT_CLASS_1 | IB_M_EXT_CLASS_2 | + IB_M_EXT_CLASS_3) +}; + enum mlx4_qp_optpar { MLX4_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, MLX4_QP_OPTPAR_RRE = 1 << 1, @@ -95,11 +104,42 @@ enum { MLX4_QP_BIT_RWE = 1 << 14, MLX4_QP_BIT_RAE = 1 << 13, MLX4_QP_BIT_RIC = 1 << 4, + MLX4_QP_BIT_COLL_SYNC_RQ = 1 << 2, + MLX4_QP_BIT_COLL_SYNC_SQ = 1 << 1, + MLX4_QP_BIT_COLL_MASTER = 1 << 0 +}; + +enum { + MLX4_RSS_HASH_XOR = 0, + MLX4_RSS_HASH_TOP = 1, + + MLX4_RSS_UDP_IPV6 = 1 << 0, + MLX4_RSS_UDP_IPV4 = 1 << 1, + MLX4_RSS_TCP_IPV6 = 1 << 2, + MLX4_RSS_IPV6 = 1 << 3, + MLX4_RSS_TCP_IPV4 = 1 << 4, + MLX4_RSS_IPV4 = 1 << 5, + + /* offset of mlx4_rss_context within mlx4_qp_context.pri_path */ + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH = 0x24, + /* offset of being RSS indirection QP within mlx4_qp_context.flags */ + MLX4_RSS_QPC_FLAG_OFFSET = 13, +}; + +struct mlx4_rss_context { + __be32 base_qpn; + __be32 default_qpn; + u16 reserved; + u8 hash_fn; + u8 flags; + __be32 rss_key[10]; + __be32 base_qpn_udp; }; struct mlx4_qp_path { u8 fl; - u8 reserved1[2]; + u8 reserved1[1]; + u8 disable_pkey_check; u8 pkey_index; u8 counter_index; u8 grh_mylmc; @@ -112,7 +152,8 @@ struct mlx4_qp_path { u8 rgid[16]; u8 sched_queue; u8 vlan_index; - u8 reserved3[2]; + u8 feup; + u8 reserved3; u8 reserved4[2]; u8 dmac[6]; }; @@ -153,16 +194,7 @@ struct mlx4_qp_context { u8 reserved4[2]; u8 mtt_base_addr_h; __be32 mtt_base_addr_l; - u8 VE; - u8 reserved5; - __be16 VFT_id_prio; - u8 reserved6; - u8 exch_size; - __be16 exch_base; - u8 VFT_hop_cnt; - u8 my_fc_id_idx; - __be16 reserved7; - u32 reserved8[7]; + u32 reserved5[10]; }; /* Which firmware version adds support for NEC (NoErrorCompletion) bit */ @@ -192,8 +224,12 @@ struct mlx4_wqe_ctrl_seg { * [4] IP checksum * [3:2] C (generate completion queue entry) * [1] SE (solicited event) + * [0] FL (force loopback) */ - __be32 srcrb_flags; + union { + __be32 srcrb_flags; + __be16 srcrb_flags16[2]; + }; /* * imm is immediate data for send/RDMA write w/ immediate; * also invalidation key for send with invalidate; input @@ -204,15 +240,15 @@ struct mlx4_wqe_ctrl_seg { enum { MLX4_WQE_MLX_VL15 = 1 << 17, - MLX4_WQE_MLX_SLR = 1 << 16, - MLX4_WQE_MLX_ICRC = 1 << 4 + MLX4_WQE_MLX_SLR = 1 << 16 }; struct mlx4_wqe_mlx_seg { u8 owner; u8 reserved1[2]; u8 opcode; - u8 reserved2[3]; + __be16 sched_prio; + u8 reserved2; u8 size; /* * [17] VL15 @@ -338,9 +374,6 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1)); } -struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn); void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); -int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region, - int *base_qpn, int *cnt); #endif /* MLX4_QP_H */ diff --git a/sys/ofed/include/linux/mlx4/srq.h b/sys/ofed/include/linux/mlx4/srq.h index 5e041e5fe06f..799a0697a383 100644 --- a/sys/ofed/include/linux/mlx4/srq.h +++ b/sys/ofed/include/linux/mlx4/srq.h @@ -33,22 +33,10 @@ #ifndef MLX4_SRQ_H #define MLX4_SRQ_H -#include -#include - struct mlx4_wqe_srq_next_seg { u16 reserved1; __be16 next_wqe_index; u32 reserved2[3]; }; -void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq); -void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq); - -static inline struct mlx4_srq *__mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn) -{ - return radix_tree_lookup(&dev->srq_table_tree, - srqn & (dev->caps.num_srqs - 1)); -} - #endif /* MLX4_SRQ_H */ diff --git a/sys/ofed/include/linux/moduleparam.h b/sys/ofed/include/linux/moduleparam.h index 2c541a67f957..e8534c7cdbbb 100644 --- a/sys/ofed/include/linux/moduleparam.h +++ b/sys/ofed/include/linux/moduleparam.h @@ -87,6 +87,9 @@ param_sysinit(struct kernel_param *param) #define module_param(var, type, mode) \ module_param_named(var, var, type, mode) +#define module_param_array(var, type, addr_argc, mode) \ + module_param_named(var, var, type, mode) + #define MODULE_PARM_DESC(name, desc) static inline int diff --git a/sys/ofed/include/linux/pci.h b/sys/ofed/include/linux/pci.h index 5d91e2d6149d..5c9cfde489c8 100644 --- a/sys/ofed/include/linux/pci.h +++ b/sys/ofed/include/linux/pci.h @@ -72,6 +72,9 @@ struct pci_device_id { #define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c #define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274 +#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) +#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) +#define PCI_FUNC(devfn) ((devfn) & 0x07) #define PCI_VDEVICE(_vendor, _device) \ .vendor = PCI_VENDOR_ID_##_vendor, .device = (_device), \ @@ -93,14 +96,18 @@ struct pci_device_id { struct pci_dev; + struct pci_driver { struct list_head links; char *name; struct pci_device_id *id_table; int (*probe)(struct pci_dev *dev, const struct pci_device_id *id); void (*remove)(struct pci_dev *dev); + int (*suspend) (struct pci_dev *dev, pm_message_t state); /* Device suspended */ + int (*resume) (struct pci_dev *dev); /* Device woken up */ driver_t driver; devclass_t bsdclass; + struct pci_error_handlers *err_handler; }; extern struct list_head pci_drivers; @@ -117,6 +124,9 @@ struct pci_dev { uint16_t device; uint16_t vendor; unsigned int irq; + unsigned int devfn; + u8 revision; + struct pci_devinfo *bus; /* bus this device is on, equivalent to linux struct pci_bus */ }; static inline struct resource_list_entry * @@ -296,6 +306,7 @@ pci_disable_msix(struct pci_dev *pdev) #define PCI_CAP_ID_EXP PCIY_EXPRESS #define PCI_CAP_ID_PCIX PCIY_PCIX + static inline int pci_find_capability(struct pci_dev *pdev, int capid) { @@ -306,6 +317,26 @@ pci_find_capability(struct pci_dev *pdev, int capid) return (reg); } + + + +/** + * pci_pcie_cap - get the saved PCIe capability offset + * @dev: PCI device + * + * PCIe capability offset is calculated at PCI device initialization + * time and saved in the data structure. This function returns saved + * PCIe capability offset. Using this instead of pci_find_capability() + * reduces unnecessary search in the PCI configuration space. If you + * need to calculate PCIe capability offset from raw device for some + * reasons, please use pci_find_capability() instead. + */ +static inline int pci_pcie_cap(struct pci_dev *dev) +{ + return pci_find_capability(dev, PCI_CAP_ID_EXP); +} + + static inline int pci_read_config_byte(struct pci_dev *pdev, int where, u8 *val) { @@ -529,6 +560,30 @@ pci_enable_msix(struct pci_dev *pdev, struct msix_entry *entries, int nreq) return (0); } +static inline int pci_channel_offline(struct pci_dev *pdev) +{ + return false; +} + +static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + return -ENODEV; +} +static inline void pci_disable_sriov(struct pci_dev *dev) +{ +} + +/** + * DEFINE_PCI_DEVICE_TABLE - macro used to describe a pci device table + * @_table: device table name + * + * This macro is used to create a struct pci_device_id array (a device table) + * in a generic manner. + */ +#define DEFINE_PCI_DEVICE_TABLE(_table) \ + const struct pci_device_id _table[] __devinitdata + + /* XXX This should not be necessary. */ #define pcix_set_mmrbc(d, v) 0 #define pcix_get_max_mmrbc(d) 0 @@ -578,5 +633,57 @@ pci_enable_msix(struct pci_dev *pdev, struct msix_entry *entries, int nreq) #define pci_unmap_len dma_unmap_len #define pci_unmap_len_set dma_unmap_len_set +typedef unsigned int __bitwise pci_channel_state_t; +typedef unsigned int __bitwise pci_ers_result_t; + +enum pci_channel_state { + /* I/O channel is in normal state */ + pci_channel_io_normal = (__force pci_channel_state_t) 1, + + /* I/O to channel is blocked */ + pci_channel_io_frozen = (__force pci_channel_state_t) 2, + + /* PCI card is dead */ + pci_channel_io_perm_failure = (__force pci_channel_state_t) 3, +}; + +enum pci_ers_result { + /* no result/none/not supported in device driver */ + PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1, + + /* Device driver can recover without slot reset */ + PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2, + + /* Device driver wants slot to be reset. */ + PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3, + + /* Device has completely failed, is unrecoverable */ + PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4, + + /* Device driver is fully recovered and operational */ + PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5, +}; + + +/* PCI bus error event callbacks */ +struct pci_error_handlers { + /* PCI bus error detected on this device */ + pci_ers_result_t (*error_detected)(struct pci_dev *dev, + enum pci_channel_state error); + + /* MMIO has been re-enabled, but not DMA */ + pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev); + + /* PCI Express link has been reset */ + pci_ers_result_t (*link_reset)(struct pci_dev *dev); + + /* PCI slot has been reset */ + pci_ers_result_t (*slot_reset)(struct pci_dev *dev); + + /* Device driver may resume normal operations */ + void (*resume)(struct pci_dev *dev); +}; + + #endif /* _LINUX_PCI_H_ */ diff --git a/sys/ofed/include/linux/sysfs.h b/sys/ofed/include/linux/sysfs.h index c60a2b904e62..cb1f7b2b5900 100644 --- a/sys/ofed/include/linux/sysfs.h +++ b/sys/ofed/include/linux/sysfs.h @@ -105,6 +105,10 @@ sysctl_handle_attr(SYSCTL_HANDLER_ARGS) /* Trim trailing newline. */ buf[len] = '\0'; } + + /* Trim trailing newline. */ + len--; + ((char*)buf)[len] = '\0'; } /* Leave one trailing byte to append a newline. */ @@ -185,4 +189,6 @@ sysfs_remove_dir(struct kobject *kobj) sysctl_remove_oid(kobj->oidp, 1, 1); } +#define sysfs_attr_init(attr) do {} while(0) + #endif /* _LINUX_SYSFS_H_ */ diff --git a/sys/ofed/include/linux/types.h b/sys/ofed/include/linux/types.h index 331c8b6a5503..65568ca9e400 100644 --- a/sys/ofed/include/linux/types.h +++ b/sys/ofed/include/linux/types.h @@ -45,6 +45,8 @@ typedef _Bool bool; #define false FALSE #endif +typedef u64 phys_addr_t; + typedef unsigned long kernel_ulong_t; typedef unsigned int uint; typedef unsigned gfp_t; diff --git a/sys/ofed/include/linux/workqueue.h b/sys/ofed/include/linux/workqueue.h index ce5759bf01a6..b895bd32ed20 100644 --- a/sys/ofed/include/linux/workqueue.h +++ b/sys/ofed/include/linux/workqueue.h @@ -80,7 +80,7 @@ do { \ callout_init(&(_work)->timer, CALLOUT_MPSAFE); \ } while (0) -#define INIT_DELAYED_WORK_DEFERRABLE INIT_DELAYED_WORK +#define INIT_DEFERRABLE_WORK INIT_DELAYED_WORK #define schedule_work(work) \ do { \ @@ -121,6 +121,14 @@ queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, return (!pending); } +static inline bool schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + struct workqueue_struct wq; + wq.taskqueue = taskqueue_thread; + return queue_delayed_work(&wq, dwork, delay); +} + static inline struct workqueue_struct * _create_workqueue_common(char *name, int cpus) { @@ -190,4 +198,15 @@ cancel_delayed_work(struct delayed_work *work) return 0; } +static inline int +cancel_delayed_work_sync(struct delayed_work *work) +{ + + callout_drain(&work->timer); + if (work->work.taskqueue && + taskqueue_cancel(work->work.taskqueue, &work->work.work_task, NULL)) + taskqueue_drain(work->work.taskqueue, &work->work.work_task); + return 0; +} + #endif /* _LINUX_WORKQUEUE_H_ */ diff --git a/sys/ofed/include/rdma/ib_cm.h b/sys/ofed/include/rdma/ib_cm.h index 938858304300..40c24b6683a4 100644 --- a/sys/ofed/include/rdma/ib_cm.h +++ b/sys/ofed/include/rdma/ib_cm.h @@ -38,6 +38,9 @@ #include #include +/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */ +extern struct class cm_class; + enum ib_cm_state { IB_CM_IDLE, IB_CM_LISTEN, @@ -259,6 +262,18 @@ struct ib_cm_event { void *private_data; }; +#define CM_REQ_ATTR_ID cpu_to_be16(0x0010) +#define CM_MRA_ATTR_ID cpu_to_be16(0x0011) +#define CM_REJ_ATTR_ID cpu_to_be16(0x0012) +#define CM_REP_ATTR_ID cpu_to_be16(0x0013) +#define CM_RTU_ATTR_ID cpu_to_be16(0x0014) +#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) +#define CM_DREP_ATTR_ID cpu_to_be16(0x0016) +#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) +#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) +#define CM_LAP_ATTR_ID cpu_to_be16(0x0019) +#define CM_APR_ATTR_ID cpu_to_be16(0x001A) + /** * ib_cm_handler - User-defined callback to process communication events. * @cm_id: Communication identifier associated with the reported event. diff --git a/sys/ofed/include/rdma/ib_mad.h b/sys/ofed/include/rdma/ib_mad.h index d3b9401b77b0..32f81141efd3 100644 --- a/sys/ofed/include/rdma/ib_mad.h +++ b/sys/ofed/include/rdma/ib_mad.h @@ -151,7 +151,7 @@ struct ib_rmpp_hdr { typedef u64 __bitwise ib_sa_comp_mask; -#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n)) +#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n))) /* * ib_sa_hdr and ib_sa_mad structures must be packed because they have diff --git a/sys/ofed/include/rdma/ib_pma.h b/sys/ofed/include/rdma/ib_pma.h new file mode 100644 index 000000000000..a5889f18807b --- /dev/null +++ b/sys/ofed/include/rdma/ib_pma.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_PMA_H) +#define IB_PMA_H + +#include + +/* + * PMA class portinfo capability mask bits + */ +#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) +#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) +#define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010) +#define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011) +#define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012) +#define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D) +#define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E) + +struct ib_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __packed; + +struct ib_pma_portsamplescontrol { + u8 opcode; + u8 port_select; + u8 tick; + u8 counter_width; /* resv: 7:3, counter width: 2:0 */ + __be32 counter_mask0_9; /* 2, 10 3-bit fields */ + __be16 counter_mask10_14; /* 1, 5 3-bit fields */ + u8 sample_mechanisms; + u8 sample_status; /* only lower 2 bits */ + __be64 option_mask; + __be64 vendor_mask; + __be32 sample_start; + __be32 sample_interval; + __be16 tag; + __be16 counter_select[15]; + __be32 reserved1; + __be64 samples_only_option_mask; + __be32 reserved2[28]; +}; + +struct ib_pma_portsamplesresult { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 counter[15]; +}; + +struct ib_pma_portsamplesresult_ext { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 extended_width; /* only upper 2 bits */ + __be64 counter[15]; +}; + +struct ib_pma_portcounters { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved1; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved2; + __be16 vl15_dropped; + __be32 port_xmit_data; + __be32 port_rcv_data; + __be32 port_xmit_packets; + __be32 port_rcv_packets; + __be32 port_xmit_wait; +} __packed; + + +#define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001) +#define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002) +#define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004) +#define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008) +#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010) +#define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040) +#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200) +#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400) +#define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800) +#define IB_PMA_SEL_PORT_XMIT_DATA cpu_to_be16(0x1000) +#define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000) +#define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000) +#define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000) + +struct ib_pma_portcounters_ext { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be32 reserved1; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_unicast_xmit_packets; + __be64 port_unicast_rcv_packets; + __be64 port_multicast_xmit_packets; + __be64 port_multicast_rcv_packets; +} __packed; + +#define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004) +#define IB_PMA_SELX_PORT_RCV_PACKETS cpu_to_be16(0x0008) +#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010) +#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020) +#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040) +#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080) + +#endif /* IB_PMA_H */ diff --git a/sys/ofed/include/rdma/ib_sa.h b/sys/ofed/include/rdma/ib_sa.h index 5a8f2cefa7c0..61588d9ccdf8 100644 --- a/sys/ofed/include/rdma/ib_sa.h +++ b/sys/ofed/include/rdma/ib_sa.h @@ -372,6 +372,28 @@ struct ib_sa_notice_data_port_error { u8 padding[49]; }; +#define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) +#define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) +#define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) +#define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) +#define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4) +#define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5) +#define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6) +#define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7) +#define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8) +#define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9) +#define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10) +#define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11) + +struct ib_sa_guidinfo_rec { + __be16 lid; + u8 block_num; + /* reserved */ + u8 res1; + __be32 res2; + u8 guid_info_list[64]; +}; + struct ib_sa_client { atomic_t users; struct completion comp; @@ -556,4 +578,16 @@ ib_sa_register_inform_info(struct ib_sa_client *client, */ void ib_sa_unregister_inform_info(struct ib_inform_info *info); +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + + #endif /* IB_SA_H */ diff --git a/sys/ofed/include/rdma/ib_user_verbs.h b/sys/ofed/include/rdma/ib_user_verbs.h index b2721c74d71a..670d6e8d6db2 100644 --- a/sys/ofed/include/rdma/ib_user_verbs.h +++ b/sys/ofed/include/rdma/ib_user_verbs.h @@ -82,9 +82,13 @@ enum { IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_OPEN_XRCD, + IB_USER_VERBS_CMD_CLOSE_XRCD, + IB_USER_VERBS_CMD_CREATE_XSRQ, + IB_USER_VERBS_CMD_OPEN_QP, + IB_USER_VERBS_CMD_ATTACH_FLOW, + IB_USER_VERBS_CMD_DETACH_FLOW, IB_USER_VERBS_CMD_CREATE_XRC_SRQ, - IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN, - IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN, IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP, IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP, IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP, @@ -230,6 +234,21 @@ struct ib_uverbs_dealloc_pd { __u32 pd_handle; }; +struct ib_uverbs_open_xrcd { + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ib_uverbs_close_xrcd { + __u32 xrcd_handle; +}; + struct ib_uverbs_reg_mr { __u64 response; __u64 start; @@ -412,6 +431,17 @@ struct ib_uverbs_create_qp { __u64 driver_data[0]; }; +struct ib_uverbs_open_qp { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +/* also used for open response */ struct ib_uverbs_create_qp_resp { __u32 qp_handle; __u32 qpn; @@ -569,6 +599,16 @@ struct ib_uverbs_send_wr { } wr; }; +struct ibv_uverbs_flow_spec { + __u32 type; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + __u8 l4_protocol; + __u8 block_mc_loopback; +}; + struct ib_uverbs_post_send { __u64 response; __u32 qp_handle; @@ -646,6 +686,45 @@ struct ib_uverbs_detach_mcast { __u64 driver_data[0]; }; +struct ibv_kern_flow_spec { + __u32 type; + __u32 reserved1; + union { + struct { + __be16 ethertype; + __be16 vlan; + __u8 vlan_present; + __u8 mac[6]; + __u8 port; + } eth; + struct { + __be32 qpn; + } ib_uc; + struct { + __u8 mgid[16]; + } ib_mc; + } l2_id; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + __u8 l4_protocol; + __u8 block_mc_loopback; + __u8 reserved[2]; +}; + +struct ib_uverbs_attach_flow { + __u32 qp_handle; + __u32 priority; + struct ibv_kern_flow_spec spec; +}; + +struct ib_uverbs_detach_flow { + __u32 qp_handle; + __u32 priority; + struct ibv_kern_flow_spec spec; +}; + struct ib_uverbs_create_srq { __u64 response; __u64 user_handle; @@ -656,15 +735,17 @@ struct ib_uverbs_create_srq { __u64 driver_data[0]; }; -struct ib_uverbs_create_xrc_srq { +struct ib_uverbs_create_xsrq { __u64 response; __u64 user_handle; + __u32 srq_type; __u32 pd_handle; __u32 max_wr; __u32 max_sge; __u32 srq_limit; + __u32 reserved; __u32 xrcd_handle; - __u32 xrc_cq; + __u32 cq_handle; __u64 driver_data[0]; }; @@ -672,7 +753,7 @@ struct ib_uverbs_create_srq_resp { __u32 srq_handle; __u32 max_wr; __u32 max_sge; - __u32 reserved; + __u32 srqn; }; struct ib_uverbs_modify_srq { diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h index f5b054a3da39..0145cb2965a2 100644 --- a/sys/ofed/include/rdma/ib_verbs.h +++ b/sys/ofed/include/rdma/ib_verbs.h @@ -47,12 +47,15 @@ #include #include #include +#include -#include +#include #include #include #include +extern struct workqueue_struct *ib_wq; + union ib_gid { u8 raw[16]; struct { @@ -114,6 +117,11 @@ enum ib_device_cap_flags { IB_DEVICE_XRC = (1<<20), IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), + IB_DEVICE_MR_ALLOCATE = (1<<23), + IB_DEVICE_SHARED_MR = (1<<24), + IB_DEVICE_QPG = (1<<25), + IB_DEVICE_UD_RSS = (1<<26), + IB_DEVICE_UD_TSS = (1<<27) }; enum ib_atomic_cap { @@ -161,6 +169,7 @@ struct ib_device_attr { int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; + int max_rss_tbl_sz; u16 max_pkeys; u8 local_ca_ack_delay; }; @@ -207,6 +216,7 @@ enum ib_port_cap_flags { IB_PORT_SM_DISABLED = 1 << 10, IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, @@ -237,6 +247,15 @@ static inline int ib_width_enum_to_int(enum ib_port_width width) } } +enum ib_port_speed { + IB_SPEED_SDR = 1, + IB_SPEED_DDR = 2, + IB_SPEED_QDR = 4, + IB_SPEED_FDR10 = 8, + IB_SPEED_FDR = 16, + IB_SPEED_EDR = 32 +}; + struct ib_protocol_stats { /* TBD... */ }; @@ -421,7 +440,15 @@ enum ib_rate { IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, - IB_RATE_120_GBPS = 10 + IB_RATE_120_GBPS = 10, + IB_RATE_14_GBPS = 11, + IB_RATE_56_GBPS = 12, + IB_RATE_112_GBPS = 13, + IB_RATE_168_GBPS = 14, + IB_RATE_25_GBPS = 15, + IB_RATE_100_GBPS = 16, + IB_RATE_200_GBPS = 17, + IB_RATE_300_GBPS = 18 }; /** @@ -432,6 +459,13 @@ enum ib_rate { */ int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; +/** + * ib_rate_to_mbps - Convert the IB rate enum to Mbps. + * For example, IB_RATE_2_5_GBPS will be converted to 2500. + * @rate: rate to convert. + */ +int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; + /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. @@ -498,6 +532,7 @@ enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), + IB_WC_IP_CSUM_OK = (1<<3), }; struct ib_wc { @@ -528,6 +563,11 @@ enum ib_cq_notify_flags { IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; +enum ib_srq_type { + IB_SRQT_BASIC, + IB_SRQT_XRC +}; + enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, @@ -543,6 +583,14 @@ struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; + enum ib_srq_type srq_type; + + union { + struct { + struct ib_xrcd *xrcd; + struct ib_cq *cq; + } xrc; + } ext; }; struct ib_qp_cap { @@ -551,6 +599,7 @@ struct ib_qp_cap { u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; + u32 qpg_tss_mask_sz; }; enum ib_sig_type { @@ -572,13 +621,32 @@ enum ib_qp_type { IB_QPT_UD, IB_QPT_XRC, IB_QPT_RAW_IPV6, - IB_QPT_RAW_ETY, - IB_QPT_RAW_ETH + IB_QPT_RAW_ETHERTYPE, + IB_QPT_RAW_PACKET = 8, + IB_QPT_XRC_INI = 9, + IB_QPT_XRC_TGT, + IB_QPT_MAX, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_NETIF_QP = 1 << 2, + /* reserve bits 26-31 for low level drivers' internal use */ + IB_QP_CREATE_RESERVED_START = 1 << 26, + IB_QP_CREATE_RESERVED_END = 1 << 31, +}; + +enum ib_qpg_type { + IB_QPG_NONE = 0, + IB_QPG_PARENT = (1<<0), + IB_QPG_CHILD_RX = (1<<1), + IB_QPG_CHILD_TX = (1<<2) +}; + +struct ib_qpg_init_attrib { + u32 tss_child_count; + u32 rss_child_count; }; struct ib_qp_init_attr { @@ -587,14 +655,26 @@ struct ib_qp_init_attr { struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; + union { + struct ib_qp *qpg_parent; /* see qpg_type */ + struct ib_qpg_init_attrib parent_attrib; + }; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; - struct ib_xrcd *xrc_domain; /* XRC qp's only */ + enum ib_qpg_type qpg_type; u8 port_num; /* special QP types only */ }; +struct ib_qp_open_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + enum ib_qp_type qp_type; +}; + enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, @@ -651,7 +731,8 @@ enum ib_qp_attr_mask { IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), - IB_QP_DEST_QPN = (1<<20) + IB_QP_DEST_QPN = (1<<20), + IB_QP_GROUP_RSS = (1<<21) }; enum ib_qp_state { @@ -724,6 +805,20 @@ enum ib_send_flags { IB_SEND_IP_CSUM = (1<<4) }; +enum ib_flow_types { + IB_FLOW_ETH = 0, + IB_FLOW_IB_UC = 1, + IB_FLOW_IB_MC_IPV4 = 2, + IB_FLOW_IB_MC_IPV6 = 3 +}; + +enum { + IB_FLOW_L4_NONE = 0, + IB_FLOW_L4_OTHER = 3, + IB_FLOW_L4_UDP = 5, + IB_FLOW_L4_TCP = 6 +}; + struct ib_sge { u64 addr; u32 length; @@ -785,7 +880,7 @@ struct ib_send_wr { u8 static_rate; } raw_ety; } wr; - u32 xrc_remote_srq_num; /* valid for XRC sends only */ + u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; struct ib_recv_wr { @@ -800,7 +895,15 @@ enum ib_access_flags { IB_ACCESS_REMOTE_WRITE = (1<<1), IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), - IB_ACCESS_MW_BIND = (1<<4) + IB_ACCESS_MW_BIND = (1<<4), + IB_ACCESS_ALLOCATE_MR = (1<<5), + IB_ACCESS_SHARED_MR_USER_READ = (1<<6), + IB_ACCESS_SHARED_MR_USER_WRITE = (1<<7), + IB_ACCESS_SHARED_MR_GROUP_READ = (1<<8), + IB_ACCESS_SHARED_MR_GROUP_WRITE = (1<<9), + IB_ACCESS_SHARED_MR_OTHER_READ = (1<<10), + IB_ACCESS_SHARED_MR_OTHER_WRITE = (1<<11) + }; struct ib_phys_buf { @@ -847,7 +950,7 @@ struct ib_ucontext { struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; - struct list_head xrc_domain_list; + struct list_head xrcd_list; int closing; }; @@ -884,12 +987,14 @@ struct ib_pd { struct ib_xrcd { struct ib_device *device; struct ib_uobject *uobject; + atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct rb_node node; - atomic_t usecnt; /* count all resources */ + + struct mutex tgt_qp_mutex; + struct list_head tgt_qp_list; }; - struct ib_ah { struct ib_device *device; struct ib_pd *pd; @@ -911,13 +1016,19 @@ struct ib_cq { struct ib_srq { struct ib_device *device; struct ib_pd *pd; - struct ib_cq *xrc_cq; - struct ib_xrcd *xrcd; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; + enum ib_srq_type srq_type; atomic_t usecnt; - u32 xrc_srq_num; + + union { + struct { + struct ib_xrcd *xrcd; + struct ib_cq *cq; + u32 srq_num; + } xrc; + } ext; }; struct ib_qp { @@ -926,12 +1037,17 @@ struct ib_qp { struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct list_head xrcd_list; + atomic_t usecnt; /* count times opened, mcast attaches */ + struct list_head open_list; + struct ib_qp *real_qp; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; - struct ib_xrcd *xrcd; /* XRC QPs only */ + enum ib_qpg_type qpg_type; }; struct ib_mr { @@ -958,6 +1074,32 @@ struct ib_fmr { u32 rkey; }; +struct ib_flow_spec { + enum ib_flow_types type; + union { + struct { + __be16 ethertype; + __be16 vlan; + u8 vlan_present; + u8 mac[6]; + u8 port; + } eth; + struct { + __be32 qpn; + } ib_uc; + struct { + u8 mgid[16]; + } ib_mc; + } l2_id; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + u8 l4_protocol; + u8 block_mc_loopback; + u8 rule_type; +}; + struct ib_mad; struct ib_grh; @@ -1037,9 +1179,9 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; + spinlock_t client_data_lock; struct list_head core_list; struct list_head client_data_list; - spinlock_t client_data_lock; struct ib_cache cache; int *pkey_tbl_len; @@ -1143,7 +1285,8 @@ struct ib_device { u64 start, u64 length, u64 virt_addr, int mr_access_flags, - struct ib_udata *udata); + struct ib_udata *udata, + int mr_id); int (*query_mr)(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); @@ -1191,7 +1334,7 @@ struct ib_device { struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, - struct ib_ucontext *context, + struct ib_ucontext *ucontext, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd); int (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr, @@ -1211,7 +1354,17 @@ struct ib_device { int (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd, void *context, u32 qp_num); + int (*attach_flow)(struct ib_qp *qp, + struct ib_flow_spec *spec, + int priority); + int (*detach_flow)(struct ib_qp *qp, + struct ib_flow_spec *spec, + int priority); + unsigned long (*get_unmapped_area)(struct file *file, + unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); struct ib_dma_mapping_ops *dma_ops; struct module *owner; @@ -1225,8 +1378,8 @@ struct ib_device { IB_DEV_UNREGISTERED } reg_state; - u64 uverbs_cmd_mask; int uverbs_abi_ver; + u64 uverbs_cmd_mask; char node_desc[64]; __be64 node_guid; @@ -1248,7 +1401,9 @@ struct ib_client { struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); -int ib_register_device (struct ib_device *device); +int ib_register_device(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); @@ -1268,15 +1423,6 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } -/** - * ib_sysfs_create_port_files - iterate over port sysfs directories - * @device: the IB device - * @create: a function to create sysfs files in each port directory - */ -int ib_sysfs_create_port_files(struct ib_device *device, - int (*create)(struct ib_device *dev, u8 port_num, - struct kobject *kobj)); - /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for @@ -1427,8 +1573,8 @@ struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr); /** - * ib_create_srq - Creates an SRQ associated with the specified - * protection domain. + * ib_create_srq - Creates a SRQ associated with the specified protection + * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. If SRQ creation succeeds, then the attributes are updated to @@ -1533,6 +1679,25 @@ int ib_query_qp(struct ib_qp *qp, */ int ib_destroy_qp(struct ib_qp *qp); +/** + * ib_open_qp - Obtain a reference to an existing sharable QP. + * @xrcd - XRC domain + * @qp_open_attr: Attributes identifying the QP to open. + * + * Returns a reference to a sharable QP. + */ +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr); + +/** + * ib_close_qp - Release an external reference to a QP. + * @qp: The QP handle to release + * + * The opened QP handle is released by the caller. The underlying + * shared QP is not destroyed until all internal references are released. + */ +int ib_close_qp(struct ib_qp *qp); + /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. @@ -1540,6 +1705,11 @@ int ib_destroy_qp(struct ib_qp *qp); * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. + * + * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate + * error is returned, the QP state shall not be affected, + * ib_post_send() will return an immediate error after queueing any + * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, @@ -1581,8 +1751,7 @@ static inline int ib_post_recv(struct ib_qp *qp, * the associated completion and event handlers. * @cqe: The minimum size of the CQ. * @comp_vector - Completion vector used to signal completion events. - * Must be >= 0 and < context->num_comp_vectors - * or IB_CQ_VECTOR_LEAST_ATTACHED. + * Must be >= 0 and < context->num_comp_vectors. * * Users can examine the cq structure to determine the actual CQ size. */ @@ -2154,17 +2323,19 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); - /** - * ib_dealloc_xrcd - Deallocates an extended reliably connected domain. - * @xrcd: The xrc domain to deallocate. - */ -int ib_dealloc_xrcd(struct ib_xrcd *xrcd); - -/** - * ib_alloc_xrcd - Allocates an extended reliably connected domain. - * @device: The device on which to allocate the xrcd. + * ib_alloc_xrcd - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. */ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); +/** + * ib_dealloc_xrcd - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. + */ +int ib_dealloc_xrcd(struct ib_xrcd *xrcd); + +int ib_attach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); +int ib_detach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); + #endif /* IB_VERBS_H */