From c3191c2e2be7cd7d9a25da2f0de1d2e3768c3b0e Mon Sep 17 00:00:00 2001 From: Hans Petter Selasky Date: Wed, 15 Nov 2017 11:14:39 +0000 Subject: [PATCH] Update the mlx4 core and mlx4en(4) modules towards Linux v4.9. Background: The coming ibcore update forces an update of mlx4ib(4) which in turn requires an updated mlx4 core module. This also affects the mlx4en(4) module because commonly used APIs are updated. This commit is a middle step updating the mlx4 modules towards the new ibcore. This change contains no major new features. Changes in mlx4: a) Improved error handling when mlx4 PCI devices are detached inside VMs. b) Major update of codebase towards Linux 4.9. Changes in mlx4ib(4): a) Minimal changes needed in order to compile using the updated mlx4 core APIs. Changes in mlx4en(4): a) Update flow steering code in mlx4en to use new APIs for registering MAC addresses and IP addresses. b) Update all statistics counters to be 64-bit. c) Minimal changes needed in order to compile using the updated mlx4 core APIs. Sponsored by: Mellanox Technologies MFC after: 1 week --- sys/conf/files | 4 +- sys/dev/mlx4/cmd.h | 118 +- sys/dev/mlx4/cq.h | 29 +- sys/dev/mlx4/device.h | 650 ++- sys/dev/mlx4/driver.h | 84 +- sys/dev/mlx4/mlx4_core/fw.h | 78 +- sys/dev/mlx4/mlx4_core/fw_qos.h | 145 + sys/dev/mlx4/mlx4_core/icm.h | 5 +- sys/dev/mlx4/mlx4_core/mlx4.h | 337 +- sys/dev/mlx4/mlx4_core/mlx4_alloc.c | 435 +- sys/dev/mlx4/mlx4_core/mlx4_catas.c | 290 +- sys/dev/mlx4/mlx4_core/mlx4_cmd.c | 1763 +++++--- sys/dev/mlx4/mlx4_core/mlx4_cq.c | 65 +- sys/dev/mlx4/mlx4_core/mlx4_eq.c | 747 ++-- sys/dev/mlx4/mlx4_core/mlx4_fw.c | 1611 +++++-- sys/dev/mlx4/mlx4_core/mlx4_fw_qos.c | 289 ++ sys/dev/mlx4/mlx4_core/mlx4_icm.c | 80 +- sys/dev/mlx4/mlx4_core/mlx4_intf.c | 77 +- sys/dev/mlx4/mlx4_core/mlx4_main.c | 3741 +++++++++-------- sys/dev/mlx4/mlx4_core/mlx4_mcg.c | 313 +- sys/dev/mlx4/mlx4_core/mlx4_mr.c | 293 +- sys/dev/mlx4/mlx4_core/mlx4_pd.c | 42 +- sys/dev/mlx4/mlx4_core/mlx4_port.c | 1205 +++++- sys/dev/mlx4/mlx4_core/mlx4_profile.c | 46 +- sys/dev/mlx4/mlx4_core/mlx4_qp.c | 426 +- sys/dev/mlx4/mlx4_core/mlx4_reset.c | 51 +- .../mlx4/mlx4_core/mlx4_resource_tracker.c | 1314 ++++-- sys/dev/mlx4/mlx4_core/mlx4_sense.c | 18 +- sys/dev/mlx4/mlx4_core/mlx4_srq.c | 21 +- sys/dev/mlx4/mlx4_core/mlx4_sys_tune.c | 323 -- sys/dev/mlx4/mlx4_en/en.h | 144 +- sys/dev/mlx4/mlx4_en/en_port.h | 27 +- sys/dev/mlx4/mlx4_en/mlx4_en_cq.c | 72 +- sys/dev/mlx4/mlx4_en/mlx4_en_main.c | 120 +- sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c | 555 +-- sys/dev/mlx4/mlx4_en/mlx4_en_port.c | 583 +-- sys/dev/mlx4/mlx4_en/mlx4_en_resources.c | 2 +- sys/dev/mlx4/mlx4_en/mlx4_en_rx.c | 34 +- sys/dev/mlx4/mlx4_en/mlx4_en_tx.c | 25 +- sys/dev/mlx4/mlx4_ib/mlx4_ib.h | 3 - sys/dev/mlx4/mlx4_ib/mlx4_ib_cq.c | 32 +- sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c | 222 +- sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c | 263 +- sys/dev/mlx4/mlx4_ib/mlx4_ib_mr.c | 6 +- sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c | 73 +- sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c | 7 +- sys/dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c | 6 +- sys/dev/mlx4/qp.h | 128 +- sys/dev/mlx4/stats.h | 154 +- sys/modules/mlx4/Makefile | 6 +- 50 files changed, 10969 insertions(+), 6093 deletions(-) create mode 100644 sys/dev/mlx4/mlx4_core/fw_qos.h create mode 100644 sys/dev/mlx4/mlx4_core/mlx4_fw_qos.c delete mode 100644 sys/dev/mlx4/mlx4_core/mlx4_sys_tune.c diff --git a/sys/conf/files b/sys/conf/files index 31427a695022..4081b7e97876 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4569,6 +4569,8 @@ dev/mlx4/mlx4_core/mlx4_eq.c 
optional mlx4 pci \ compile-with "${OFED_C}" dev/mlx4/mlx4_core/mlx4_fw.c optional mlx4 pci \ compile-with "${OFED_C}" +dev/mlx4/mlx4_core/mlx4_fw_qos.c optional mlx4 pci \ + compile-with "${OFED_C}" dev/mlx4/mlx4_core/mlx4_icm.c optional mlx4 pci \ compile-with "${OFED_C}" dev/mlx4/mlx4_core/mlx4_intf.c optional mlx4 pci \ @@ -4595,8 +4597,6 @@ dev/mlx4/mlx4_core/mlx4_srq.c optional mlx4 pci \ compile-with "${OFED_C}" dev/mlx4/mlx4_core/mlx4_resource_tracker.c optional mlx4 pci \ compile-with "${OFED_C}" -dev/mlx4/mlx4_core/mlx4_sys_tune.c optional mlx4 pci \ - compile-with "${OFED_C}" dev/mlx4/mlx4_en/mlx4_en_cq.c optional mlx4en pci inet inet6 \ compile-with "${OFED_C}" diff --git a/sys/dev/mlx4/cmd.h b/sys/dev/mlx4/cmd.h index e2d41bcf417a..fe33e7287f6f 100644 --- a/sys/dev/mlx4/cmd.h +++ b/sys/dev/mlx4/cmd.h @@ -36,6 +36,8 @@ #include #include +struct mlx4_counter; + enum { /* initialization and general commands */ MLX4_CMD_SYS_EN = 0x1, @@ -67,8 +69,13 @@ enum { MLX4_CMD_MAP_ICM_AUX = 0xffc, MLX4_CMD_UNMAP_ICM_AUX = 0xffb, MLX4_CMD_SET_ICM_SIZE = 0xffd, + MLX4_CMD_ACCESS_REG = 0x3b, + MLX4_CMD_ALLOCATE_VPP = 0x80, + MLX4_CMD_SET_VPORT_QOS = 0x81, + /*master notify fw on finish for slave's flr*/ MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_VIRT_PORT_MAP = 0x5c, MLX4_CMD_GET_OP_REQ = 0x59, /* TPT commands */ @@ -116,6 +123,7 @@ enum { /* special QP and management commands */ MLX4_CMD_CONF_SPECIAL_QP = 0x23, MLX4_CMD_MAD_IFC = 0x24, + MLX4_CMD_MAD_DEMUX = 0x203, /* multicast commands */ MLX4_CMD_READ_MCG = 0x25, @@ -125,6 +133,7 @@ enum { /* miscellaneous commands */ MLX4_CMD_DIAG_RPRT = 0x30, MLX4_CMD_NOP = 0x31, + MLX4_CMD_CONFIG_DEV = 0x3a, MLX4_CMD_ACCESS_MEM = 0x2e, MLX4_CMD_SET_VEP = 0x52, @@ -158,6 +167,9 @@ enum { MLX4_QP_FLOW_STEERING_ATTACH = 0x65, MLX4_QP_FLOW_STEERING_DETACH = 0x66, MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64, + + /* Update and read QCN parameters */ + MLX4_CMD_CONGESTION_CTRL_OPCODE = 0x68, }; enum { @@ -166,21 +178,42 @@ enum { MLX4_CMD_TIME_CLASS_C = 60000, }; +enum { + /* virtual to physical port mapping opcode modifiers */ + MLX4_GET_PORT_VIRT2PHY = 0x0, + MLX4_SET_PORT_VIRT2PHY = 0x1, +}; + enum { MLX4_MAILBOX_SIZE = 4096, MLX4_ACCESS_MEM_ALIGN = 256, }; enum { - /* set port opcode modifiers */ - MLX4_SET_PORT_GENERAL = 0x0, - MLX4_SET_PORT_RQP_CALC = 0x1, - MLX4_SET_PORT_MAC_TABLE = 0x2, - MLX4_SET_PORT_VLAN_TABLE = 0x3, - MLX4_SET_PORT_PRIO_MAP = 0x4, - MLX4_SET_PORT_GID_TABLE = 0x5, - MLX4_SET_PORT_PRIO2TC = 0x8, - MLX4_SET_PORT_SCHEDULER = 0x9 + /* Set port opcode modifiers */ + MLX4_SET_PORT_IB_OPCODE = 0x0, + MLX4_SET_PORT_ETH_OPCODE = 0x1, + MLX4_SET_PORT_BEACON_OPCODE = 0x4, +}; + +enum { + /* Set port Ethernet input modifiers */ + MLX4_SET_PORT_GENERAL = 0x0, + MLX4_SET_PORT_RQP_CALC = 0x1, + MLX4_SET_PORT_MAC_TABLE = 0x2, + MLX4_SET_PORT_VLAN_TABLE = 0x3, + MLX4_SET_PORT_PRIO_MAP = 0x4, + MLX4_SET_PORT_GID_TABLE = 0x5, + MLX4_SET_PORT_PRIO2TC = 0x8, + MLX4_SET_PORT_SCHEDULER = 0x9, + MLX4_SET_PORT_VXLAN = 0xB, + MLX4_SET_PORT_ROCE_ADDR = 0xD +}; + +enum { + MLX4_CMD_MAD_DEMUX_CONFIG = 0, + MLX4_CMD_MAD_DEMUX_QUERY_STATE = 1, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR = 2, /* Query mad demux restrictions */ }; enum { @@ -188,6 +221,43 @@ enum { MLX4_CMD_NATIVE }; +/* + * MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP - + * Receive checksum value is reported in CQE also for non TCP/UDP packets. + * + * MLX4_RX_CSUM_MODE_L4 - + * L4_CSUM bit in CQE, which indicates whether or not L4 checksum + * was validated correctly, is supported. 
+ * + * MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP - + * IP_OK CQE's field is supported also for non TCP/UDP IP packets. + * + * MLX4_RX_CSUM_MODE_MULTI_VLAN - + * Receive Checksum offload is supported for packets with more than 2 vlan headers. + */ +enum mlx4_rx_csum_mode { + MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP = 1UL << 0, + MLX4_RX_CSUM_MODE_L4 = 1UL << 1, + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1UL << 2, + MLX4_RX_CSUM_MODE_MULTI_VLAN = 1UL << 3 +}; + +struct mlx4_config_dev_params { + u16 vxlan_udp_dport; + u8 rx_csum_flags_port_1; + u8 rx_csum_flags_port_2; +}; + +enum mlx4_en_congestion_control_algorithm { + MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT = 0, +}; + +enum mlx4_en_congestion_control_opmod { + MLX4_CONGESTION_CONTROL_GET_PARAMS, + MLX4_CONGESTION_CONTROL_GET_STATISTICS, + MLX4_CONGESTION_CONTROL_SET_PARAMS = 4, +}; + struct mlx4_dev; struct mlx4_cmd_mailbox { @@ -233,26 +303,28 @@ static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_para struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); +int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index, + struct mlx4_counter *counter_stats, int reset); u32 mlx4_comm_get_version(void); -int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, + u8 qos, __be16 proto); +int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, + int max_tx_rate); int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); -int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); -int mlx4_get_vf_link_state(struct mlx4_dev *dev, int port, int vf); +int mlx4_config_dev_retrieval(struct mlx4_dev *dev, + struct mlx4_config_dev_params *params); +void mlx4_cmd_wake_completions(struct mlx4_dev *dev); +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - - * retrun true if VST ( default vlan) - * if VST will fill vlan & qos (if not NULL) + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) */ -bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos); - -enum { - IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ - IFLA_VF_LINK_STATE_ENABLE, /* link always up */ - IFLA_VF_LINK_STATE_DISABLE, /* link always down */ - __IFLA_VF_LINK_STATE_MAX, -}; +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos); #define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) +#define COMM_CHAN_EVENT_INTERNAL_ERR (1 << 17) #endif /* MLX4_CMD_H */ diff --git a/sys/dev/mlx4/cq.h b/sys/dev/mlx4/cq.h index e7d6ddd7538f..6b338d842139 100644 --- a/sys/dev/mlx4/cq.h +++ b/sys/dev/mlx4/cq.h @@ -42,31 +42,22 @@ struct mlx4_cqe { __be32 vlan_my_qpn; __be32 immed_rss_invalid; __be32 g_mlpath_rqpn; + __be16 sl_vid; union { struct { - union { - struct { - __be16 sl_vid; - __be16 rlid; - }; - __be32 timestamp_16_47; - }; + __be16 rlid; __be16 status; u8 ipv6_ext_mask; u8 badfcs_enc; }; - struct { - __be16 reserved1; - u8 smac[6]; - }; + u8 smac[ETH_ALEN]; }; __be32 byte_cnt; __be16 wqe_index; __be16 checksum; - u8 reserved2[1]; - __be16 timestamp_0_15; + u8 reserved[3]; u8 owner_sr_opcode; 
-} __packed; +}; struct mlx4_err_cqe { __be32 my_qpn; @@ -95,7 +86,13 @@ struct mlx4_ts_cqe { } __packed; enum { - MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, + MLX4_CQE_CVLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_SVLAN_PRESENT_MASK = 1 << 30, + MLX4_CQE_L2_TUNNEL = 1 << 27, + MLX4_CQE_L2_TUNNEL_CSUM = 1 << 26, + MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, + MLX4_CQE_QPN_MASK = 0xffffff, MLX4_CQE_VID_MASK = 0xfff, }; @@ -177,5 +174,5 @@ int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, u16 count, u16 period); int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, int entries, struct mlx4_mtt *mtt); -int mlx4_cq_ignore_overrun(struct mlx4_dev *dev, struct mlx4_cq *cq); + #endif /* MLX4_CQ_H */ diff --git a/sys/dev/mlx4/device.h b/sys/dev/mlx4/device.h index c9c9dc3bb1cc..e5d9f076c416 100644 --- a/sys/dev/mlx4/device.h +++ b/sys/dev/mlx4/device.h @@ -39,20 +39,21 @@ #include #include #include +#include +#include + #include #include +#define DEFAULT_UAR_PAGE_SHIFT 12 + #define MAX_MSIX_P_PORT 17 #define MAX_MSIX 64 -#define MSIX_LEGACY_SZ 4 #define MIN_MSIX_P_PORT 5 +#define MLX4_IS_LEGACY_EQ_MODE(dev_cap) ((dev_cap).num_comp_vectors < \ + (dev_cap).num_ports * MIN_MSIX_P_PORT) -#define MLX4_ROCE_MAX_GIDS 128 -#define MLX4_ROCE_PF_GIDS 16 - -#define MLX4_NUM_UP 8 -#define MLX4_NUM_TC 8 #define MLX4_MAX_100M_UNITS_VAL 255 /* * work around: can't set values * greater then this value when @@ -62,6 +63,9 @@ #define MLX4_RATELIMIT_1G_UNITS 4 /* 1 Gbps */ #define MLX4_RATELIMIT_DEFAULT 0x00ff +#define MLX4_ROCE_MAX_GIDS 128 +#define MLX4_ROCE_PF_GIDS 16 + #define CORE_CLOCK_MASK 0xffffffffffffULL enum { @@ -70,8 +74,9 @@ enum { MLX4_FLAG_MASTER = 1 << 2, MLX4_FLAG_SLAVE = 1 << 3, MLX4_FLAG_SRIOV = 1 << 4, - MLX4_FLAG_DEV_NUM_STR = 1 << 5, - MLX4_FLAG_OLD_REG_MAC = 1 << 6, + MLX4_FLAG_OLD_REG_MAC = 1 << 6, + MLX4_FLAG_BONDED = 1 << 7, + MLX4_FLAG_SECURE_HOST = 1 << 8, }; enum { @@ -81,7 +86,8 @@ enum { enum { MLX4_MAX_PORTS = 2, - MLX4_MAX_PORT_PKEYS = 128 + MLX4_MAX_PORT_PKEYS = 128, + MLX4_MAX_PORT_GIDS = 128 }; /* base qkey for use in sriov tunnel-qp/proxy-qp communication. 
@@ -92,14 +98,14 @@ enum { #define MLX4_RESERVED_QKEY_MASK (0xFFFF0000) enum { - MLX4_BOARD_ID_LEN = 64, - MLX4_VSD_LEN = 208 + MLX4_BOARD_ID_LEN = 64 }; enum { MLX4_MAX_NUM_PF = 16, - MLX4_MAX_NUM_VF = 64, - MLX4_MFUNC_MAX = 80, + MLX4_MAX_NUM_VF = 126, + MLX4_MAX_NUM_VF_P_PORT = 64, + MLX4_MFUNC_MAX = 128, MLX4_MAX_EQ_NUM = 1024, MLX4_MFUNC_EQ_NUM = 4, MLX4_MFUNC_MAX_EQES = 8, @@ -119,6 +125,14 @@ enum { MLX4_STEERING_MODE_DEVICE_MANAGED }; +enum { + MLX4_STEERING_DMFS_A0_DEFAULT, + MLX4_STEERING_DMFS_A0_DYNAMIC, + MLX4_STEERING_DMFS_A0_STATIC, + MLX4_STEERING_DMFS_A0_DISABLE, + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED +}; + static inline const char *mlx4_steering_mode_str(int steering_mode) { switch (steering_mode) { @@ -136,6 +150,11 @@ static inline const char *mlx4_steering_mode_str(int steering_mode) } } +enum { + MLX4_TUNNEL_OFFLOAD_MODE_NONE, + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN +}; + enum { MLX4_DEV_CAP_FLAG_RC = 1LL << 0, MLX4_DEV_CAP_FLAG_UC = 1LL << 1, @@ -161,12 +180,10 @@ enum { MLX4_DEV_CAP_FLAG_UDP_RSS = 1LL << 40, MLX4_DEV_CAP_FLAG_VEP_UC_STEER = 1LL << 41, MLX4_DEV_CAP_FLAG_VEP_MC_STEER = 1LL << 42, - MLX4_DEV_CAP_FLAG_CROSS_CHANNEL = 1LL << 44, MLX4_DEV_CAP_FLAG_COUNTERS = 1LL << 48, - MLX4_DEV_CAP_FLAG_COUNTERS_EXT = 1LL << 49, - MLX4_DEV_CAP_FLAG_SET_PORT_ETH_SCHED = 1LL << 53, + MLX4_DEV_CAP_FLAG_RSS_IP_FRAG = 1LL << 52, + MLX4_DEV_CAP_FLAG_SET_ETH_SCHED = 1LL << 53, MLX4_DEV_CAP_FLAG_SENSE_SUPPORT = 1LL << 55, - MLX4_DEV_CAP_FLAG_FAST_DROP = 1LL << 57, MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59, MLX4_DEV_CAP_FLAG_64B_EQE = 1LL << 61, MLX4_DEV_CAP_FLAG_64B_CQE = 1LL << 62 @@ -177,29 +194,49 @@ enum { MLX4_DEV_CAP_FLAG2_RSS_TOP = 1LL << 1, MLX4_DEV_CAP_FLAG2_RSS_XOR = 1LL << 2, MLX4_DEV_CAP_FLAG2_FS_EN = 1LL << 3, - MLX4_DEV_CAP_FLAG2_FSM = 1LL << 4, - MLX4_DEV_CAP_FLAG2_VLAN_CONTROL = 1LL << 5, - MLX4_DEV_CAP_FLAG2_UPDATE_QP = 1LL << 6, - MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1LL << 7, - MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 8, - MLX4_DEV_CAP_FLAG2_ETS_CFG = 1LL << 9, - MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP = 1LL << 10, - MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN = 1LL << 11, - MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 12, - MLX4_DEV_CAP_FLAG2_TS = 1LL << 13, - MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW = 1LL << 14, - MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN = 1LL << 15, - MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 16, - MLX4_DEV_CAP_FLAG2_FS_EN_NCSI = 1LL << 17, + MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN = 1LL << 4, + MLX4_DEV_CAP_FLAG2_TS = 1LL << 5, + MLX4_DEV_CAP_FLAG2_VLAN_CONTROL = 1LL << 6, + MLX4_DEV_CAP_FLAG2_FSM = 1LL << 7, + MLX4_DEV_CAP_FLAG2_UPDATE_QP = 1LL << 8, + MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9, + MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10, + MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11, + MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 12, + MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 13, + MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL = 1LL << 14, + MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP = 1LL << 15, + MLX4_DEV_CAP_FLAG2_CONFIG_DEV = 1LL << 16, + MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, - MLX4_DEV_CAP_FLAG2_DMFS_TAG_MODE = 1LL << 19, - MLX4_DEV_CAP_FLAG2_ROCEV2 = 1LL << 20, - MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL = 1LL << 21, - MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 22, - MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 23, - MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1LL << 24, - MLX4_DEV_CAP_FLAG2_RX_CSUM_MODE = 1LL << 25, - MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 26, + MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20, + 
MLX4_DEV_CAP_FLAG2_PORT_REMAP = 1LL << 21, + MLX4_DEV_CAP_FLAG2_QCN = 1LL << 22, + MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT = 1LL << 23, + MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN = 1LL << 24, + MLX4_DEV_CAP_FLAG2_QOS_VPP = 1LL << 25, + MLX4_DEV_CAP_FLAG2_ETS_CFG = 1LL << 26, + MLX4_DEV_CAP_FLAG2_PORT_BEACON = 1LL << 27, + MLX4_DEV_CAP_FLAG2_IGNORE_FCS = 1LL << 28, + MLX4_DEV_CAP_FLAG2_PHV_EN = 1LL << 29, + MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30, + MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, + MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, + MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, + MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, + MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, + MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, + MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT = 1ULL << 37, +}; + +enum { + MLX4_QUERY_FUNC_FLAGS_BF_RES_QP = 1LL << 0, + MLX4_QUERY_FUNC_FLAGS_A0_RES_QP = 1LL << 1 +}; + +enum { + MLX4_VF_CAP_FLAG_RESET = 1 << 0 }; /* bit enums for an 8-bit flags field indicating special use @@ -211,53 +248,45 @@ enum { * This enum may use only bits 0..7. */ enum { - MLX4_RESERVE_BF_QP = 1 << 7, + MLX4_RESERVE_A0_QP = 1 << 6, + MLX4_RESERVE_ETH_BF_QP = 1 << 7, }; -enum { - MLX4_DEV_CAP_CQ_FLAG_IO = 1 << 0 -}; - -enum { - MLX4_QUERY_FUNC_FLAGS_BF_RES_QP = 1LL << 0 -}; - -/* bit enums for an 8-bit flags field indicating special use - * QPs which require special handling in qp_reserve_range. - * Currently, this only includes QPs used by the ETH interface, - * where we expect to use blueflame. These QPs must not have - * bits 6 and 7 set in their qp number. - * - * This enum may use only bits 0..7. - */ -enum { - MLX4_RESERVE_ETH_BF_QP = 1 << 7, -}; - - enum { MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, - MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1 + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1, + MLX4_DEV_CAP_CQE_STRIDE_ENABLED = 1LL << 2, + MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3 }; enum { - MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 + MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0 }; enum { - MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0 + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0, + MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1, + MLX4_FUNC_CAP_DMFS_A0_STATIC = 1L << 2 }; #define MLX4_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) enum { - MLX4_BMME_FLAG_WIN_TYPE_2B = 1 << 1, + MLX4_BMME_FLAG_WIN_TYPE_2B = 1 << 1, MLX4_BMME_FLAG_LOCAL_INV = 1 << 6, MLX4_BMME_FLAG_REMOTE_INV = 1 << 7, MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_ROCE_V1_V2 = 1 << 19, + MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, + MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, +}; + +enum { + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP, + MLX4_FLAG_ROCE_V1_V2 = MLX4_BMME_FLAG_ROCE_V1_V2 }; enum mlx4_event { @@ -329,6 +358,7 @@ enum { MLX4_PERM_REMOTE_WRITE = 1 << 13, MLX4_PERM_ATOMIC = 1 << 14, MLX4_PERM_BIND_MW = 1 << 15, + MLX4_PERM_MASK = 0xFC00 }; enum { @@ -373,12 +403,10 @@ enum { MLX4_MTT_FLAG_PRESENT = 1 }; -enum { - MLX4_MAX_MTT_SHIFT = 31 -}; - enum mlx4_qp_region { MLX4_QP_REGION_FW = 0, + MLX4_QP_REGION_RSS_RAW_ETH, + MLX4_QP_REGION_BOTTOM = MLX4_QP_REGION_RSS_RAW_ETH, MLX4_QP_REGION_ETH_ADDR, MLX4_QP_REGION_FC_ADDR, MLX4_QP_REGION_FC_EXCH, @@ -389,8 +417,7 @@ enum mlx4_port_type { MLX4_PORT_TYPE_NONE = 0, MLX4_PORT_TYPE_IB = 1, MLX4_PORT_TYPE_ETH = 2, - MLX4_PORT_TYPE_AUTO = 3, - MLX4_PORT_TYPE_NA = 4 + MLX4_PORT_TYPE_AUTO = 3 }; enum mlx4_special_vlan_idx { @@ -413,10 +440,22 @@ enum { MLX4_MAX_FAST_REG_PAGES = 511, }; +enum { + /* + * Max wqe size for rdma read is 512 bytes, 
so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX4_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + enum { MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE = 0x16, + MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP = 0x17, }; /* Port mgmt change event handling */ @@ -428,14 +467,44 @@ enum { MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, }; +union sl2vl_tbl_to_u64 { + u8 sl8[8]; + u64 sl64; +}; + +enum { + MLX4_DEVICE_STATE_UP = 1 << 0, + MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, +}; + +enum { + MLX4_INTERFACE_STATE_UP = 1 << 0, + MLX4_INTERFACE_STATE_DELETION = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) enum mlx4_module_id { - MLX4_MODULE_ID_SFP = 0x3, - MLX4_MODULE_ID_QSFP = 0xC, - MLX4_MODULE_ID_QSFP_PLUS = 0xD, - MLX4_MODULE_ID_QSFP28 = 0x11, + MLX4_MODULE_ID_SFP = 0x3, + MLX4_MODULE_ID_QSFP = 0xC, + MLX4_MODULE_ID_QSFP_PLUS = 0xD, + MLX4_MODULE_ID_QSFP28 = 0x11, +}; + +enum { /* rl */ + MLX4_QP_RATE_LIMIT_NONE = 0, + MLX4_QP_RATE_LIMIT_KBS = 1, + MLX4_QP_RATE_LIMIT_MBS = 2, + MLX4_QP_RATE_LIMIT_GBS = 3 +}; + +struct mlx4_rate_limit_caps { + u16 num_rates; /* Number of different rates */ + u8 min_unit; + u16 min_val; + u8 max_unit; + u16 max_val; }; static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) @@ -480,6 +549,8 @@ struct mlx4_caps { int max_rq_desc_sz; int max_qp_init_rdma; int max_qp_dest_rdma; + int max_tc_eth; + u32 *qp0_qkey; u32 *qp0_proxy; u32 *qp1_proxy; u32 *qp0_tunnel; @@ -495,10 +566,9 @@ struct mlx4_caps { int num_eqs; int reserved_eqs; int num_comp_vectors; - int comp_pool; int num_mpts; int max_fmr_maps; - u64 num_mtts; + int num_mtts; int fmr_reserved_mtts; int reserved_mtts; int reserved_mrws; @@ -508,6 +578,8 @@ struct mlx4_caps { int reserved_mcgs; int num_qp_per_mgm; int steering_mode; + int dmfs_high_steer_mode; + int fs_log_max_ucast_qp_range_size; int num_pds; int reserved_pds; int max_xrcds; @@ -520,7 +592,6 @@ struct mlx4_caps { u32 bmme_flags; u32 reserved_lkey; u16 stat_rate_support; - u8 cq_timestamp; u8 port_width_cap[MLX4_MAX_PORTS + 1]; int max_gso_sz; int max_rss_tbl_sz; @@ -538,19 +609,21 @@ struct mlx4_caps { u32 max_counters; u8 port_ib_mtu[MLX4_MAX_PORTS + 1]; u16 sqp_demux; - u32 sync_qp; - u32 cq_flags; u32 eqe_size; u32 cqe_size; u8 eqe_factor; - u32 userspace_caps; /* userspace must be aware to */ - u32 function_caps; /* functions must be aware to */ - u8 fast_drop; + u32 userspace_caps; /* userspace must be aware of these */ + u32 function_caps; /* VFs must be aware of these */ u16 hca_core_clock; - u32 max_basic_counters; - u32 max_extended_counters; - u8 def_counter_index[MLX4_MAX_PORTS + 1]; + u64 phys_port_id[MLX4_MAX_PORTS + 1]; + int tunnel_offload_mode; + u8 rx_checksum_flags_port[MLX4_MAX_PORTS + 1]; + u8 phv_bit[MLX4_MAX_PORTS + 1]; u8 alloc_res_qp_mask; + u32 dmfs_high_rate_qpn_base; + u32 dmfs_high_rate_qpn_range; + u32 vf_caps; + struct mlx4_rate_limit_caps rl_caps; }; struct mlx4_buf_list { @@ -647,7 +720,7 @@ struct mlx4_uar { }; struct mlx4_bf { - unsigned long offset; + unsigned int offset; int buf_size; struct mlx4_uar *uar; void __iomem *reg; @@ -661,6 +734,7 @@ struct mlx4_cq { u32 cons_index; + u16 irq; __be32 *set_ci_db; __be32 *arm_db; int arm_sn; @@ -670,8 +744,8 @@ struct mlx4_cq { atomic_t refcount; struct completion free; - int eqn; - u16 irq; + int 
reset_notify_added; + struct list_head reset_notify; }; struct mlx4_qp { @@ -720,9 +794,9 @@ struct mlx4_eth_av { __be32 sl_tclass_flowlabel; u8 dgid[16]; u8 s_mac[6]; - u8 reserved4[2]; + u8 reserved4[2]; __be16 vlan; - u8 mac[6]; + u8 mac[ETH_ALEN]; }; union mlx4_ext_av { @@ -730,66 +804,17 @@ union mlx4_ext_av { struct mlx4_eth_av eth; }; -struct mlx4_if_stat_control { - u8 reserved1[3]; - /* Extended counters enabled */ - u8 cnt_mode; - /* Number of interfaces */ - __be32 num_of_if; - __be32 reserved[2]; +struct mlx4_counter { + u8 reserved1[3]; + u8 counter_mode; + __be32 num_ifc; + u32 reserved2[2]; + __be64 rx_frames; + __be64 rx_bytes; + __be64 tx_frames; + __be64 tx_bytes; }; -struct mlx4_if_stat_basic { - struct mlx4_if_stat_control control; - struct { - __be64 IfRxFrames; - __be64 IfRxOctets; - __be64 IfTxFrames; - __be64 IfTxOctets; - } counters[]; -}; -#define MLX4_IF_STAT_BSC_SZ(ports)(sizeof(struct mlx4_if_stat_extended) +\ - sizeof(((struct mlx4_if_stat_extended *)0)->\ - counters[0]) * ports) - -struct mlx4_if_stat_extended { - struct mlx4_if_stat_control control; - struct { - __be64 IfRxUnicastFrames; - __be64 IfRxUnicastOctets; - __be64 IfRxMulticastFrames; - __be64 IfRxMulticastOctets; - __be64 IfRxBroadcastFrames; - __be64 IfRxBroadcastOctets; - __be64 IfRxNoBufferFrames; - __be64 IfRxNoBufferOctets; - __be64 IfRxErrorFrames; - __be64 IfRxErrorOctets; - __be32 reserved[39]; - __be64 IfTxUnicastFrames; - __be64 IfTxUnicastOctets; - __be64 IfTxMulticastFrames; - __be64 IfTxMulticastOctets; - __be64 IfTxBroadcastFrames; - __be64 IfTxBroadcastOctets; - __be64 IfTxDroppedFrames; - __be64 IfTxDroppedOctets; - __be64 IfTxRequestedFramesSent; - __be64 IfTxGeneratedFramesSent; - __be64 IfTxTsoOctets; - } __packed counters[]; -}; -#define MLX4_IF_STAT_EXT_SZ(ports) (sizeof(struct mlx4_if_stat_extended) +\ - sizeof(((struct mlx4_if_stat_extended *)\ - 0)->counters[0]) * ports) - -union mlx4_counter { - struct mlx4_if_stat_control control; - struct mlx4_if_stat_basic basic; - struct mlx4_if_stat_extended ext; -}; -#define MLX4_IF_STAT_SZ(ports) MLX4_IF_STAT_EXT_SZ(ports) - struct mlx4_quotas { int qp; int cq; @@ -800,8 +825,35 @@ struct mlx4_quotas { int xrcd; }; -struct mlx4_dev { +struct mlx4_vf_dev { + u8 min_port; + u8 n_ports; +}; + +enum mlx4_pci_status { + MLX4_PCI_STATUS_DISABLED, + MLX4_PCI_STATUS_ENABLED, +}; + +struct mlx4_dev_persistent { struct pci_dev *pdev; + struct mlx4_dev *dev; + int nvfs[MLX4_MAX_PORTS + 1]; + int num_vfs; + enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; + struct work_struct catas_work; + struct workqueue_struct *catas_wq; + struct mutex device_state_mutex; /* protect HW state */ + u8 state; + struct mutex interface_state_mutex; /* protect SW state */ + u8 interface_state; + struct mutex pci_status_mutex; /* sync pci state */ + enum mlx4_pci_status pci_status; +}; + +struct mlx4_dev { + struct mlx4_dev_persistent *persist; unsigned long flags; unsigned long num_slaves; struct mlx4_caps caps; @@ -809,14 +861,14 @@ struct mlx4_dev { struct mlx4_quotas quotas; struct radix_tree_root qp_table_tree; u8 rev_id; + u8 port_random_macs; char board_id[MLX4_BOARD_ID_LEN]; - u16 vsd_vendor_id; - char vsd[MLX4_VSD_LEN]; - int num_vfs; int numa_node; int oper_log_mgm_entry_size; u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; + struct mlx4_vf_dev *dev_vfs; + u8 uar_page_shift; }; struct mlx4_clock_params { @@ -892,6 +944,9 @@ struct mlx4_eqe { 
__be32 block_ptr; __be32 tbl_entries_mask; } __packed tbl_change_info; + struct { + u8 sl2vl_table[8]; + } __packed sl2vl_tbl_change_info; } params; } __packed port_mgmt_change; struct { @@ -922,39 +977,35 @@ struct mlx4_init_port_param { #define MAD_IFC_DATA_SZ 192 /* MAD IFC Mailbox */ struct mlx4_mad_ifc { - u8 base_version; - u8 mgmt_class; - u8 class_version; - u8 method; - __be16 status; - __be16 class_specific; - __be64 tid; - __be16 attr_id; - __be16 resv; - __be32 attr_mod; - __be64 mkey; - __be16 dr_slid; - __be16 dr_dlid; - u8 reserved[28]; - u8 data[MAD_IFC_DATA_SZ]; + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + u8 reserved[28]; + u8 data[MAD_IFC_DATA_SZ]; } __packed; #define mlx4_foreach_port(port, dev, type) \ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if ((type) == (dev)->caps.port_mask[(port)]) -#define mlx4_foreach_non_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ - if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB)) - #define mlx4_foreach_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \ + ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) #define MLX4_INVALID_SLAVE_ID 0xFF - -#define MLX4_SINK_COUNTER_INDEX 0xff +#define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) void handle_port_mgmt_change_event(struct work_struct *work); @@ -977,7 +1028,9 @@ static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev) static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn) { return (qpn < dev->phys_caps.base_sqpn + 8 + - 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev)); + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev) && + qpn >= dev->phys_caps.base_sqpn) || + (qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]); } static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn) @@ -1000,8 +1053,13 @@ static inline int mlx4_is_slave(struct mlx4_dev *dev) return dev->flags & MLX4_FLAG_SLAVE; } +static inline int mlx4_is_eth(struct mlx4_dev *dev, int port) +{ + return dev->caps.port_type[port] == MLX4_PORT_TYPE_IB ? 
0 : 1; +} + int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf); + struct mlx4_buf *buf, gfp_t gfp); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { @@ -1038,9 +1096,10 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw); int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf); + struct mlx4_buf *buf, gfp_t gfp); -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, + gfp_t gfp); void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, @@ -1052,12 +1111,12 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, unsigned vector, int collapsed, int timestamp_en); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); - int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base, u8 flags); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, + gfp_t gfp); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, @@ -1093,6 +1152,7 @@ enum mlx4_net_trans_rule_id { MLX4_NET_TRANS_RULE_ID_IPV4, MLX4_NET_TRANS_RULE_ID_TCP, MLX4_NET_TRANS_RULE_ID_UDP, + MLX4_NET_TRANS_RULE_ID_VXLAN, MLX4_NET_TRANS_RULE_NUM, /* should be last */ MLX4_NET_TRANS_RULE_DUMMY = -1, /* force enum to be signed */ }; @@ -1111,9 +1171,11 @@ static inline int map_hw_to_sw_id(u16 header_id) } enum mlx4_net_trans_promisc_mode { - MLX4_FS_REGULAR = 1, + MLX4_FS_REGULAR = 1, MLX4_FS_ALL_DEFAULT, MLX4_FS_MC_DEFAULT, + MLX4_FS_MIRROR_RX_PORT, + MLX4_FS_MIRROR_SX_PORT, MLX4_FS_UC_SNIFFER, MLX4_FS_MC_SNIFFER, MLX4_FS_MODE_NUM, /* should be last */ @@ -1121,10 +1183,10 @@ enum mlx4_net_trans_promisc_mode { }; struct mlx4_spec_eth { - u8 dst_mac[6]; - u8 dst_mac_msk[6]; - u8 src_mac[6]; - u8 src_mac_msk[6]; + u8 dst_mac[ETH_ALEN]; + u8 dst_mac_msk[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + u8 src_mac_msk[ETH_ALEN]; u8 ether_type_enable; __be16 ether_type; __be16 vlan_id_msk; @@ -1146,10 +1208,16 @@ struct mlx4_spec_ipv4 { }; struct mlx4_spec_ib { - __be32 l3_qpn; - __be32 qpn_msk; - u8 dst_gid[16]; - u8 dst_gid_msk[16]; + __be32 l3_qpn; + __be32 qpn_msk; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +}; + +struct mlx4_spec_vxlan { + __be32 vni; + __be32 vni_mask; + }; struct mlx4_spec_list { @@ -1160,6 +1228,7 @@ struct mlx4_spec_list { struct mlx4_spec_ib ib; struct mlx4_spec_ipv4 ipv4; struct mlx4_spec_tcp_udp tcp_udp; + struct mlx4_spec_vxlan vxlan; }; }; @@ -1246,6 +1315,15 @@ struct mlx4_net_trans_rule_hw_ipv4 { __be32 src_ip_msk; } __packed; +struct mlx4_net_trans_rule_hw_vxlan { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + __be32 vni; + __be32 vni_mask; +} __packed; + struct _rule_hw { union { struct { @@ -1257,9 +1335,22 @@ struct _rule_hw { struct mlx4_net_trans_rule_hw_ib ib; struct mlx4_net_trans_rule_hw_ipv4 ipv4; struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp; + struct mlx4_net_trans_rule_hw_vxlan vxlan; }; }; +enum { + 
VXLAN_STEER_BY_OUTER_MAC = 1 << 0, + VXLAN_STEER_BY_OUTER_VLAN = 1 << 1, + VXLAN_STEER_BY_VSID_VNI = 1 << 2, + VXLAN_STEER_BY_INNER_MAC = 1 << 3, + VXLAN_STEER_BY_INNER_VLAN = 1 << 4, +}; + +enum { + MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS = 0x2, +}; + int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, enum mlx4_net_trans_promisc_mode mode); int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, @@ -1268,6 +1359,7 @@ int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); @@ -1277,9 +1369,15 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, u8 promisc); -int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc); -int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, - u8 *pg, u16 *ratelimit); +int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time); +int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, + u8 ignore_fcs_value); +int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); +int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); +int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled); +int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); @@ -1293,28 +1391,42 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey); int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); -int mlx4_query_diag_counters(struct mlx4_dev *mlx4_dev, int array_length, - u8 op_modifier, u32 in_offset[], - u32 counter_out[]); - -int mlx4_test_interrupts(struct mlx4_dev *dev); -int mlx4_assign_eq(struct mlx4_dev *dev, char* name, int * vector); +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector); +int mlx4_test_async(struct mlx4_dev *dev); +int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, + const u32 offset[], u32 value[], + size_t array_len, u8 port); +u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port); +bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector); +int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector); void mlx4_release_eq(struct mlx4_dev *dev, int vec); +int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector); +int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec); + +int mlx4_get_phys_port_id(struct mlx4_dev *dev); int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); -int mlx4_counter_alloc(struct mlx4_dev *dev, u8 port, u32 *idx); -void mlx4_counter_free(struct mlx4_dev *dev, u8 port, u32 idx); +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void 
mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port); +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, + int port); +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port); +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port); int mlx4_flow_attach(struct mlx4_dev *dev, struct mlx4_net_trans_rule *rule, u64 *reg_id); int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id); -int map_sw_to_hw_steering_mode(struct mlx4_dev *dev, - enum mlx4_net_trans_promisc_mode flow_type); -int map_sw_to_hw_steering_id(struct mlx4_dev *dev, - enum mlx4_net_trans_rule_id id); -int hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id); +int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, + enum mlx4_net_trans_promisc_mode flow_type); +int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id); +int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id); + +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, + int port, int qpn, u16 prio, u64 *reg_id); void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val); @@ -1324,23 +1436,135 @@ int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey); int mlx4_is_slave_active(struct mlx4_dev *dev, int slave); int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port); int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port); -int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr, u16 lid, u8 sl); +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr); int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change); enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port); int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event *gen_event); void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid); __be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave); -int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id); -int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid); -int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn); +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id); +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, + u32 max_range_qpn); s64 mlx4_read_clock(struct mlx4_dev *dev); + +struct mlx4_active_ports { + DECLARE_BITMAP(ports, MLX4_MAX_PORTS); +}; +/* Returns a bitmap of the physical ports which are assigned to slave */ +struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave); + +/* Returns the physical port that represents the virtual port of the slave, */ +/* or a value < 0 in case of an error. If a slave has 2 ports, the identity */ +/* mapping is returned. */ +int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port); + +struct mlx4_slaves_pport { + DECLARE_BITMAP(slaves, MLX4_MFUNC_MAX); +}; +/* Returns a bitmap of all slaves that are assigned to port. 
*/ +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, + int port); + +/* Returns a bitmap of all slaves that are assigned exactly to all the */ +/* the ports that are set in crit_ports. */ +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( + struct mlx4_dev *dev, + const struct mlx4_active_ports *crit_ports); + +/* Returns the slave's virtual port that represents the physical port. */ +int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); + +int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port); +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enable); +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry); +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry); +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn); +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access); +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry); + +int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, + u16 offset, u16 size, u8 *data); +int mlx4_max_tc(struct mlx4_dev *dev); + +/* Returns true if running in low memory profile (kdump kernel) */ +static inline bool mlx4_low_memory_profile(void) +{ + return false; +} + +/* ACCESS REG commands */ +enum mlx4_access_reg_method { + MLX4_ACCESS_REG_QUERY = 0x1, + MLX4_ACCESS_REG_WRITE = 0x2, +}; + +/* ACCESS PTYS Reg command */ +enum mlx4_ptys_proto { + MLX4_PTYS_IB = 1<<0, + MLX4_PTYS_EN = 1<<2, +}; + +struct mlx4_ptys_reg { + u8 resrvd1; + u8 local_port; + u8 resrvd2; + u8 proto_mask; + __be32 resrvd3[2]; + __be32 eth_proto_cap; + __be16 ib_width_cap; + __be16 ib_speed_cap; + __be32 resrvd4; + __be32 eth_proto_admin; + __be16 ib_width_admin; + __be16 ib_speed_admin; + __be32 resrvd5; + __be32 eth_proto_oper; + __be16 ib_width_oper; + __be16 ib_speed_oper; + __be32 resrvd6; + __be32 eth_proto_lp_adv; +} __packed; + +int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev, + enum mlx4_access_reg_method method, + struct mlx4_ptys_reg *ptys_reg); + int mlx4_get_internal_clock_params(struct mlx4_dev *dev, struct mlx4_clock_params *params); -int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, - u16 offset, u16 size, u8 *data); +static inline int mlx4_to_hw_uar_index(struct mlx4_dev *dev, int index) +{ + return (index << (PAGE_SHIFT - dev->uar_page_shift)); +} +static inline int mlx4_get_num_reserved_uar(struct mlx4_dev *dev) +{ + /* The first 128 UARs are used for EQ doorbells */ + return (128 >> (PAGE_SHIFT - dev->uar_page_shift)); +} #endif /* MLX4_DEVICE_H */ diff --git a/sys/dev/mlx4/driver.h b/sys/dev/mlx4/driver.h index b6414572b37a..a12d6a508ef0 100644 --- a/sys/dev/mlx4/driver.h +++ b/sys/dev/mlx4/driver.h @@ -38,8 
+38,6 @@ struct mlx4_dev; #define MLX4_MAC_MASK 0xffffffffffffULL -#define MLX4_BE_SHORT_MASK cpu_to_be16(0xffff) -#define MLX4_BE_WORD_MASK cpu_to_be32(0xffffffff) enum mlx4_dev_event { MLX4_DEV_EVENT_CATASTROPHIC_ERROR, @@ -51,76 +49,46 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_SLAVE_SHUTDOWN, }; +enum { + MLX4_INTFF_BONDING = 1 << 0 +}; + struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); void (*event) (struct mlx4_dev *dev, void *context, enum mlx4_dev_event event, unsigned long param); void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); + void (*activate)(struct mlx4_dev *dev, void *context); struct list_head list; enum mlx4_protocol protocol; + int flags; }; -enum { - MLX4_MAX_DEVICES = 32, - MLX4_DEVS_TBL_SIZE = MLX4_MAX_DEVICES + 1, - MLX4_DBDF2VAL_STR_SIZE = 512, - MLX4_STR_NAME_SIZE = 64, - MLX4_MAX_BDF_VALS = 2, - MLX4_ENDOF_TBL = -1LL -}; - -struct mlx4_dbdf2val { - u64 dbdf; - int val[MLX4_MAX_BDF_VALS]; -}; - -struct mlx4_range { - int min; - int max; -}; - -/* - * mlx4_dbdf2val_lst struct holds all the data needed to convert - * dbdf-to-value-list string into dbdf-to-value table. - * dbdf-to-value-list string is a comma separated list of dbdf-to-value strings. - * the format of dbdf-to-value string is: "[mmmm:]bb:dd.f-v1[;v2]" - * mmmm - Domain number (optional) - * bb - Bus number - * dd - device number - * f - Function number - * v1 - First value related to the domain-bus-device-function. - * v2 - Second value related to the domain-bus-device-function (optional). - * bb, dd - Two hexadecimal digits without preceding 0x. - * mmmm - Four hexadecimal digits without preceding 0x. - * f - One hexadecimal without preceding 0x. - * v1,v2 - Number with normal convention (e.g 100, 0xd3). - * dbdf-to-value-list string format: - * "[mmmm:]bb:dd.f-v1[;v2],[mmmm:]bb:dd.f-v1[;v2],..." 
- * - */ -struct mlx4_dbdf2val_lst { - char name[MLX4_STR_NAME_SIZE]; /* String name */ - char str[MLX4_DBDF2VAL_STR_SIZE]; /* dbdf2val list str */ - struct mlx4_dbdf2val tbl[MLX4_DEVS_TBL_SIZE];/* dbdf to value table */ - int num_vals; /* # of vals per dbdf */ - int def_val[MLX4_MAX_BDF_VALS]; /* Default values */ - struct mlx4_range range; /* Valid values range */ -}; - -int mlx4_fill_dbdf2val_tbl(struct mlx4_dbdf2val_lst *dbdf2val_lst); -int mlx4_get_val(struct mlx4_dbdf2val *tbl, struct pci_dev *pdev, int idx, - int *val); - int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); -void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, - int port); +int mlx4_bond(struct mlx4_dev *dev); +int mlx4_unbond(struct mlx4_dev *dev); +static inline int mlx4_is_bonded(struct mlx4_dev *dev) +{ + return !!(dev->flags & MLX4_FLAG_BONDED); +} + +static inline int mlx4_is_mf_bonded(struct mlx4_dev *dev) +{ + return (mlx4_is_bonded(dev) && mlx4_is_mfunc(dev)); +} + +struct mlx4_port_map { + u8 port1; + u8 port2; +}; + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p); + +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); -#ifndef ETH_ALEN -#define ETH_ALEN 6 -#endif static inline u64 mlx4_mac_to_u64(const u8 *addr) { u64 mac = 0; diff --git a/sys/dev/mlx4/mlx4_core/fw.h b/sys/dev/mlx4/mlx4_core/fw.h index 748b2841d93d..c4c4883b4d06 100644 --- a/sys/dev/mlx4/mlx4_core/fw.h +++ b/sys/dev/mlx4/mlx4_core/fw.h @@ -43,6 +43,28 @@ struct mlx4_mod_stat_cfg { u8 log_pg_sz_m; }; +struct mlx4_port_cap { + u8 link_state; + u8 supported_port_types; + u8 suggested_type; + u8 default_sense; + u8 log_max_macs; + u8 log_max_vlans; + int ib_mtu; + int max_port_width; + int max_vl; + int max_tc_eth; + int max_gids; + int max_pkeys; + u64 def_mac; + u16 eth_mtu; + int trans_type; + int vendor_oui; + u16 wavelength; + u64 trans_code; + u8 dmfs_optimized_state; +}; + struct mlx4_dev_cap { int max_srq_sz; int max_qp_sz; @@ -58,26 +80,13 @@ struct mlx4_dev_cap { int max_eqs; int num_sys_eqs; int reserved_mtts; - int max_mrw_sz; int reserved_mrws; - int max_mtt_seg; int max_requester_per_qp; int max_responder_per_qp; int max_rdma_global; int local_ca_ack_delay; int num_ports; u32 max_msg_sz; - int ib_mtu[MLX4_MAX_PORTS + 1]; - int max_port_width[MLX4_MAX_PORTS + 1]; - int max_vl[MLX4_MAX_PORTS + 1]; - int max_gids[MLX4_MAX_PORTS + 1]; - int max_pkeys[MLX4_MAX_PORTS + 1]; - u64 def_mac[MLX4_MAX_PORTS + 1]; - u16 eth_mtu[MLX4_MAX_PORTS + 1]; - int trans_type[MLX4_MAX_PORTS + 1]; - int vendor_oui[MLX4_MAX_PORTS + 1]; - u16 wavelength[MLX4_MAX_PORTS + 1]; - u64 trans_code[MLX4_MAX_PORTS + 1]; u16 stat_rate_support; int fs_log_max_ucast_qp_range_size; int fs_max_num_qp_per_entry; @@ -115,15 +124,11 @@ struct mlx4_dev_cap { u64 max_icm_sz; int max_gso_sz; int max_rss_tbl_sz; - u8 supported_port_types[MLX4_MAX_PORTS + 1]; - u8 suggested_type[MLX4_MAX_PORTS + 1]; - u8 default_sense[MLX4_MAX_PORTS + 1]; - u8 log_max_macs[MLX4_MAX_PORTS + 1]; - u8 log_max_vlans[MLX4_MAX_PORTS + 1]; - u32 max_basic_counters; - u32 sync_qp; - u8 timestamp_support; - u32 max_extended_counters; + u32 max_counters; + u32 dmfs_high_rate_qpn_base; + u32 dmfs_high_rate_qpn_range; + struct mlx4_rate_limit_caps rl_caps; + struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1]; }; struct mlx4_func_cap { @@ -138,14 +143,17 @@ struct mlx4_func_cap { int max_eq; int reserved_eq; int mcg_quota; + u32 qp0_qkey; u32 qp0_tunnel_qpn; u32 
qp0_proxy_qpn; u32 qp1_tunnel_qpn; u32 qp1_proxy_qpn; + u32 reserved_lkey; u8 physical_port; - u8 port_flags; - u8 def_counter_index; - u8 extra_flags; + u8 flags0; + u8 flags1; + u64 phys_port_id; + u32 extra_flags; }; struct mlx4_func { @@ -159,9 +167,7 @@ struct mlx4_func { }; struct mlx4_adapter { - u16 vsd_vendor_id; char board_id[MLX4_BOARD_ID_LEN]; - char vsd[MLX4_VSD_LEN]; u8 inta_pin; }; @@ -180,7 +186,7 @@ struct mlx4_init_hca_param { u64 global_caps; u16 log_mc_entry_sz; u16 log_mc_hash_sz; - u16 hca_core_clock; + u16 hca_core_clock; /* Internal Clock Frequency (in MHz) */ u8 log_num_qps; u8 log_num_srqs; u8 log_num_cqs; @@ -190,11 +196,15 @@ struct mlx4_init_hca_param { u8 log_mc_table_sz; u8 log_mpt_sz; u8 log_uar_sz; + u8 mw_enabled; /* Enable memory windows */ u8 uar_page_sz; /* log pg sz in 4k chunks */ - u8 mw_enable; /* Enable memory windows */ - u8 fs_hash_enable_bits; u8 steering_mode; /* for QUERY_HCA */ + u8 dmfs_high_steer_mode; /* for QUERY_HCA */ u64 dev_cap_enabled; + u16 cqe_size; /* For use only when CQE stride feature enabled */ + u16 eqe_size; /* For use only when EQE stride feature enabled */ + u8 rss_ip_frags; + u8 phv_check_en; /* for QUERY_HCA */ }; struct mlx4_init_ib_param { @@ -218,14 +228,17 @@ struct mlx4_set_ib_param { u32 cap_mask; }; +void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); -int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, +int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap); +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, struct mlx4_func_cap *func_cap); int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd); +int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave); int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_FA(struct mlx4_dev *dev); int mlx4_RUN_FW(struct mlx4_dev *dev); @@ -236,9 +249,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic); int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt); int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages); +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); int mlx4_NOP(struct mlx4_dev *dev); int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); -int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave); void mlx4_opreq_action(struct work_struct *work); #endif /* MLX4_FW_H */ diff --git a/sys/dev/mlx4/mlx4_core/fw_qos.h b/sys/dev/mlx4/mlx4_core/fw_qos.h new file mode 100644 index 000000000000..fc91926df110 --- /dev/null +++ b/sys/dev/mlx4/mlx4_core/fw_qos.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_FW_QOS_H +#define MLX4_FW_QOS_H + +#include +#include + +#define MLX4_NUM_UP 8 +#define MLX4_NUM_TC 8 + +/* Default supported priorities for VPP allocation */ +#define MLX4_DEFAULT_QOS_PRIO (0) + +/* Derived from FW feature definition, 0 is the default vport fo all QPs */ +#define MLX4_VPP_DEFAULT_VPORT (0) + +struct mlx4_vport_qos_param { + u32 bw_share; + u32 max_avg_bw; + u8 enable; +}; + +/** + * mlx4_SET_PORT_PRIO2TC - This routine maps user priorities to traffic + * classes of a given port and device. + * + * @dev: mlx4_dev. + * @port: Physical port number. + * @prio2tc: Array of TC associated with each priorities. + * + * Returns 0 on success or a negative mlx4_core errno code. + **/ +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc); + +/** + * mlx4_SET_PORT_SCHEDULER - This routine configures the arbitration between + * traffic classes (ETS) and configured rate limit for traffic classes. + * tc_tx_bw, pg and ratelimit are arrays where each index represents a TC. + * The description for those parameters below refers to a single TC. + * + * @dev: mlx4_dev. + * @port: Physical port number. + * @tc_tx_bw: The percentage of the bandwidth allocated for traffic class + * within a TC group. The sum of the bw_percentage of all the traffic + * classes within a TC group must equal 100% for correct operation. + * @pg: The TC group the traffic class is associated with. + * @ratelimit: The maximal bandwidth allowed for the use by this traffic class. + * + * Returns 0 on success or a negative mlx4_core errno code. + **/ +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit); +/** + * mlx4_ALLOCATE_VPP_get - Query port VPP availible resources and allocation. + * Before distribution of VPPs to priorities, only availible_vpp is returned. + * After initialization it returns the distribution of VPPs among priorities. + * + * @dev: mlx4_dev. + * @port: Physical port number. + * @availible_vpp: Pointer to variable where number of availible VPPs is stored + * @vpp_p_up: Distribution of VPPs to priorities is stored in this array + * + * Returns 0 on success or a negative mlx4_core errno code. 
+ **/ +int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port, + u16 *availible_vpp, u8 *vpp_p_up); +/** + * mlx4_ALLOCATE_VPP_set - Distribution of VPPs among differnt priorities. + * The total number of VPPs assigned to all for a port must not exceed + * the value reported by availible_vpp in mlx4_ALLOCATE_VPP_get. + * VPP allocation is allowed only after the port type has been set, + * and while no QPs are open for this port. + * + * @dev: mlx4_dev. + * @port: Physical port number. + * @vpp_p_up: Allocation of VPPs to different priorities. + * + * Returns 0 on success or a negative mlx4_core errno code. + **/ +int mlx4_ALLOCATE_VPP_set(struct mlx4_dev *dev, u8 port, u8 *vpp_p_up); + +/** + * mlx4_SET_VPORT_QOS_get - Query QoS proporties of a Vport. + * Each priority allowed for the Vport is assigned with a share of the BW, + * and a BW limitation. This commands query the current QoS values. + * + * @dev: mlx4_dev. + * @port: Physical port number. + * @vport: Vport id. + * @out_param: Array of mlx4_vport_qos_param that will contain the values. + * + * Returns 0 on success or a negative mlx4_core errno code. + **/ +int mlx4_SET_VPORT_QOS_get(struct mlx4_dev *dev, u8 port, u8 vport, + struct mlx4_vport_qos_param *out_param); + +/** + * mlx4_SET_VPORT_QOS_set - Set QoS proporties of a Vport. + * QoS parameters can be modified at any time, but must be initialized + * before any QP is associated with the VPort. + * + * @dev: mlx4_dev. + * @port: Physical port number. + * @vport: Vport id. + * @out_param: Array of mlx4_vport_qos_param which holds the requested values. + * + * Returns 0 on success or a negative mlx4_core errno code. + **/ +int mlx4_SET_VPORT_QOS_set(struct mlx4_dev *dev, u8 port, u8 vport, + struct mlx4_vport_qos_param *in_param); + +#endif /* MLX4_FW_QOS_H */ diff --git a/sys/dev/mlx4/mlx4_core/icm.h b/sys/dev/mlx4/mlx4_core/icm.h index f7a2537bfbc0..82613515be0a 100644 --- a/sys/dev/mlx4/mlx4_core/icm.h +++ b/sys/dev/mlx4/mlx4_core/icm.h @@ -72,14 +72,15 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask, int coherent); void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + gfp_t gfp); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 start, u32 end); void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 start, u32 end); int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, - u64 virt, int obj_size, u64 nobj, int reserved, + u64 virt, int obj_size, u32 nobj, int reserved, int use_lowmem, int use_coherent); void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle); diff --git a/sys/dev/mlx4/mlx4_core/mlx4.h b/sys/dev/mlx4/mlx4_core/mlx4.h index f29175e7c2ca..4a4f63492df6 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4.h +++ b/sys/dev/mlx4/mlx4_core/mlx4.h @@ -44,18 +44,17 @@ #include #include #include +#include #include #include #include #include +#include #define DRV_NAME "mlx4_core" #define PFX DRV_NAME ": " -#define DRV_VERSION "2.1.6" - -#define DRV_STACK_NAME "Linux-MLNX_OFED" -#define DRV_STACK_VERSION "2.1" -#define DRV_NAME_FOR_FW DRV_STACK_NAME","DRV_STACK_VERSION +#define DRV_VERSION 
"3.4.1" +#define DRV_RELDATE "October 2017" #define MLX4_FS_UDP_UC_EN (1 << 1) #define MLX4_FS_TCP_UC_EN (1 << 2) @@ -63,20 +62,11 @@ #define MLX4_FS_MGM_LOG_ENTRY_SIZE 7 #define MLX4_FS_NUM_MCG (1 << 17) -struct mlx4_set_port_prio2tc_context { - u8 prio2tc[4]; -}; +#define INIT_HCA_TPT_MW_ENABLE (1 << 7) -struct mlx4_port_scheduler_tc_cfg_be { - __be16 pg; - __be16 bw_precentage; - __be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */ - __be16 max_bw_value; -}; +#define MLX4_QUERY_IF_STAT_RESET BIT(31) -struct mlx4_set_port_scheduler_context { - struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC]; -}; +#define ETH_P_8021AD 0x88A8 enum { MLX4_HCR_BASE = 0x80680, @@ -84,14 +74,17 @@ enum { MLX4_CLR_INT_SIZE = 0x00008, MLX4_SLAVE_COMM_BASE = 0x0, MLX4_COMM_PAGESIZE = 0x1000, - MLX4_CLOCK_SIZE = 0x00008 + MLX4_CLOCK_SIZE = 0x00008, + MLX4_COMM_CHAN_CAPS = 0x8, + MLX4_COMM_CHAN_FLAGS = 0xc }; enum { MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10, MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7, MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12, - MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE)/16 - 2), + MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE) / 16 - 2), + MLX4_MTT_ENTRY_PER_SEG = 8, }; enum { @@ -118,6 +111,10 @@ enum mlx4_mpt_state { }; #define MLX4_COMM_TIME 10000 +#define MLX4_COMM_OFFLINE_TIME_OUT 30000 +#define MLX4_COMM_CMD_NA_OP 0x0 + + enum { MLX4_COMM_CMD_RESET, MLX4_COMM_CMD_VHCR0, @@ -128,6 +125,11 @@ enum { MLX4_COMM_CMD_FLR = 254 }; +enum { + MLX4_VF_SMI_DISABLED, + MLX4_VF_SMI_ENABLED +}; + /*The flag indicates that the slave should delay the RESET cmd*/ #define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb /*indicates how many retries will be done if we are in the middle of FLR*/ @@ -191,7 +193,7 @@ struct mlx4_vhcr_cmd { u8 status; u8 flags; __be16 opcode; -} __packed; +}; struct mlx4_cmd_info { u16 opcode; @@ -199,7 +201,6 @@ struct mlx4_cmd_info { bool has_outbox; bool out_is_imm; bool encode_slave_id; - bool skip_err_print; int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox); int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, @@ -208,35 +209,33 @@ struct mlx4_cmd_info { struct mlx4_cmd_info *cmd); }; -enum { - MLX4_DEBUG_MASK_CMD_TIME = 0x100, -}; - #ifdef CONFIG_MLX4_DEBUG extern int mlx4_debug_level; #else /* CONFIG_MLX4_DEBUG */ #define mlx4_debug_level (0) #endif /* CONFIG_MLX4_DEBUG */ -#define mlx4_dbg(mdev, format, arg...) \ +#define mlx4_dbg(mdev, format, ...) \ do { \ if (mlx4_debug_level) \ - dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ##arg); \ + dev_printk(KERN_DEBUG, \ + &(mdev)->persist->pdev->dev, format, \ + ##__VA_ARGS__); \ } while (0) -#define mlx4_err(mdev, format, arg...) \ - dev_err(&mdev->pdev->dev, format, ##arg) -#define mlx4_info(mdev, format, arg...) \ - dev_info(&mdev->pdev->dev, format, ##arg) -#define mlx4_warn(mdev, format, arg...) \ - dev_warn(&mdev->pdev->dev, format, ##arg) +#define mlx4_err(mdev, format, ...) \ + dev_err(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) +#define mlx4_info(mdev, format, ...) \ + dev_info(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) +#define mlx4_warn(mdev, format, ...) 
\ + dev_warn(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; -extern int mlx4_blck_lb; -extern int mlx4_set_4k_mtu; +extern int mlx4_internal_err_reset; -#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) +#define MLX4_MAX_NUM_SLAVES (min(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF, \ + MLX4_MFUNC_MAX)) #define ALL_SLAVES 0xff struct mlx4_bitmap { @@ -246,6 +245,7 @@ struct mlx4_bitmap { u32 reserved_top; u32 mask; u32 avail; + u32 effective_len; spinlock_t lock; unsigned long *table; }; @@ -277,6 +277,8 @@ struct mlx4_icm_table { #define MLX4_MPT_FLAG_PHYSICAL (1 << 9) #define MLX4_MPT_FLAG_REGION (1 << 8) +#define MLX4_MPT_PD_MASK (0x1FFFFUL) +#define MLX4_MPT_PD_VF_MASK (0xFE0000UL) #define MLX4_MPT_PD_FLAG_FAST_REG (1 << 27) #define MLX4_MPT_PD_FLAG_RAE (1 << 28) #define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) @@ -286,6 +288,15 @@ struct mlx4_icm_table { #define MLX4_MPT_STATUS_SW 0xF0 #define MLX4_MPT_STATUS_HW 0x00 +#define MLX4_CQE_SIZE_MASK_STRIDE 0x3 +#define MLX4_EQE_SIZE_MASK_STRIDE 0x30 + +#define MLX4_EQ_ASYNC 0 +#define MLX4_EQ_TO_CQ_VECTOR(vector) ((vector) - \ + !!((int)(vector) >= MLX4_EQ_ASYNC)) +#define MLX4_CQ_TO_EQ_VECTOR(vector) ((vector) + \ + !!((int)(vector) >= MLX4_EQ_ASYNC)) + /* * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. */ @@ -381,6 +392,10 @@ struct mlx4_eq { int nent; struct mlx4_buf_list *page_list; struct mlx4_mtt mtt; + u32 ncqs; + struct mlx4_active_ports actv_ports; + u32 ref_count; + int affinity_cpu_id; }; struct mlx4_slave_eqe { @@ -401,7 +416,7 @@ struct mlx4_profile { int num_cq; int num_mcg; int num_mpt; - unsigned num_mtt_segs; + unsigned num_mtt; }; struct mlx4_fw { @@ -460,6 +475,7 @@ struct mlx4_slave_state { u8 init_port_mask; bool active; bool old_vlan_api; + bool vst_qinq_supported; u8 function; dma_addr_t vhcr_dma; u16 mtu[MLX4_MAX_PORTS + 1]; @@ -481,18 +497,20 @@ struct mlx4_slave_state { #define MLX4_VGT 4095 #define NO_INDX (-1) - struct mlx4_vport_state { u64 mac; u16 default_vlan; u8 default_qos; + __be16 vlan_proto; u32 tx_rate; bool spoofchk; - u32 link_state; + u8 qos_vport; + __be64 guid; }; struct mlx4_vf_admin_state { struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1]; + u8 enable_smi[MLX4_MAX_PORTS + 1]; }; struct mlx4_vport_oper_state { @@ -500,8 +518,10 @@ struct mlx4_vport_oper_state { int mac_idx; int vlan_idx; }; + struct mlx4_vf_oper_state { struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1]; + u8 smi_enabled[MLX4_MAX_PORTS + 1]; }; struct slave_list { @@ -510,7 +530,7 @@ struct slave_list { }; struct resource_allocator { - spinlock_t alloc_lock; + spinlock_t alloc_lock; /* protect quotas */ union { int res_reserved; int res_port_rsvd[MLX4_MAX_PORTS]; @@ -542,6 +562,11 @@ struct mlx4_slave_event_eq { struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE]; }; +struct mlx4_qos_manager { + int num_of_qos_vfs; + DECLARE_BITMAP(priority_bm, MLX4_NUM_UP); +}; + struct mlx4_master_qp0_state { int proxy_qp0_active; int qp0_active; @@ -555,11 +580,12 @@ struct mlx4_mfunc_master_ctx { struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1]; int init_port_ref[MLX4_MAX_PORTS + 1]; u16 max_mtu[MLX4_MAX_PORTS + 1]; + u8 pptx; + u8 pprx; int disable_mcast_ref[MLX4_MAX_PORTS + 1]; struct mlx4_resource_tracker res_tracker; struct workqueue_struct *comm_wq; struct work_struct comm_work; - struct work_struct arm_comm_work; struct work_struct slave_event_work; struct work_struct slave_flr_event_work; spinlock_t slave_state_lock; @@ -567,6 +593,7 
@@ struct mlx4_mfunc_master_ctx { struct mlx4_eqe cmd_eqe; struct mlx4_slave_event_eq slave_eq; struct mutex gen_eqe_mutex[MLX4_MFUNC_MAX]; + struct mlx4_qos_manager qos_ctl[MLX4_MAX_PORTS + 1]; }; struct mlx4_mfunc { @@ -591,10 +618,10 @@ struct mlx4_mgm { struct mlx4_cmd { struct pci_pool *pool; void __iomem *hcr; - struct mutex hcr_mutex; struct mutex slave_cmd_mutex; struct semaphore poll_sem; struct semaphore event_sem; + struct rw_semaphore switch_sem; int max_cmds; spinlock_t context_lock; int free_head; @@ -603,11 +630,13 @@ struct mlx4_cmd { u8 use_events; u8 toggle; u8 comm_toggle; + u8 initialized; }; enum { MLX4_VF_IMMED_VLAN_FLAG_VLAN = 1 << 0, MLX4_VF_IMMED_VLAN_FLAG_QOS = 1 << 1, + MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE = 1 << 2, }; struct mlx4_vf_immed_vlan_work { struct work_struct work; @@ -618,8 +647,10 @@ struct mlx4_vf_immed_vlan_work { int orig_vlan_ix; u8 port; u8 qos; + u8 qos_vport; u16 vlan_id; u16 orig_vlan_id; + __be16 vlan_proto; }; @@ -639,7 +670,6 @@ struct mlx4_mr_table { struct mlx4_cq_table { struct mlx4_bitmap bitmap; spinlock_t lock; - rwlock_t cq_table_lock; struct radix_tree_root tree; struct mlx4_icm_table table; struct mlx4_icm_table cmpt_table; @@ -666,8 +696,17 @@ struct mlx4_srq_table { struct mlx4_icm_table cmpt_table; }; +enum mlx4_qp_table_zones { + MLX4_QP_TABLE_ZONE_GENERAL, + MLX4_QP_TABLE_ZONE_RSS, + MLX4_QP_TABLE_ZONE_RAW_ETH, + MLX4_QP_TABLE_ZONE_NUM +}; + struct mlx4_qp_table { - struct mlx4_bitmap bitmap; + struct mlx4_bitmap *bitmap_gen; + struct mlx4_zone_allocator *zones; + u32 zones_uids[MLX4_QP_TABLE_ZONE_NUM]; u32 rdmarc_base; int rdmarc_shift; spinlock_t lock; @@ -696,17 +735,30 @@ struct mlx4_catas_err { struct mlx4_mac_table { __be64 entries[MLX4_MAX_MAC_NUM]; int refs[MLX4_MAX_MAC_NUM]; + bool is_dup[MLX4_MAX_MAC_NUM]; struct mutex mutex; int total; int max; }; +#define MLX4_ROCE_GID_ENTRY_SIZE 16 + +struct mlx4_roce_gid_entry { + u8 raw[MLX4_ROCE_GID_ENTRY_SIZE]; +}; + +struct mlx4_roce_gid_table { + struct mlx4_roce_gid_entry roce_gids[MLX4_ROCE_MAX_GIDS]; + struct mutex mutex; +}; + #define MLX4_MAX_VLAN_NUM 128 #define MLX4_VLAN_TABLE_SIZE (MLX4_MAX_VLAN_NUM << 2) struct mlx4_vlan_table { __be32 entries[MLX4_MAX_VLAN_NUM]; int refs[MLX4_MAX_VLAN_NUM]; + int is_dup[MLX4_MAX_VLAN_NUM]; struct mutex mutex; int total; int max; @@ -722,11 +774,15 @@ enum { MCAST_DEFAULT = 2 }; - struct mlx4_set_port_general_context { - u8 reserved[3]; + u16 reserved1; + u8 v_ignore_fcs; u8 flags; - u16 reserved2; + union { + u8 ignore_fcs; + u8 roce_mode; + }; + u8 reserved2; __be16 mtu; u8 pptx; u8 pfctx; @@ -734,6 +790,9 @@ struct mlx4_set_port_general_context { u8 pprx; u8 pfcrx; u16 reserved4; + u32 reserved5; + u8 phv_en; + u8 reserved6[3]; }; struct mlx4_set_port_rqp_calc_context { @@ -754,13 +813,6 @@ struct mlx4_set_port_rqp_calc_context { __be32 mcast; }; -struct mlx4_hca_info { - struct mlx4_dev *dev; - struct device_attribute firmware_attr; - struct device_attribute hca_attr; - struct device_attribute board_attr; -}; - struct mlx4_port_info { struct mlx4_dev *dev; int port; @@ -771,6 +823,7 @@ struct mlx4_port_info { struct device_attribute port_mtu_attr; struct mlx4_mac_table mac_table; struct mlx4_vlan_table vlan_table; + struct mlx4_roce_gid_table gid_table; int base_qpn; }; @@ -779,10 +832,11 @@ struct mlx4_sense { u8 do_sense_port[MLX4_MAX_PORTS + 1]; u8 sense_allowed[MLX4_MAX_PORTS + 1]; struct delayed_work sense_poll; + int gone; }; struct mlx4_msix_ctl { - u64 pool_bm; + DECLARE_BITMAP(pool_bm, MAX_MSIX); struct mutex 
pool_lock; }; @@ -796,22 +850,6 @@ enum { MLX4_PCI_DEV_FORCE_SENSE_PORT = 1 << 1, }; -struct mlx4_roce_gid_entry { - u8 raw[16]; -}; - -struct counter_index { - struct list_head list; - u32 index; -}; - -struct mlx4_counters { - struct mlx4_bitmap bitmap; - struct list_head global_port_list[MLX4_MAX_PORTS]; - struct list_head vf_list[MLX4_MAX_NUM_VF][MLX4_MAX_PORTS]; - struct mutex mutex; -}; - enum { MLX4_NO_RR = 0, MLX4_USE_RR = 1, @@ -825,6 +863,7 @@ struct mlx4_priv { spinlock_t ctx_lock; int pci_dev_data; + int removed; struct list_head pgdir_list; struct mutex pgdir_mutex; @@ -842,7 +881,8 @@ struct mlx4_priv { struct mlx4_srq_table srq_table; struct mlx4_qp_table qp_table; struct mlx4_mcg_table mcg_table; - struct mlx4_counters counters_table; + struct mlx4_bitmap counters_bitmap; + int def_counter[MLX4_MAX_PORTS]; struct mlx4_catas_err catas_err; @@ -851,7 +891,6 @@ struct mlx4_priv { struct mlx4_uar driver_uar; void __iomem *kar; struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; - struct mlx4_hca_info hca_info; struct mlx4_sense sense; struct mutex port_mutex; struct mlx4_msix_ctl msix_ctl; @@ -863,8 +902,10 @@ struct mlx4_priv { int reserved_mtts; int fs_hash_mode; u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct mlx4_port_map v2p; /* cached port mapping configuration */ + struct mutex bond_mutex; /* for bond mode */ __be64 slave_node_guids[MLX4_MFUNC_MAX]; - struct mlx4_roce_gid_entry roce_gids[MLX4_MAX_PORTS][128]; + atomic_t opreq_count; struct work_struct opreq_task; }; @@ -913,7 +954,7 @@ void mlx4_cleanup_cq_table(struct mlx4_dev *dev); void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp); void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); @@ -921,7 +962,7 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); int __mlx4_mpt_reserve(struct mlx4_dev *dev); void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index); -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index); +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp); void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index); u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); @@ -956,6 +997,11 @@ int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd); +int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -968,20 +1014,17 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list); -int __mlx4_counter_alloc(struct mlx4_dev *dev, int slave, int port, u32 *idx); -void __mlx4_counter_free(struct mlx4_dev *dev, int slave, int port, u32 idx); - -int 
__mlx4_slave_counters_free(struct mlx4_dev *dev, int slave); -int __mlx4_clear_if_stat(struct mlx4_dev *dev, - u8 counter_index); -u8 mlx4_get_default_counter_index(struct mlx4_dev *dev, int slave, int port); - +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port, + struct mlx4_counter *data); int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); -void mlx4_catas_init(void); +int mlx4_catas_init(struct mlx4_dev *dev); +void mlx4_catas_end(struct mlx4_dev *dev); int mlx4_restart_one(struct pci_dev *pdev); int mlx4_register_device(struct mlx4_dev *dev); void mlx4_unregister_device(struct mlx4_dev *dev); @@ -996,7 +1039,6 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca); void mlx4_master_comm_channel(struct work_struct *work); -void mlx4_master_arm_comm_channel(struct work_struct *work); void mlx4_gen_slave_eqe(struct work_struct *work); void mlx4_master_handle_slave_flr(struct work_struct *work); @@ -1137,17 +1179,27 @@ int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave, int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe); +enum { + MLX4_CMD_CLEANUP_STRUCT = 1UL << 0, + MLX4_CMD_CLEANUP_POOL = 1UL << 1, + MLX4_CMD_CLEANUP_HCR = 1UL << 2, + MLX4_CMD_CLEANUP_VHCR = 1UL << 3, + MLX4_CMD_CLEANUP_ALL = (MLX4_CMD_CLEANUP_VHCR << 1) - 1 +}; + int mlx4_cmd_init(struct mlx4_dev *dev); -void mlx4_cmd_cleanup(struct mlx4_dev *dev); +void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask); int mlx4_multi_func_init(struct mlx4_dev *dev); +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev); void mlx4_multi_func_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); int mlx4_cmd_use_events(struct mlx4_dev *dev); void mlx4_cmd_use_polling(struct mlx4_dev *dev); int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, - unsigned long timeout); + u16 op, unsigned long timeout); +void mlx4_cq_tasklet_cb(unsigned long data); void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn); void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type); @@ -1155,7 +1207,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type); void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); -void mlx4_handle_catas_err(struct mlx4_dev *dev); +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist); int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, enum mlx4_port_type *type); @@ -1172,8 +1224,14 @@ int mlx4_change_port_types(struct mlx4_dev *dev, void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); +void mlx4_init_roce_gid_table(struct mlx4_dev *dev, + struct mlx4_roce_gid_table *table); void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); +int mlx4_bond_vlan_table(struct mlx4_dev *dev); +int mlx4_unbond_vlan_table(struct mlx4_dev *dev); +int mlx4_bond_mac_table(struct mlx4_dev *dev); +int mlx4_unbond_mac_table(struct mlx4_dev *dev); int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz); /* resource tracker functions*/ @@ -1181,6 +1239,7 @@ int 
mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, enum mlx4_resource resource_type, u64 resource_id, int *slave); void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id); +void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave); int mlx4_init_resource_tracker(struct mlx4_dev *dev); void mlx4_free_resource_tracker(struct mlx4_dev *dev, @@ -1227,6 +1286,12 @@ int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd); +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -1241,7 +1306,6 @@ int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], u8 port, int block_mcast_loopback, enum mlx4_protocol prot, u64 *reg_id); -int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -1279,11 +1343,11 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd); -int mlx4_MOD_STAT_CFG_wrapper(struct mlx4_dev *dev, int slave, - struct mlx4_vhcr *vhcr, - struct mlx4_cmd_mailbox *inbox, - struct mlx4_cmd_mailbox *outbox, - struct mlx4_cmd_info *cmd); +int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_get_mgm_entry_size(struct mlx4_dev *dev); int mlx4_get_qp_per_mgm(struct mlx4_dev *dev); @@ -1315,13 +1379,86 @@ static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev) #define NOT_MASKED_PD_BITS 17 -void sys_tune_init(void); -void sys_tune_fini(void); +void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work); void mlx4_init_quotas(struct mlx4_dev *dev); -int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave); -int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave); -void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work); +/* for VFs, replace zero MACs with randomly-generated MACs at driver start */ +void mlx4_replace_zero_macs(struct mlx4_dev *dev); +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); +/* Returns the VF index of slave */ +int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); +int mlx4_config_mad_demux(struct mlx4_dev *dev); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable); +int mlx4_bond_fs_rules(struct mlx4_dev *dev); +int mlx4_unbond_fs_rules(struct mlx4_dev *dev); + +enum mlx4_zone_flags { + MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO = 1UL << 0, + MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO = 1UL << 1, + MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO = 1UL << 2, + MLX4_ZONE_USE_RR = 1UL << 3, +}; + +enum mlx4_zone_alloc_flags { + /* No two objects could overlap between zones. UID + * could be left unused. If this flag is given and + * two overlapped zones are used, an object will be free'd + * from the smallest possible matching zone. 
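+ * A rough usage sketch (illustrative only; "bitmap" stands for a local
+ * struct mlx4_bitmap already set up with mlx4_bitmap_init, priority and
+ * offset are both 0, and error handling is skipped):
+ *
+ *    struct mlx4_zone_allocator *zones;
+ *    u32 uid, obj;
+ *
+ *    zones = mlx4_zone_allocator_create(MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP);
+ *    mlx4_zone_add_one(zones, &bitmap, MLX4_ZONE_USE_RR, 0, 0, &uid);
+ *    obj = mlx4_zone_alloc_entries(zones, uid, 1, 1, 0, NULL);
+ *    mlx4_zone_free_entries_unique(zones, obj, 1);
+ *    mlx4_zone_allocator_destroy(zones);
+ *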
+ */ + MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP = 1UL << 0, +}; + +struct mlx4_zone_allocator; + +/* Create a new zone allocator */ +struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags); + +/* Attach a mlx4_bitmap of priority to the zone allocator + * . Allocating an object from this zone adds an offset . + * Similarly, when searching for an object to free, this offset it taken into + * account. The use_rr mlx4_ib parameter for allocating objects from this + * is given through the MLX4_ZONE_USE_RR flag in . + * When an allocation fails, tries to allocate from other zones + * according to the policy set by . is the unique identifier + * received to this zone. + */ +int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc, + struct mlx4_bitmap *bitmap, + u32 flags, + int priority, + int offset, + u32 *puid); + +/* Remove bitmap indicated by from */ +int mlx4_zone_remove_one(struct mlx4_zone_allocator *zone_alloc, u32 uid); + +/* Delete the zone allocator objects with align and skip_mask + * from the mlx4_bitmap whose uid is . The bitmap which we actually + * allocated from is returned in . If the allocation fails, a negative + * number is returned. Otherwise, the offset of the first object is returned. + */ +u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count, + int align, u32 skip_mask, u32 *puid); + +/* Free objects, start from of the uid from zone_allocator + * . + */ +u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, + u32 uid, u32 obj, u32 count); + +/* If was allocated with MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP, instead of + * specifying the uid when freeing an object, zone allocator could figure it by + * itself. Other parameters are similar to mlx4_zone_free. + */ +u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count); + +/* Returns a pointer to mlx4_bitmap that was attached to with */ +struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid); #endif /* MLX4_H */ diff --git a/sys/dev/mlx4/mlx4_core/mlx4_alloc.c b/sys/dev/mlx4/mlx4_core/mlx4_alloc.c index da36b34d98b5..dcf6204dfc1a 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_alloc.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_alloc.c @@ -116,12 +116,12 @@ u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, spin_lock(&bitmap->lock); obj = find_aligned_range(bitmap->table, bitmap->last, - bitmap->max, cnt, align, skip_mask); + bitmap->max, cnt, align, skip_mask); if (obj >= bitmap->max) { bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) & bitmap->mask; obj = find_aligned_range(bitmap->table, 0, bitmap->max, - cnt, align, skip_mask); + cnt, align, skip_mask); } if (obj < bitmap->max) { @@ -148,6 +148,11 @@ u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap) return bitmap->avail; } +static u32 mlx4_bitmap_masked_value(struct mlx4_bitmap *bitmap, u32 obj) +{ + return obj & (bitmap->max + bitmap->reserved_top - 1); +} + void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt, int use_rr) { @@ -167,23 +172,17 @@ void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt, int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved_bot, u32 reserved_top) { - /* sanity check */ - if (num <= (u64)reserved_top + reserved_bot) - return -EINVAL; - /* num must be a power of 2 */ if (num != roundup_pow_of_two(num)) return -EINVAL; - if (reserved_bot + reserved_top >= num) - return -EINVAL; - bitmap->last = 0; bitmap->top = 0; bitmap->max = num - 
reserved_top; bitmap->mask = mask; bitmap->reserved_top = reserved_top; bitmap->avail = num - reserved_top - reserved_bot; + bitmap->effective_len = bitmap->avail; spin_lock_init(&bitmap->lock); bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) * sizeof (long), GFP_KERNEL); @@ -200,6 +199,382 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) kfree(bitmap->table); } +struct mlx4_zone_allocator { + struct list_head entries; + struct list_head prios; + u32 last_uid; + u32 mask; + /* protect the zone_allocator from concurrent accesses */ + spinlock_t lock; + enum mlx4_zone_alloc_flags flags; +}; + +struct mlx4_zone_entry { + struct list_head list; + struct list_head prio_list; + u32 uid; + struct mlx4_zone_allocator *allocator; + struct mlx4_bitmap *bitmap; + int use_rr; + int priority; + int offset; + enum mlx4_zone_flags flags; +}; + +struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags) +{ + struct mlx4_zone_allocator *zones = kmalloc(sizeof(*zones), GFP_KERNEL); + + if (NULL == zones) + return NULL; + + INIT_LIST_HEAD(&zones->entries); + INIT_LIST_HEAD(&zones->prios); + spin_lock_init(&zones->lock); + zones->last_uid = 0; + zones->mask = 0; + zones->flags = flags; + + return zones; +} + +int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc, + struct mlx4_bitmap *bitmap, + u32 flags, + int priority, + int offset, + u32 *puid) +{ + u32 mask = mlx4_bitmap_masked_value(bitmap, (u32)-1); + struct mlx4_zone_entry *it; + struct mlx4_zone_entry *zone = kmalloc(sizeof(*zone), GFP_KERNEL); + + if (NULL == zone) + return -ENOMEM; + + zone->flags = flags; + zone->bitmap = bitmap; + zone->use_rr = (flags & MLX4_ZONE_USE_RR) ? MLX4_USE_RR : 0; + zone->priority = priority; + zone->offset = offset; + + spin_lock(&zone_alloc->lock); + + zone->uid = zone_alloc->last_uid++; + zone->allocator = zone_alloc; + + if (zone_alloc->mask < mask) + zone_alloc->mask = mask; + + list_for_each_entry(it, &zone_alloc->prios, prio_list) + if (it->priority >= priority) + break; + + if (&it->prio_list == &zone_alloc->prios || it->priority > priority) + list_add_tail(&zone->prio_list, &it->prio_list); + list_add_tail(&zone->list, &it->list); + + spin_unlock(&zone_alloc->lock); + + *puid = zone->uid; + + return 0; +} + +/* Should be called under a lock */ +static int __mlx4_zone_remove_one_entry(struct mlx4_zone_entry *entry) +{ + struct mlx4_zone_allocator *zone_alloc = entry->allocator; + + if (!list_empty(&entry->prio_list)) { + /* Check if we need to add an alternative node to the prio list */ + if (!list_is_last(&entry->list, &zone_alloc->entries)) { + struct mlx4_zone_entry *next = list_first_entry(&entry->list, + typeof(*next), + list); + + if (next->priority == entry->priority) + list_add_tail(&next->prio_list, &entry->prio_list); + } + + list_del(&entry->prio_list); + } + + list_del(&entry->list); + + if (zone_alloc->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP) { + u32 mask = 0; + struct mlx4_zone_entry *it; + + list_for_each_entry(it, &zone_alloc->prios, prio_list) { + u32 cur_mask = mlx4_bitmap_masked_value(it->bitmap, (u32)-1); + + if (mask < cur_mask) + mask = cur_mask; + } + zone_alloc->mask = mask; + } + + return 0; +} + +void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc) +{ + struct mlx4_zone_entry *zone, *tmp; + + spin_lock(&zone_alloc->lock); + + list_for_each_entry_safe(zone, tmp, &zone_alloc->entries, list) { + list_del(&zone->list); + list_del(&zone->prio_list); + kfree(zone); + } + + spin_unlock(&zone_alloc->lock); + kfree(zone_alloc); 
+} + +/* Should be called under a lock */ +static u32 __mlx4_alloc_from_zone(struct mlx4_zone_entry *zone, int count, + int align, u32 skip_mask, u32 *puid) +{ + u32 uid = 0; + u32 res; + struct mlx4_zone_allocator *zone_alloc = zone->allocator; + struct mlx4_zone_entry *curr_node; + + res = mlx4_bitmap_alloc_range(zone->bitmap, count, + align, skip_mask); + + if (res != (u32)-1) { + res += zone->offset; + uid = zone->uid; + goto out; + } + + list_for_each_entry(curr_node, &zone_alloc->prios, prio_list) { + if (unlikely(curr_node->priority == zone->priority)) + break; + } + + if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO) { + struct mlx4_zone_entry *it = curr_node; + + list_for_each_entry_continue_reverse(it, &zone_alloc->entries, list) { + res = mlx4_bitmap_alloc_range(it->bitmap, count, + align, skip_mask); + if (res != (u32)-1) { + res += it->offset; + uid = it->uid; + goto out; + } + } + } + + if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO) { + struct mlx4_zone_entry *it = curr_node; + + list_for_each_entry_from(it, &zone_alloc->entries, list) { + if (unlikely(it == zone)) + continue; + + if (unlikely(it->priority != curr_node->priority)) + break; + + res = mlx4_bitmap_alloc_range(it->bitmap, count, + align, skip_mask); + if (res != (u32)-1) { + res += it->offset; + uid = it->uid; + goto out; + } + } + } + + if (zone->flags & MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO) { + if (list_is_last(&curr_node->prio_list, &zone_alloc->prios)) + goto out; + + curr_node = list_first_entry(&curr_node->prio_list, + typeof(*curr_node), + prio_list); + + list_for_each_entry_from(curr_node, &zone_alloc->entries, list) { + res = mlx4_bitmap_alloc_range(curr_node->bitmap, count, + align, skip_mask); + if (res != (u32)-1) { + res += curr_node->offset; + uid = curr_node->uid; + goto out; + } + } + } + +out: + if (NULL != puid && res != (u32)-1) + *puid = uid; + return res; +} + +/* Should be called under a lock */ +static void __mlx4_free_from_zone(struct mlx4_zone_entry *zone, u32 obj, + u32 count) +{ + mlx4_bitmap_free_range(zone->bitmap, obj - zone->offset, count, zone->use_rr); +} + +/* Should be called under a lock */ +static struct mlx4_zone_entry *__mlx4_find_zone_by_uid( + struct mlx4_zone_allocator *zones, u32 uid) +{ + struct mlx4_zone_entry *zone; + + list_for_each_entry(zone, &zones->entries, list) { + if (zone->uid == uid) + return zone; + } + + return NULL; +} + +struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid) +{ + struct mlx4_zone_entry *zone; + struct mlx4_bitmap *bitmap; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + bitmap = zone == NULL ? NULL : zone->bitmap; + + spin_unlock(&zones->lock); + + return bitmap; +} + +int mlx4_zone_remove_one(struct mlx4_zone_allocator *zones, u32 uid) +{ + struct mlx4_zone_entry *zone; + int res; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + if (NULL == zone) { + res = -1; + goto out; + } + + res = __mlx4_zone_remove_one_entry(zone); + +out: + spin_unlock(&zones->lock); + kfree(zone); + + return res; +} + +/* Should be called under a lock */ +static struct mlx4_zone_entry *__mlx4_find_zone_by_uid_unique( + struct mlx4_zone_allocator *zones, u32 obj) +{ + struct mlx4_zone_entry *zone, *zone_candidate = NULL; + u32 dist = (u32)-1; + + /* Search for the smallest zone that this obj could be + * allocated from. 
This is done in order to handle + * situations when small bitmaps are allocated from bigger + * bitmaps (and the allocated space is marked as reserved in + * the bigger bitmap. + */ + list_for_each_entry(zone, &zones->entries, list) { + if (obj >= zone->offset) { + u32 mobj = (obj - zone->offset) & zones->mask; + + if (mobj < zone->bitmap->max) { + u32 curr_dist = zone->bitmap->effective_len; + + if (curr_dist < dist) { + dist = curr_dist; + zone_candidate = zone; + } + } + } + } + + return zone_candidate; +} + +u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count, + int align, u32 skip_mask, u32 *puid) +{ + struct mlx4_zone_entry *zone; + int res = -1; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + if (NULL == zone) + goto out; + + res = __mlx4_alloc_from_zone(zone, count, align, skip_mask, puid); + +out: + spin_unlock(&zones->lock); + + return res; +} + +u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, u32 uid, u32 obj, u32 count) +{ + struct mlx4_zone_entry *zone; + int res = 0; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + if (NULL == zone) { + res = -1; + goto out; + } + + __mlx4_free_from_zone(zone, obj, count); + +out: + spin_unlock(&zones->lock); + + return res; +} + +u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count) +{ + struct mlx4_zone_entry *zone; + int res; + + if (!(zones->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP)) + return -EFAULT; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid_unique(zones, obj); + + if (NULL == zone) { + res = -1; + goto out; + } + + __mlx4_free_from_zone(zone, obj, count); + res = 0; + +out: + spin_unlock(&zones->lock); + + return res; +} /* * Handling for queue buffers -- we allocate a bunch of memory and * register it in a memory region at HCA virtual address 0. 
If the @@ -208,7 +583,7 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) */ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf) + struct mlx4_buf *buf, gfp_t gfp) { dma_addr_t t; @@ -216,8 +591,8 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->nbufs = 1; buf->npages = 1; buf->page_shift = get_order(size) + PAGE_SHIFT; - buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, - size, &t, GFP_KERNEL); + buf->direct.buf = dma_alloc_coherent(&dev->persist->pdev->dev, + size, &t, gfp); if (!buf->direct.buf) return -ENOMEM; @@ -237,14 +612,15 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), - GFP_KERNEL); + gfp); if (!buf->page_list) return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { buf->page_list[i].buf = - dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, - &t, GFP_KERNEL); + dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, + &t, gfp); if (!buf->page_list[i].buf) goto err_free; @@ -255,7 +631,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, if (BITS_PER_LONG == 64) { struct page **pages; - pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); + pages = kmalloc(sizeof *pages * buf->nbufs, gfp); if (!pages) goto err_free; for (i = 0; i < buf->nbufs; ++i) @@ -281,15 +657,17 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) int i; if (buf->nbufs == 1) - dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + dma_free_coherent(&dev->persist->pdev->dev, size, + buf->direct.buf, buf->direct.map); else { - if (BITS_PER_LONG == 64 && buf->direct.buf) + if (BITS_PER_LONG == 64) vunmap(buf->direct.buf); for (i = 0; i < buf->nbufs; ++i) if (buf->page_list[i].buf) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, buf->page_list[i].buf, buf->page_list[i].map); kfree(buf->page_list); @@ -297,11 +675,12 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) } EXPORT_SYMBOL_GPL(mlx4_buf_free); -static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) +static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device, + gfp_t gfp) { struct mlx4_db_pgdir *pgdir; - pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL); + pgdir = kzalloc(sizeof *pgdir, gfp); if (!pgdir) return NULL; @@ -309,7 +688,7 @@ static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) pgdir->bits[0] = pgdir->order0; pgdir->bits[1] = pgdir->order1; pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, - &pgdir->db_dma, GFP_KERNEL); + &pgdir->db_dma, gfp); if (!pgdir->db_page) { kfree(pgdir); return NULL; @@ -349,7 +728,7 @@ static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir, return 0; } -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_db_pgdir *pgdir; @@ -361,7 +740,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) goto out; - pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev)); + pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev, gfp); if (!pgdir) { ret = -ENOMEM; goto out; @@ -398,7 +777,7 @@ void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db) set_bit(i, db->u.pgdir->bits[o]); if 
(bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) { - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, db->u.pgdir->db_page, db->u.pgdir->db_dma); list_del(&db->u.pgdir->list); kfree(db->u.pgdir); @@ -413,13 +792,13 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, { int err; - err = mlx4_db_alloc(dev, &wqres->db, 1); + err = mlx4_db_alloc(dev, &wqres->db, 1, GFP_KERNEL); if (err) return err; *wqres->db.db = 0; - err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf); + err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL); if (err) goto err_db; @@ -428,7 +807,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf); + err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/sys/dev/mlx4/mlx4_core/mlx4_catas.c b/sys/dev/mlx4/mlx4_core/mlx4_catas.c index 497e0ddb8956..165b8faa2896 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_catas.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_catas.c @@ -42,16 +42,177 @@ #define MLX4_CATAS_POLL_INTERVAL (5 * HZ) -static DEFINE_SPINLOCK(catas_lock); -static LIST_HEAD(catas_list); -static struct work_struct catas_work; -static int internal_err_reset = 1; -module_param(internal_err_reset, int, 0644); +int mlx4_internal_err_reset = 1; +module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, - "Reset device on internal errors if non-zero" - " (default 1, in SRIOV mode default is 0)"); + "Reset device on internal errors if non-zero (default 1)"); + +static int read_vendor_id(struct mlx4_dev *dev) +{ + u16 vendor_id = 0; + int ret; + + ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); + if (ret) { + mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); + return ret; + } + + if (vendor_id == 0xffff) { + mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); + return -EINVAL; + } + + return 0; +} + +static int mlx4_reset_master(struct mlx4_dev *dev) +{ + int err = 0; + + if (mlx4_is_master(dev)) + mlx4_report_internal_err_comm_event(dev); + + if (!pci_channel_offline(dev->persist->pdev)) { + err = read_vendor_id(dev); + /* If PCI can't be accessed to read vendor ID we assume that its + * link was disabled and chip was already reset. 
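+ * Returning 0 in that case lets mlx4_enter_error_state() go on to
+ * mark MLX4_DEVICE_STATE_INTERNAL_ERROR instead of tripping its
+ * BUG_ON() on a reset failure.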
+ */ + if (err) + return 0; + + err = mlx4_reset(dev); + if (err) + mlx4_err(dev, "Fail to reset HCA\n"); + } + + return err; +} + +static int mlx4_reset_slave(struct mlx4_dev *dev) +{ +#define COM_CHAN_RST_REQ_OFFSET 0x10 +#define COM_CHAN_RST_ACK_OFFSET 0x08 + + u32 comm_flags; + u32 rst_req; + u32 rst_ack; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (pci_channel_offline(dev->persist->pdev)) + return 0; + + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + if (comm_flags == 0xffffffff) { + mlx4_err(dev, "VF reset is not needed\n"); + return 0; + } + + if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { + mlx4_err(dev, "VF reset is not supported\n"); + return -EOPNOTSUPP; + } + + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + if (rst_req != rst_ack) { + mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); + return -EIO; + } + + rst_req ^= 1; + mlx4_warn(dev, "VF is sending reset request to Firmware\n"); + comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; + __raw_writel((__force u32)cpu_to_be32(comm_flags), + (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. + */ + mmiowb(); + + end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + + /* Reading rst_req again since the communication channel can + * be reset at any time by the PF and all its bits will be + * set to zero. + */ + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + + if (rst_ack == rst_req) { + mlx4_warn(dev, "VF Reset succeed\n"); + return 0; + } + cond_resched(); + } + mlx4_err(dev, "Fail to send reset over the communication channel\n"); + return -ETIMEDOUT; +} + +static int mlx4_comm_internal_err(u32 slave_read) +{ + return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == + (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 
1 : 0; +} + +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) +{ + int err; + struct mlx4_dev *dev; + + if (!mlx4_internal_err_reset) + return; + + mutex_lock(&persist->device_state_mutex); + if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + goto out; + + dev = persist->dev; + mlx4_err(dev, "device is going to be reset\n"); + if (mlx4_is_slave(dev)) + err = mlx4_reset_slave(dev); + else + err = mlx4_reset_master(dev); + BUG_ON(err != 0); + + dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; + mlx4_err(dev, "device was reset successfully\n"); + mutex_unlock(&persist->device_state_mutex); + + /* At that step HW was already reset, now notify clients */ + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + mlx4_cmd_wake_completions(dev); + return; + +out: + mutex_unlock(&persist->device_state_mutex); +} + +static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) +{ + int err = 0; + + mlx4_enter_error_state(persist); + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP && + !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { + err = mlx4_restart_one(persist->pdev); + mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", + err); + } + mutex_unlock(&persist->interface_state_mutex); +} static void dump_err_buf(struct mlx4_dev *dev) { @@ -69,58 +230,40 @@ static void poll_catas(unsigned long dev_ptr) { struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; struct mlx4_priv *priv = mlx4_priv(dev); + u32 slave_read; - if (readl(priv->catas_err.map)) { - /* If the device is off-line, we cannot try to recover it */ - if (pci_channel_offline(dev->pdev)) - mod_timer(&priv->catas_err.timer, - round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); - else { - dump_err_buf(dev); - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - - if (internal_err_reset) { - spin_lock(&catas_lock); - list_add(&priv->catas_err.list, &catas_list); - spin_unlock(&catas_lock); - - queue_work(mlx4_wq, &catas_work); - } + if (mlx4_is_slave(dev)) { + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + if (mlx4_comm_internal_err(slave_read)) { + mlx4_warn(dev, "Internal error detected on the communication channel\n"); + goto internal_err; } - } else - mod_timer(&priv->catas_err.timer, - round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + } else if (readl(priv->catas_err.map)) { + dump_err_buf(dev); + goto internal_err; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_warn(dev, "Internal error mark was detected on device\n"); + goto internal_err; + } + + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + return; + +internal_err: + if (mlx4_internal_err_reset) + queue_work(dev->persist->catas_wq, &dev->persist->catas_work); } static void catas_reset(struct work_struct *work) { - struct mlx4_priv *priv, *tmppriv; - struct mlx4_dev *dev; + struct mlx4_dev_persistent *persist = + container_of(work, struct mlx4_dev_persistent, + catas_work); - LIST_HEAD(tlist); - int ret; - - spin_lock_irq(&catas_lock); - list_splice_init(&catas_list, &tlist); - spin_unlock_irq(&catas_lock); - - list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { - struct pci_dev *pdev = priv->dev.pdev; - - /* If the device is off-line, we cannot reset it */ - if (pci_channel_offline(pdev)) - continue; - - ret = mlx4_restart_one(priv->dev.pdev); - /* 'priv' now is not valid */ - if (ret) - pr_err("mlx4 %s: Reset failed (%d)\n", - 
pci_name(pdev), ret); - else { - dev = pci_get_drvdata(pdev); - mlx4_dbg(dev, "Reset succeeded\n"); - } - } + mlx4_handle_error_state(persist); } void mlx4_start_catas_poll(struct mlx4_dev *dev) @@ -128,22 +271,21 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); phys_addr_t addr; - /*If we are in SRIOV the default of the module param must be 0*/ - if (mlx4_is_mfunc(dev)) - internal_err_reset = 0; - INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); priv->catas_err.map = NULL; - addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) + - priv->fw.catas_offset; + if (!mlx4_is_slave(dev)) { + addr = pci_resource_start(dev->persist->pdev, + priv->fw.catas_bar) + + priv->fw.catas_offset; - priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); - if (!priv->catas_err.map) { - mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", - (unsigned long long) addr); - return; + priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); + if (!priv->catas_err.map) { + mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", + (unsigned long long)addr); + return; + } } priv->catas_err.timer.data = (unsigned long) dev; @@ -164,12 +306,24 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev) priv->catas_err.map = NULL; } - spin_lock_irq(&catas_lock); - list_del_init(&priv->catas_err.list); - spin_unlock_irq(&catas_lock); + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) + flush_workqueue(dev->persist->catas_wq); } -void __init mlx4_catas_init(void) +int mlx4_catas_init(struct mlx4_dev *dev) { - INIT_WORK(&catas_work, catas_reset); + INIT_WORK(&dev->persist->catas_work, catas_reset); + dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); + if (!dev->persist->catas_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_catas_end(struct mlx4_dev *dev) +{ + if (dev->persist->catas_wq) { + destroy_workqueue(dev->persist->catas_wq); + dev->persist->catas_wq = NULL; + } } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_cmd.c b/sys/dev/mlx4/mlx4_core/mlx4_cmd.c index 8d0b461b8c79..6f972a0f3da1 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_cmd.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_cmd.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,7 @@ #include "mlx4.h" #include "fw.h" +#include "fw_qos.h" #define CMD_POLL_TOKEN 0xffff #define INBOX_MASK 0xffffffffffffff00ULL @@ -163,131 +165,6 @@ static int mlx4_status_to_errno(u8 status) return trans_table[status]; } -static const char *cmd_to_str(u16 cmd) -{ - switch (cmd) { - case MLX4_CMD_SYS_EN: return "SYS_EN"; - case MLX4_CMD_SYS_DIS: return "SYS_DIS"; - case MLX4_CMD_MAP_FA: return "MAP_FA"; - case MLX4_CMD_UNMAP_FA: return "UNMAP_FA"; - case MLX4_CMD_RUN_FW: return "RUN_FW"; - case MLX4_CMD_MOD_STAT_CFG: return "MOD_STAT_CFG"; - case MLX4_CMD_QUERY_DEV_CAP: return "QUERY_DEV_CAP"; - case MLX4_CMD_QUERY_FW: return "QUERY_FW"; - case MLX4_CMD_ENABLE_LAM: return "ENABLE_LAM"; - case MLX4_CMD_DISABLE_LAM: return "DISABLE_LAM"; - case MLX4_CMD_QUERY_DDR: return "QUERY_DDR"; - case MLX4_CMD_QUERY_ADAPTER: return "QUERY_ADAPTER"; - case MLX4_CMD_INIT_HCA: return "INIT_HCA"; - case MLX4_CMD_CLOSE_HCA: return "CLOSE_HCA"; - case MLX4_CMD_INIT_PORT: return "INIT_PORT"; - case MLX4_CMD_CLOSE_PORT: return "CLOSE_PORT"; - case MLX4_CMD_QUERY_HCA: return "QUERY_HCA"; - case MLX4_CMD_QUERY_PORT: return "QUERY_PORT"; - case MLX4_CMD_SENSE_PORT: return "SENSE_PORT"; - case MLX4_CMD_HW_HEALTH_CHECK: return "HW_HEALTH_CHECK"; - case 
MLX4_CMD_SET_PORT: return "SET_PORT"; - case MLX4_CMD_SET_NODE: return "SET_NODE"; - case MLX4_CMD_QUERY_FUNC: return "QUERY_FUNC"; - case MLX4_CMD_MAP_ICM: return "MAP_ICM"; - case MLX4_CMD_UNMAP_ICM: return "UNMAP_ICM"; - case MLX4_CMD_MAP_ICM_AUX: return "MAP_ICM_AUX"; - case MLX4_CMD_UNMAP_ICM_AUX: return "UNMAP_ICM_AUX"; - case MLX4_CMD_SET_ICM_SIZE: return "SET_ICM_SIZE"; - /*master notify fw on finish for slave's flr*/ - case MLX4_CMD_INFORM_FLR_DONE: return "INFORM_FLR_DONE"; - case MLX4_CMD_GET_OP_REQ: return "GET_OP_REQ"; - - /* TPT commands */ - case MLX4_CMD_SW2HW_MPT: return "SW2HW_MPT"; - case MLX4_CMD_QUERY_MPT: return "QUERY_MPT"; - case MLX4_CMD_HW2SW_MPT: return "HW2SW_MPT"; - case MLX4_CMD_READ_MTT: return "READ_MTT"; - case MLX4_CMD_WRITE_MTT: return "WRITE_MTT"; - case MLX4_CMD_SYNC_TPT: return "SYNC_TPT"; - - /* EQ commands */ - case MLX4_CMD_MAP_EQ: return "MAP_EQ"; - case MLX4_CMD_SW2HW_EQ: return "SW2HW_EQ"; - case MLX4_CMD_HW2SW_EQ: return "HW2SW_EQ"; - case MLX4_CMD_QUERY_EQ: return "QUERY_EQ"; - - /* CQ commands */ - case MLX4_CMD_SW2HW_CQ: return "SW2HW_CQ"; - case MLX4_CMD_HW2SW_CQ: return "HW2SW_CQ"; - case MLX4_CMD_QUERY_CQ: return "QUERY_CQ:"; - case MLX4_CMD_MODIFY_CQ: return "MODIFY_CQ:"; - - /* SRQ commands */ - case MLX4_CMD_SW2HW_SRQ: return "SW2HW_SRQ"; - case MLX4_CMD_HW2SW_SRQ: return "HW2SW_SRQ"; - case MLX4_CMD_QUERY_SRQ: return "QUERY_SRQ"; - case MLX4_CMD_ARM_SRQ: return "ARM_SRQ"; - - /* QP/EE commands */ - case MLX4_CMD_RST2INIT_QP: return "RST2INIT_QP"; - case MLX4_CMD_INIT2RTR_QP: return "INIT2RTR_QP"; - case MLX4_CMD_RTR2RTS_QP: return "RTR2RTS_QP"; - case MLX4_CMD_RTS2RTS_QP: return "RTS2RTS_QP"; - case MLX4_CMD_SQERR2RTS_QP: return "SQERR2RTS_QP"; - case MLX4_CMD_2ERR_QP: return "2ERR_QP"; - case MLX4_CMD_RTS2SQD_QP: return "RTS2SQD_QP"; - case MLX4_CMD_SQD2SQD_QP: return "SQD2SQD_QP"; - case MLX4_CMD_SQD2RTS_QP: return "SQD2RTS_QP"; - case MLX4_CMD_2RST_QP: return "2RST_QP"; - case MLX4_CMD_QUERY_QP: return "QUERY_QP"; - case MLX4_CMD_INIT2INIT_QP: return "INIT2INIT_QP"; - case MLX4_CMD_SUSPEND_QP: return "SUSPEND_QP"; - case MLX4_CMD_UNSUSPEND_QP: return "UNSUSPEND_QP"; - /* special QP and management commands */ - case MLX4_CMD_CONF_SPECIAL_QP: return "CONF_SPECIAL_QP"; - case MLX4_CMD_MAD_IFC: return "MAD_IFC"; - - /* multicast commands */ - case MLX4_CMD_READ_MCG: return "READ_MCG"; - case MLX4_CMD_WRITE_MCG: return "WRITE_MCG"; - case MLX4_CMD_MGID_HASH: return "MGID_HASH"; - - /* miscellaneous commands */ - case MLX4_CMD_DIAG_RPRT: return "DIAG_RPRT"; - case MLX4_CMD_NOP: return "NOP"; - case MLX4_CMD_ACCESS_MEM: return "ACCESS_MEM"; - case MLX4_CMD_SET_VEP: return "SET_VEP"; - - /* Ethernet specific commands */ - case MLX4_CMD_SET_VLAN_FLTR: return "SET_VLAN_FLTR"; - case MLX4_CMD_SET_MCAST_FLTR: return "SET_MCAST_FLTR"; - case MLX4_CMD_DUMP_ETH_STATS: return "DUMP_ETH_STATS"; - - /* Communication channel commands */ - case MLX4_CMD_ARM_COMM_CHANNEL: return "ARM_COMM_CHANNEL"; - case MLX4_CMD_GEN_EQE: return "GEN_EQE"; - - /* virtual commands */ - case MLX4_CMD_ALLOC_RES: return "ALLOC_RES"; - case MLX4_CMD_FREE_RES: return "FREE_RES"; - case MLX4_CMD_MCAST_ATTACH: return "MCAST_ATTACH"; - case MLX4_CMD_UCAST_ATTACH: return "UCAST_ATTACH"; - case MLX4_CMD_PROMISC: return "PROMISC"; - case MLX4_CMD_QUERY_FUNC_CAP: return "QUERY_FUNC_CAP"; - case MLX4_CMD_QP_ATTACH: return "QP_ATTACH"; - - /* debug commands */ - case MLX4_CMD_QUERY_DEBUG_MSG: return "QUERY_DEBUG_MSG"; - case MLX4_CMD_SET_DEBUG_MSG: return "SET_DEBUG_MSG"; - - /* 
statistics commands */ - case MLX4_CMD_QUERY_IF_STAT: return "QUERY_IF_STAT"; - case MLX4_CMD_SET_IF_STAT: return "SET_IF_STAT"; - - /* register/delete flow steering network rules */ - case MLX4_QP_FLOW_STEERING_ATTACH: return "QP_FLOW_STEERING_ATTACH"; - case MLX4_QP_FLOW_STEERING_DETACH: return "QP_FLOW_STEERING_DETACH"; - case MLX4_FLOW_STEERING_IB_UC_QP_RANGE: return "FLOW_STEERING_IB_UC_QP_RANGE"; - default: return "OTHER"; - } -} - static u8 mlx4_errno_to_status(int errno) { switch (errno) { @@ -308,6 +185,72 @@ static u8 mlx4_errno_to_status(int errno) } } +static int mlx4_internal_err_ret_value(struct mlx4_dev *dev, u16 op, + u8 op_modifier) +{ + switch (op) { + case MLX4_CMD_UNMAP_ICM: + case MLX4_CMD_UNMAP_ICM_AUX: + case MLX4_CMD_UNMAP_FA: + case MLX4_CMD_2RST_QP: + case MLX4_CMD_HW2SW_EQ: + case MLX4_CMD_HW2SW_CQ: + case MLX4_CMD_HW2SW_SRQ: + case MLX4_CMD_HW2SW_MPT: + case MLX4_CMD_CLOSE_HCA: + case MLX4_QP_FLOW_STEERING_DETACH: + case MLX4_CMD_FREE_RES: + case MLX4_CMD_CLOSE_PORT: + return CMD_STAT_OK; + + case MLX4_CMD_QP_ATTACH: + /* On Detach case return success */ + if (op_modifier == 0) + return CMD_STAT_OK; + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + + default: + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + } +} + +static int mlx4_closing_cmd_fatal_error(u16 op, u8 fw_status) +{ + /* Any error during the closing commands below is considered fatal */ + if (op == MLX4_CMD_CLOSE_HCA || + op == MLX4_CMD_HW2SW_EQ || + op == MLX4_CMD_HW2SW_CQ || + op == MLX4_CMD_2RST_QP || + op == MLX4_CMD_HW2SW_SRQ || + op == MLX4_CMD_SYNC_TPT || + op == MLX4_CMD_UNMAP_ICM || + op == MLX4_CMD_UNMAP_ICM_AUX || + op == MLX4_CMD_UNMAP_FA) + return 1; + /* Error on MLX4_CMD_HW2SW_MPT is fatal except when fw status equals + * CMD_STAT_REG_BOUND. + * This status indicates that memory region has memory windows bound to it + * which may result from invalid user space usage and is not fatal. + */ + if (op == MLX4_CMD_HW2SW_MPT && fw_status != CMD_STAT_REG_BOUND) + return 1; + return 0; +} + +static int mlx4_cmd_reset_flow(struct mlx4_dev *dev, u16 op, u8 op_modifier, + int err) +{ + /* Only if reset flow is really active return code is based on + * command, otherwise current error code is returned. + */ + if (mlx4_internal_err_reset) { + mlx4_enter_error_state(dev->persist); + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + } + + return err; +} + static int comm_pending(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -316,16 +259,30 @@ static int comm_pending(struct mlx4_dev *dev) return (swab32(status) >> 31) != priv->cmd.comm_toggle; } -static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) +static int mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) { struct mlx4_priv *priv = mlx4_priv(dev); u32 val; + /* To avoid writing to unknown addresses after the device state was + * changed to internal error and the function was rest, + * check the INTERNAL_ERROR flag which is updated under + * device_state_mutex lock. 
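+ * If the flag is already set, -EIO is returned and the callers map it
+ * to mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR).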
+ */ + mutex_lock(&dev->persist->device_state_mutex); + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mutex_unlock(&dev->persist->device_state_mutex); + return -EIO; + } + priv->cmd.comm_toggle ^= 1; val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31); __raw_writel((__force u32) cpu_to_be32(val), &priv->mfunc.comm->slave_write); mmiowb(); + mutex_unlock(&dev->persist->device_state_mutex); + return 0; } static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, @@ -338,15 +295,20 @@ static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, /* First, verify that the master reports correct status */ if (comm_pending(dev)) { - mlx4_warn(dev, "Communication channel is not idle." - "my toggle is %d (cmd:0x%x)\n", + mlx4_warn(dev, "Communication channel is not idle - my toggle is %d (cmd:0x%x)\n", priv->cmd.comm_toggle, cmd); return -EAGAIN; } /* Write command */ down(&priv->cmd.poll_sem); - mlx4_comm_cmd_post(dev, cmd, param); + if (mlx4_comm_cmd_post(dev, cmd, param)) { + /* Only in case the device state is INTERNAL_ERROR, + * mlx4_comm_cmd_post returns with an error + */ + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + goto out; + } end = msecs_to_jiffies(timeout) + jiffies; while (comm_pending(dev) && time_before(jiffies, end)) @@ -357,21 +319,24 @@ static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, * FLR process. The only non-zero result in the RESET command * is MLX4_DELAY_RESET_SLAVE*/ if ((MLX4_COMM_CMD_RESET == cmd)) { - mlx4_warn(dev, "Got slave FLRed from Communication" - " channel (ret:0x%x)\n", ret_from_pending); err = MLX4_DELAY_RESET_SLAVE; + goto out; } else { - mlx4_warn(dev, "Communication channel timed out\n"); - err = -ETIMEDOUT; + mlx4_warn(dev, "Communication channel command 0x%x timed out\n", + cmd); + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); } } + if (err) + mlx4_enter_error_state(dev->persist); +out: up(&priv->cmd.poll_sem); return err; } -static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, - u16 param, unsigned long timeout) +static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 vhcr_cmd, + u16 param, u16 op, unsigned long timeout) { struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; struct mlx4_cmd_context *context; @@ -380,17 +345,6 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, down(&cmd->event_sem); - end = msecs_to_jiffies(timeout) + jiffies; - while (comm_pending(dev) && time_before(jiffies, end)) - cond_resched(); - if (comm_pending(dev)) { - mlx4_warn(dev, "mlx4_comm_cmd_wait: Comm channel " - "is not idle. 
My toggle is %d (op: 0x%x)\n", - mlx4_priv(dev)->cmd.comm_toggle, op); - up(&cmd->event_sem); - return -EAGAIN; - } - spin_lock(&cmd->context_lock); BUG_ON(cmd->free_head < 0); context = &cmd->context[cmd->free_head]; @@ -398,29 +352,49 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, cmd->free_head = context->next; spin_unlock(&cmd->context_lock); - init_completion(&context->done); + reinit_completion(&context->done); - mlx4_comm_cmd_post(dev, op, param); + if (mlx4_comm_cmd_post(dev, vhcr_cmd, param)) { + /* Only in case the device state is INTERNAL_ERROR, + * mlx4_comm_cmd_post returns with an error + */ + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + goto out; + } - /* In slave, wait unconditionally for completion */ - wait_for_completion(&context->done); + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "communication channel command 0x%x (op=0x%x) timed out\n", + vhcr_cmd, op); + goto out_reset; + } err = context->result; if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) { mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", - op, context->fw_status); - goto out; + vhcr_cmd, context->fw_status); + if (mlx4_closing_cmd_fatal_error(op, context->fw_status)) + goto out_reset; } -out: /* wait for comm channel ready * this is necessary for prevention the race * when switching between event to polling mode + * Skipping this section in case the device is in FATAL_ERROR state, + * In this state, no commands are sent via the comm channel until + * the device has returned from reset. */ - end = msecs_to_jiffies(timeout) + jiffies; - while (comm_pending(dev) && time_before(jiffies, end)) - cond_resched(); + if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + } + goto out; +out_reset: + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + mlx4_enter_error_state(dev->persist); +out: spin_lock(&cmd->context_lock); context->next = cmd->free_head; cmd->free_head = context - cmd->context; @@ -431,10 +405,13 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, } int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, - unsigned long timeout) + u16 op, unsigned long timeout) { + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + if (mlx4_priv(dev)->cmd.use_events) - return mlx4_comm_cmd_wait(dev, cmd, param, timeout); + return mlx4_comm_cmd_wait(dev, cmd, param, op, timeout); return mlx4_comm_cmd_poll(dev, cmd, param, timeout); } @@ -442,7 +419,7 @@ static int cmd_pending(struct mlx4_dev *dev) { u32 status; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); @@ -452,38 +429,27 @@ static int cmd_pending(struct mlx4_dev *dev) !!(status & swab32(1 << HCR_T_BIT))); } -static int get_status(struct mlx4_dev *dev, u32 *status, int *go_bit, - int *t_bit) -{ - if (pci_channel_offline(dev->pdev)) - return -EIO; - - *status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); - *t_bit = !!(*status & swab32(1 << HCR_T_BIT)); - *go_bit = !!(*status & swab32(1 << HCR_GO_BIT)); - - return 0; -} - -static int mlx4_cmd_post(struct mlx4_dev *dev, struct timespec *ts1, - u64 in_param, u64 out_param, u32 in_modifier, - u8 op_modifier, u16 op, u16 token, int event) +static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 
out_param, + u32 in_modifier, u8 op_modifier, u16 op, u16 token, + int event) { struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; u32 __iomem *hcr = cmd->hcr; - int ret = -EAGAIN; + int ret = -EIO; unsigned long end; - int err, go_bit = 0, t_bit = 0; - u32 status = 0; - mutex_lock(&cmd->hcr_mutex); - - if (pci_channel_offline(dev->pdev)) { + mutex_lock(&dev->persist->device_state_mutex); + /* To avoid writing to unknown addresses after the device state was + * changed to internal error and the chip was reset, + * check the INTERNAL_ERROR flag which is updated under + * device_state_mutex lock. + */ + if (pci_channel_offline(dev->persist->pdev) || + (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { /* * Device is going through error recovery * and cannot accept commands. */ - ret = -EIO; goto out; } @@ -492,12 +458,11 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, struct timespec *ts1, end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. */ - ret = -EIO; goto out; } @@ -521,9 +486,6 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, struct timespec *ts1, __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4); __raw_writel((__force u32) cpu_to_be32(token << 16), hcr + 5); - if (ts1) - ktime_get_ts(ts1); - /* __raw_writel may not order writes. */ wmb(); @@ -544,16 +506,11 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, struct timespec *ts1, ret = 0; out: - if (ret) { - err = get_status(dev, &status, &go_bit, &t_bit); - mlx4_warn(dev, "Could not post command %s (0x%x): ret=%d, " - "in_param=0x%llx, in_mod=0x%x, op_mod=0x%x, " - "get_status err=%d, status_reg=0x%x, go_bit=%d, " - "t_bit=%d, toggle=0x%x\n", cmd_to_str(op), op, ret, - (unsigned long long) in_param, in_modifier, op_modifier, err, status, - go_bit, t_bit, cmd->toggle); - } - mutex_unlock(&cmd->hcr_mutex); + if (ret) + mlx4_warn(dev, "Could not post command 0x%x: ret=%d, in_param=0x%llx, in_mod=0x%x, op_mod=0x%x\n", + op, ret, (long long)in_param, in_modifier, op_modifier); + mutex_unlock(&dev->persist->device_state_mutex); + return ret; } @@ -583,16 +540,18 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, *out_param = be64_to_cpu(vhcr->out_param); else { - mlx4_err(dev, "response expected while" - "output mailbox is NULL for " - "command 0x%x\n", op); + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); vhcr->status = CMD_STAT_BAD_PARAM; } } ret = mlx4_status_to_errno(vhcr->status); } + if (ret && + dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + ret = mlx4_internal_err_ret_value(dev, op, op_modifier); } else { - ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, + ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, op, MLX4_COMM_TIME + timeout); if (!ret) { if (out_is_imm) { @@ -600,16 +559,20 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, *out_param = be64_to_cpu(vhcr->out_param); else { - mlx4_err(dev, "response expected while" - "output mailbox is NULL for " - "command 0x%x\n", op); + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); vhcr->status = CMD_STAT_BAD_PARAM; } } ret = mlx4_status_to_errno(vhcr->status); - } else - mlx4_err(dev, "failed execution of VHCR_POST command" - "opcode %s (0x%x)\n", cmd_to_str(op), op); + } else { + if (dev->persist->state & + 
MLX4_DEVICE_STATE_INTERNAL_ERROR) + ret = mlx4_internal_err_ret_value(dev, op, + op_modifier); + else + mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n", op); + } } mutex_unlock(&priv->cmd.slave_cmd_mutex); @@ -628,28 +591,40 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); - if (pci_channel_offline(dev->pdev)) { + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { /* * Device is going through error recovery * and cannot accept commands. */ - err = -EIO; + err = mlx4_internal_err_ret_value(dev, op, op_modifier); goto out; } - err = mlx4_cmd_post(dev, NULL, in_param, out_param ? *out_param : 0, + if (out_is_imm && !out_param) { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + err = -EINVAL; + goto out; + } + + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); if (err) - goto out; + goto out_reset; end = msecs_to_jiffies(timeout) + jiffies; while (cmd_pending(dev) && time_before(jiffies, end)) { - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. */ err = -EIO; + goto out_reset; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = mlx4_internal_err_ret_value(dev, op, op_modifier); goto out; } @@ -657,10 +632,10 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, } if (cmd_pending(dev)) { - mlx4_warn(dev, "command %s (0x%x) timed out (go bit not cleared)\n", - cmd_to_str(op), op); - err = -ETIMEDOUT; - goto out; + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); + err = -EIO; + goto out_reset; } if (out_is_imm) @@ -672,10 +647,17 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; err = mlx4_status_to_errno(stat); - if (err) - mlx4_err(dev, "command %s (0x%x) failed: fw status = 0x%x\n", - cmd_to_str(op), op, stat); + if (err) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, stat); + if (mlx4_closing_cmd_fatal_error(op, stat)) + goto out_reset; + goto out; + } +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); out: up(&priv->cmd.poll_sem); return err; @@ -704,15 +686,8 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, { struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; struct mlx4_cmd_context *context; + long ret_wait; int err = 0; - int go_bit = 0, t_bit = 0, stat_err; - u32 status = 0; - struct timespec ts1, ts2; - ktime_t t1, t2, delta; - s64 ds; - - if (out_is_imm && !out_param) - return -EINVAL; down(&cmd->event_sem); @@ -723,59 +698,75 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, cmd->free_head = context->next; spin_unlock(&cmd->context_lock); - init_completion(&context->done); - - err = mlx4_cmd_post(dev, &ts1, in_param, out_param ? 
*out_param : 0, - in_modifier, op_modifier, op, context->token, 1); - if (err) - goto out; - - if (!wait_for_completion_timeout(&context->done, - msecs_to_jiffies(timeout))) { - stat_err = get_status(dev, &status, &go_bit, &t_bit); - mlx4_warn(dev, "command %s (0x%x) timed out: in_param=0x%llx, " - "in_mod=0x%x, op_mod=0x%x, get_status err=%d, " - "status_reg=0x%x, go_bit=%d, t_bit=%d, toggle=0x%x\n" - , cmd_to_str(op), op, (unsigned long long) in_param, in_modifier, - op_modifier, stat_err, status, go_bit, t_bit, - mlx4_priv(dev)->cmd.toggle); - err = -EBUSY; + if (out_is_imm && !out_param) { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + err = -EINVAL; goto out; } - if (mlx4_debug_level & MLX4_DEBUG_MASK_CMD_TIME) { - ktime_get_ts(&ts2); - t1 = timespec_to_ktime(ts1); - t2 = timespec_to_ktime(ts2); - delta = ktime_sub(t2, t1); - ds = ktime_to_ns(delta); - pr_info("mlx4: fw exec time for %s is %lld nsec\n", cmd_to_str(op), (long long) ds); + + reinit_completion(&context->done); + + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, context->token, 1); + if (err) + goto out_reset; + + if (op == MLX4_CMD_SENSE_PORT) { + ret_wait = + wait_for_completion_interruptible_timeout(&context->done, + msecs_to_jiffies(timeout)); + if (ret_wait < 0) { + context->fw_status = 0; + context->out_param = 0; + context->result = 0; + } + } else { + ret_wait = (long)wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout)); + } + if (!ret_wait) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); + if (op == MLX4_CMD_NOP) { + err = -EBUSY; + goto out; + } else { + err = -EIO; + goto out_reset; + } } err = context->result; if (err) { - mlx4_err(dev, "command %s (0x%x) failed: in_param=0x%llx, " - "in_mod=0x%x, op_mod=0x%x, fw status = 0x%x\n", - cmd_to_str(op), op, (unsigned long long) in_param, in_modifier, - op_modifier, context->fw_status); + /* Since we do not want to have this error message always + * displayed at driver start when there are ConnectX2 HCAs + * on the host, we deprecate the error message for this + * specific command/input_mod/opcode_mod/fw-status to be debug. 
+ */ + if (op == MLX4_CMD_SET_PORT && + (in_modifier == 1 || in_modifier == 2) && + op_modifier == MLX4_SET_PORT_IB_OPCODE && + context->fw_status == CMD_STAT_BAD_SIZE) + mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + else + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + else if (mlx4_closing_cmd_fatal_error(op, context->fw_status)) + goto out_reset; - switch(context->fw_status) { - case CMD_STAT_BAD_PARAM: - mlx4_err(dev, "Parameter is not supported, " - "parameter is out of range\n"); - break; - case CMD_STAT_EXCEED_LIM: - mlx4_err(dev, "Required capability exceeded " - "device limits\n"); - break; - default: - break; - } goto out; } if (out_is_imm) *out_param = context->out_param; +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); out: spin_lock(&cmd->context_lock); context->next = cmd->free_head; @@ -790,18 +781,27 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, int native) { - if (pci_channel_offline(dev->pdev)) - return -EIO; + if (pci_channel_offline(dev->persist->pdev)) + return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO); if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + int ret; + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + return mlx4_internal_err_ret_value(dev, op, + op_modifier); + down_read(&mlx4_priv(dev)->cmd.switch_sem); if (mlx4_priv(dev)->cmd.use_events) - return mlx4_cmd_wait(dev, in_param, out_param, - out_is_imm, in_modifier, - op_modifier, op, timeout); + ret = mlx4_cmd_wait(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); else - return mlx4_cmd_poll(dev, in_param, out_param, - out_is_imm, in_modifier, - op_modifier, op, timeout); + ret = mlx4_cmd_poll(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + + up_read(&mlx4_priv(dev)->cmd.switch_sem); + return ret; } return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm, in_modifier, op_modifier, op, timeout); @@ -809,7 +809,7 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, EXPORT_SYMBOL_GPL(__mlx4_cmd); -static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); @@ -826,7 +826,8 @@ static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, (slave & ~0x7f) | (size & 0xff)) { mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx " "master_addr:0x%llx slave_id:%d size:%d\n", - (unsigned long long) slave_addr, (unsigned long long) master_addr, slave, size); + (unsigned long long)slave_addr, + (unsigned long long)master_addr, slave, size); return -EINVAL; } @@ -903,26 +904,37 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, { struct ib_smp *smp = inbox->buf; u32 index; - u8 port; + u8 port, slave_port; + u8 opcode_modifier; u16 *table; int err; int vidx, pidx; + int network_view; struct mlx4_priv *priv = mlx4_priv(dev); struct ib_smp *outsmp = outbox->buf; __be16 *outtab = (__be16 *)(outsmp->data); __be32 slave_cap_mask; __be64 slave_node_guid; - port = vhcr->in_modifier; + + slave_port = vhcr->in_modifier; + port = mlx4_slave_convert_port(dev, slave, slave_port); + + /* network-view bit is for driver use 
only, and should not be passed to FW */ + opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */ + network_view = !!(vhcr->op_modifier & 0x8); if (smp->base_version == 1 && smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->class_version == 1) { - if (smp->method == IB_MGMT_METHOD_GET) { + /* host view is paravirtualized */ + if (!network_view && smp->method == IB_MGMT_METHOD_GET) { if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { index = be32_to_cpu(smp->attr_mod); if (port < 1 || port > dev->caps.num_ports) return -EINVAL; - table = kcalloc(dev->caps.pkey_table_len[port], sizeof *table, GFP_KERNEL); + table = kcalloc((dev->caps.pkey_table_len[port] / 32) + 1, + sizeof(*table) * 32, GFP_KERNEL); + if (!table) return -ENOMEM; /* need to get the full pkey table because the paravirtualized @@ -941,8 +953,9 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) { /*get the slave specific caps:*/ /*do the command */ + smp->attr_mod = cpu_to_be32(port); err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + port, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); /* modify the response for slaves */ if (!err && slave != mlx4_master_func_num(dev)) { @@ -955,25 +968,38 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, return err; } if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) { - /* compute slave's gid block */ - smp->attr_mod = cpu_to_be32(slave / 8); - /* execute cmd */ - err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, - vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); - if (!err) { - /* if needed, move slave gid to index 0 */ - if (slave % 8) - memcpy(outsmp->data, - outsmp->data + (slave % 8) * 8, 8); - /* delete all other gids */ - memset(outsmp->data + 8, 0, 56); + __be64 guid = mlx4_get_admin_guid(dev, slave, + port); + + /* set the PF admin guid to the FW/HW burned + * GUID, if it wasn't yet set + */ + if (slave == 0 && guid == 0) { + smp->attr_mod = 0; + err = mlx4_cmd_box(dev, + inbox->dma, + outbox->dma, + vhcr->in_modifier, + opcode_modifier, + vhcr->op, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + mlx4_set_admin_guid(dev, + *(__be64 *)outsmp-> + data, slave, port); + } else { + memcpy(outsmp->data, &guid, 8); } - return err; + + /* clean all other gids */ + memset(outsmp->data + 8, 0, 56); + return 0; } if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + port, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (!err) { slave_node_guid = mlx4_get_slave_node_guid(dev, slave); @@ -983,32 +1009,28 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, } } } + + /* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs. + * These are the MADs used by ib verbs (such as ib_query_gids). + */ if (slave != mlx4_master_func_num(dev) && - ((smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) || - (smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && - smp->method == IB_MGMT_METHOD_SET))) { - mlx4_err(dev, "slave %d is trying to execute a Subnet MGMT MAD, " - "class 0x%x, method 0x%x for attr 0x%x. 
Rejecting\n", - slave, smp->method, smp->mgmt_class, - be16_to_cpu(smp->attr_id)); - return -EPERM; + !mlx4_vf_smi_enabled(dev, slave, port)) { + if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->method == IB_MGMT_METHOD_GET) || network_view) { + mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", + slave, smp->mgmt_class, smp->method, + network_view ? "Network" : "Host", + be16_to_cpu(smp->attr_id)); + return -EPERM; + } } - /*default:*/ + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, vhcr->op_modifier, + vhcr->in_modifier, opcode_modifier, vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } -static int MLX4_CMD_DIAG_RPRT_wrapper(struct mlx4_dev *dev, int slave, - struct mlx4_vhcr *vhcr, - struct mlx4_cmd_mailbox *inbox, - struct mlx4_cmd_mailbox *outbox, - struct mlx4_cmd_info *cmd) -{ - return -EPERM; -} - -static int MLX4_CMD_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, +static int mlx4_CMD_EPERM_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, @@ -1153,16 +1175,6 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = NULL }, - { - .opcode = MLX4_CMD_DIAG_RPRT, - .has_inbox = false, - .has_outbox = false, - .out_is_imm = false, - .encode_slave_id = false, - .skip_err_print = true, - .verify = NULL, - .wrapper = MLX4_CMD_DIAG_RPRT_wrapper - }, { .opcode = MLX4_CMD_NOP, .has_inbox = false, @@ -1172,6 +1184,15 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = NULL }, + { + .opcode = MLX4_CMD_CONFIG_DEV, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CONFIG_DEV_wrapper + }, { .opcode = MLX4_CMD_ALLOC_RES, .has_inbox = false, @@ -1247,7 +1268,7 @@ static struct mlx4_cmd_info cmd_info[] = { { .opcode = MLX4_CMD_HW2SW_EQ, .has_inbox = false, - .has_outbox = true, + .has_outbox = false, .out_is_imm = false, .encode_slave_id = true, .verify = NULL, @@ -1462,13 +1483,39 @@ static struct mlx4_cmd_info cmd_info[] = { }, { .opcode = MLX4_CMD_UPDATE_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_UPDATE_QP_wrapper + }, + { + .opcode = MLX4_CMD_GET_OP_REQ, .has_inbox = false, .has_outbox = false, .out_is_imm = false, .encode_slave_id = false, - .skip_err_print = true, .verify = NULL, - .wrapper = MLX4_CMD_UPDATE_QP_wrapper + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + { + .opcode = MLX4_CMD_ALLOCATE_VPP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + { + .opcode = MLX4_CMD_SET_VPORT_QOS, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, }, { .opcode = MLX4_CMD_CONF_SPECIAL_QP, @@ -1488,6 +1535,15 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = mlx4_MAD_IFC_wrapper }, + { + .opcode = MLX4_CMD_MAD_DEMUX, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, { .opcode = MLX4_CMD_QUERY_IF_STAT, .has_inbox = false, @@ -1497,6 +1553,24 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = mlx4_QUERY_IF_STAT_wrapper }, + { + .opcode = 
MLX4_CMD_ACCESS_REG, + .has_inbox = true, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ACCESS_REG_wrapper, + }, + { + .opcode = MLX4_CMD_CONGESTION_CTRL_OPCODE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, /* Native multicast commands are not available for guests */ { .opcode = MLX4_CMD_QP_ATTACH, @@ -1572,16 +1646,23 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper }, - /* wol commands */ { - .opcode = MLX4_CMD_MOD_STAT_CFG, + .opcode = MLX4_FLOW_STEERING_IB_UC_QP_RANGE, .has_inbox = false, .has_outbox = false, .out_is_imm = false, .encode_slave_id = false, - .skip_err_print = true, .verify = NULL, - .wrapper = mlx4_MOD_STAT_CFG_wrapper + .wrapper = mlx4_CMD_EPERM_wrapper + }, + { + .opcode = MLX4_CMD_VIRT_PORT_MAP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper }, }; @@ -1612,8 +1693,10 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, ALIGN(sizeof(struct mlx4_vhcr_cmd), MLX4_ACCESS_MEM_ALIGN), 1); if (ret) { - mlx4_err(dev, "%s:Failed reading vhcr" - "ret: 0x%x\n", __func__, ret); + if (!(dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n", + __func__, ret); kfree(vhcr); return ret; } @@ -1636,8 +1719,8 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, } } if (!cmd) { - mlx4_err(dev, "unparavirt command: %s (0x%x) accepted from slave:%d\n", - cmd_to_str(vhcr->op), vhcr->op, slave); + mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n", + vhcr->op, slave); vhcr_cmd->status = CMD_STAT_BAD_PARAM; goto out_status; } @@ -1652,11 +1735,14 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, goto out_status; } - if (mlx4_ACCESS_MEM(dev, inbox->dma, slave, - vhcr->in_param, - MLX4_MAILBOX_SIZE, 1)) { - mlx4_err(dev, "%s: Failed reading inbox for cmd %s (0x%x)\n", - __func__, cmd_to_str(cmd->opcode), cmd->opcode); + ret = mlx4_ACCESS_MEM(dev, inbox->dma, slave, + vhcr->in_param, + MLX4_MAILBOX_SIZE, 1); + if (ret) { + if (!(dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n", + __func__, cmd->opcode); vhcr_cmd->status = CMD_STAT_INTERNAL_ERR; goto out_status; } @@ -1664,8 +1750,7 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, /* Apply permission and bound checks if applicable */ if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) { - mlx4_warn(dev, "Command %s (0x%x) from slave: %d failed protection " - "checks for resource_id: %d\n", cmd_to_str(vhcr->op), + mlx4_warn(dev, "Command:0x%x from slave: %d failed protection checks for resource_id:%d\n", vhcr->op, slave, vhcr->in_modifier); vhcr_cmd->status = CMD_STAT_BAD_OP; goto out_status; @@ -1705,13 +1790,9 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, } if (err) { - if (!cmd->skip_err_print) - mlx4_warn(dev, "vhcr command %s (0x%x) slave:%d " - "in_param 0x%llx in_mod=0x%x, op_mod=0x%x " - "failed with error:%d, status %d\n", - cmd_to_str(vhcr->op), vhcr->op, slave, - (unsigned long long) vhcr->in_param, vhcr->in_modifier, - vhcr->op_modifier, vhcr->errno, err); + if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_warn(dev, "vhcr command:0x%x 
slave:%d failed with error:%d, status %d\n", + vhcr->op, slave, vhcr->errno, err); vhcr_cmd->status = mlx4_errno_to_status(err); goto out_status; } @@ -1726,7 +1807,9 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, /* If we failed to write back the outbox after the *command was successfully executed, we must fail this * slave, as it is now in undefined state */ - mlx4_err(dev, "%s: Failed writing outbox\n", __func__); + if (!(dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_err(dev, "%s:Failed writing outbox\n", __func__); goto out; } } @@ -1744,8 +1827,8 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, __func__); else if (vhcr->e_bit && mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe)) - mlx4_warn(dev, "Failed to generate command completion " - "eqe for slave %d\n", slave); + mlx4_warn(dev, "Failed to generate command completion eqe for slave %d\n", + slave); } out: @@ -1761,6 +1844,7 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, struct mlx4_vport_oper_state *vp_oper; struct mlx4_vport_state *vp_admin; struct mlx4_vf_immed_vlan_work *work; + struct mlx4_dev *dev = &priv->dev; int err; int admin_vlan_ix = NO_INDX; @@ -1768,9 +1852,24 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; if (vp_oper->state.default_vlan == vp_admin->default_vlan && - vp_oper->state.default_qos == vp_admin->default_qos) + vp_oper->state.default_qos == vp_admin->default_qos && + vp_oper->state.vlan_proto == vp_admin->vlan_proto && + vp_oper->state.qos_vport == vp_admin->qos_vport) return 0; + if (!(priv->mfunc.master.slave_state[slave].active && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP)) { + /* even if the UPDATE_QP command isn't supported, we still want + * to set this VF link according to the admin directive + */ + return -1; + } + + mlx4_dbg(dev, "updating immediately admin params slave %d port %d\n", + slave, port); + mlx4_dbg(dev, "vlan %d QoS %d link down\n", + vp_admin->default_vlan, vp_admin->default_qos); + work = kzalloc(sizeof(*work), GFP_KERNEL); if (!work) return -ENOMEM; @@ -1781,17 +1880,17 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, vp_admin->default_vlan, &admin_vlan_ix); if (err) { - mlx4_warn((&priv->dev), + kfree(work); + mlx4_warn(&priv->dev, "No vlan resources slave %d, port %d\n", slave, port); - kfree(work); return err; } } else { admin_vlan_ix = NO_INDX; } work->flags |= MLX4_VF_IMMED_VLAN_FLAG_VLAN; - mlx4_dbg((&(priv->dev)), + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", (int)(vp_admin->default_vlan), admin_vlan_ix, slave, port); @@ -1810,13 +1909,20 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, vp_oper->state.default_vlan = vp_admin->default_vlan; vp_oper->state.default_qos = vp_admin->default_qos; + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.qos_vport = vp_admin->qos_vport; + + if (1 /* vp_admin->link_state == IFLA_VF_LINK_STATE_DISABLE */) + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE; /* iterate over QPs owned by this slave, using UPDATE_QP */ work->port = port; work->slave = slave; work->qos = vp_oper->state.default_qos; + work->qos_vport = vp_oper->state.qos_vport; work->vlan_id = vp_oper->state.default_vlan; work->vlan_ix = vp_oper->vlan_idx; + work->vlan_proto = vp_oper->state.vlan_proto; work->priv = priv; INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler); 
queue_work(priv->mfunc.master.comm_wq, &work->work); @@ -1824,28 +1930,110 @@ static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, return 0; } +static void mlx4_set_default_port_qos(struct mlx4_dev *dev, int port) +{ + struct mlx4_qos_manager *port_qos_ctl; + struct mlx4_priv *priv = mlx4_priv(dev); + + port_qos_ctl = &priv->mfunc.master.qos_ctl[port]; + bitmap_zero(port_qos_ctl->priority_bm, MLX4_NUM_UP); + + /* Enable only default prio at PF init routine */ + set_bit(MLX4_DEFAULT_QOS_PRIO, port_qos_ctl->priority_bm); +} + +static void mlx4_allocate_port_vpps(struct mlx4_dev *dev, int port) +{ + int i; + int err; + int num_vfs; + u16 availible_vpp; + u8 vpp_param[MLX4_NUM_UP]; + struct mlx4_qos_manager *port_qos; + struct mlx4_priv *priv = mlx4_priv(dev); + + err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param); + if (err) { + mlx4_info(dev, "Failed query availible VPPs\n"); + return; + } + + port_qos = &priv->mfunc.master.qos_ctl[port]; + num_vfs = (availible_vpp / + bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP)); + + for (i = 0; i < MLX4_NUM_UP; i++) { + if (test_bit(i, port_qos->priority_bm)) + vpp_param[i] = num_vfs; + } + + err = mlx4_ALLOCATE_VPP_set(dev, port, vpp_param); + if (err) { + mlx4_info(dev, "Failed allocating VPPs\n"); + return; + } + + /* Query actual allocated VPP, just to make sure */ + err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param); + if (err) { + mlx4_info(dev, "Failed query availible VPPs\n"); + return; + } + + port_qos->num_of_qos_vfs = num_vfs; + mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp); + + for (i = 0; i < MLX4_NUM_UP; i++) + mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i, + vpp_param[i]); +} static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) { int port, err; struct mlx4_vport_state *vp_admin; struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); - for (port = 1; port <= MLX4_MAX_PORTS; port++) { + for (port = min_port; port <= max_port; port++) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + priv->mfunc.master.vf_admin[slave].enable_smi[port]; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; - vp_oper->state = *vp_admin; + if (vp_admin->vlan_proto != htons(ETH_P_8021AD) || + slave_state->vst_qinq_supported) { + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + } + vp_oper->state.mac = vp_admin->mac; + vp_oper->state.spoofchk = vp_admin->spoofchk; + vp_oper->state.tx_rate = vp_admin->tx_rate; + vp_oper->state.qos_vport = vp_admin->qos_vport; + vp_oper->state.guid = vp_admin->guid; + if (MLX4_VGT != vp_admin->default_vlan) { err = __mlx4_register_vlan(&priv->dev, port, - vp_admin->default_vlan, &(vp_oper->vlan_idx)); + vp_admin->default_vlan, &(vp_oper->vlan_idx)); if (err) { vp_oper->vlan_idx = NO_INDX; - mlx4_warn((&priv->dev), - "No vlan resorces slave %d, port %d\n", + vp_oper->state.default_vlan = MLX4_VGT; + vp_oper->state.vlan_proto = htons(ETH_P_8021Q); + 
mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", slave, port); return err; } - mlx4_dbg((&(priv->dev)), "alloc vlan %d idx %d slave %d port %d\n", + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", (int)(vp_oper->state.default_vlan), vp_oper->vlan_idx, slave, port); } @@ -1856,12 +2044,12 @@ static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) if (0 > vp_oper->mac_idx) { err = vp_oper->mac_idx; vp_oper->mac_idx = NO_INDX; - mlx4_warn((&priv->dev), + mlx4_warn(&priv->dev, "No mac resources slave %d, port %d\n", slave, port); return err; } - mlx4_dbg((&(priv->dev)), "alloc mac %llx idx %d slave %d port %d\n", + mlx4_dbg(&priv->dev, "alloc mac %llx idx %d slave %d port %d\n", (unsigned long long) vp_oper->state.mac, vp_oper->mac_idx, slave, port); } } @@ -1872,8 +2060,19 @@ static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave { int port; struct mlx4_vport_oper_state *vp_oper; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); - for (port = 1; port <= MLX4_MAX_PORTS; port++) { + + for (port = min_port; port <= max_port; port++) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + MLX4_VF_SMI_DISABLED; vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (NO_INDX != vp_oper->vlan_idx) { __mlx4_unregister_vlan(&priv->dev, @@ -1901,14 +2100,15 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, slave_state[slave].comm_toggle ^= 1; reply = (u32) slave_state[slave].comm_toggle << 31; if (toggle != slave_state[slave].comm_toggle) { - mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER" - "STATE COMPROMISIED ***\n", toggle, slave); + mlx4_warn(dev, "Incorrect toggle %d from slave %d. 
*** MASTER STATE COMPROMISED ***\n", + toggle, slave); goto reset_slave; } if (cmd == MLX4_COMM_CMD_RESET) { mlx4_warn(dev, "Received reset from slave:%d\n", slave); slave_state[slave].active = false; slave_state[slave].old_vlan_api = false; + slave_state[slave].vst_qinq_supported = false; mlx4_master_deactivate_admin_state(priv, slave); for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) { slave_state[slave].event_eq[i].eqn = -1; @@ -1929,8 +2129,8 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, /*command from slave in the middle of FLR*/ if (cmd != MLX4_COMM_CMD_RESET && MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) { - mlx4_warn(dev, "slave:%d is Trying to run cmd (0x%x) " - "in the middle of FLR\n", slave, cmd); + mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) in the middle of FLR\n", + slave, cmd); return; } @@ -1962,20 +2162,23 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, break; case MLX4_COMM_CMD_VHCR_POST: if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && - (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) { + mlx4_warn(dev, "slave:%d is out of sync, cmd=0x%x, last command=0x%x, reset is needed\n", + slave, cmd, slave_state[slave].last_cmd); goto reset_slave; + } mutex_lock(&priv->cmd.slave_cmd_mutex); if (mlx4_master_process_vhcr(dev, slave, NULL)) { - mlx4_err(dev, "Failed processing vhcr for slave: %d," - " resetting slave.\n", slave); + mlx4_err(dev, "Failed processing vhcr for slave:%d, resetting slave\n", + slave); mutex_unlock(&priv->cmd.slave_cmd_mutex); goto reset_slave; } mutex_unlock(&priv->cmd.slave_cmd_mutex); break; default: - mlx4_warn(dev, "Bad comm cmd: %d from slave: %d\n", cmd, slave); + mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave); goto reset_slave; } spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); @@ -1985,8 +2188,7 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, is_going_down = 1; spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); if (is_going_down) { - mlx4_warn(dev, "Slave is going down aborting command (%d)" - " executing from slave: %d\n", + mlx4_warn(dev, "Slave is going down aborting command(%d) executing from slave:%d\n", cmd, slave); return; } @@ -1998,13 +2200,26 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, reset_slave: /* cleanup any slave resources */ - mlx4_delete_all_resources_for_slave(dev, slave); + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, slave); + + if (cmd != MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Turn on internal error to force reset, slave=%d, cmd=0x%x\n", + slave, cmd); + /* Turn on internal error letting slave reset itself immeditaly, + * otherwise it might take till timeout on command is passed + */ + reply |= ((u32)COMM_CHAN_EVENT_INTERNAL_ERR); + } + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); if (!slave_state[slave].is_slave_going_down) slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); /*with slave in the middle of flr, no need to clean resources again.*/ inform_slave_state: + memset(&slave_state[slave].event_eq, 0, + sizeof(struct mlx4_slave_event_eq_info)); __raw_writel((__force u32) cpu_to_be32(reply), &priv->mfunc.comm[slave].slave_read); wmb(); @@ -2047,10 +2262,9 @@ void mlx4_master_comm_channel(struct work_struct *work) if (toggle != 
slt) { if (master->slave_state[slave].comm_toggle != slt) { - mlx4_info(dev, "slave %d out of sync." - " read toggle %d, state toggle %d. " - "Resynching.\n", slave, slt, - master->slave_state[slave].comm_toggle); + pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n", + slave, slt, + master->slave_state[slave].comm_toggle); master->slave_state[slave].comm_toggle = slt; } @@ -2058,30 +2272,13 @@ void mlx4_master_comm_channel(struct work_struct *work) comm_cmd >> 16 & 0xff, comm_cmd & 0xffff, toggle); ++served; - } else - mlx4_err(dev, "slave %d out of sync." - " read toggle %d, write toggle %d.\n", slave, slt, - toggle); + } } } if (reported && reported != served) - mlx4_warn(dev, "Got command event with bitmask from %d slaves" - " but %d were served\n", + mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n", reported, served); -} -/* master command processing */ -void mlx4_master_arm_comm_channel(struct work_struct *work) -{ - struct mlx4_mfunc_master_ctx *master = - container_of(work, - struct mlx4_mfunc_master_ctx, - arm_comm_work); - struct mlx4_mfunc *mfunc = - container_of(master, struct mlx4_mfunc, master); - struct mlx4_priv *priv = - container_of(mfunc, struct mlx4_priv, mfunc); - struct mlx4_dev *dev = &priv->dev; if (mlx4_ARM_COMM_CHANNEL(dev)) mlx4_warn(dev, "Failed to arm comm channel events\n"); @@ -2090,17 +2287,28 @@ void mlx4_master_arm_comm_channel(struct work_struct *work) static int sync_toggles(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int wr_toggle; - int rd_toggle; + u32 wr_toggle; + u32 rd_toggle; unsigned long end; - wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31; - end = jiffies + msecs_to_jiffies(5000); + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)); + if (wr_toggle == 0xffffffff) + end = jiffies + msecs_to_jiffies(30000); + else + end = jiffies + msecs_to_jiffies(5000); while (time_before(jiffies, end)) { - rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31; - if (rd_toggle == wr_toggle) { - priv->cmd.comm_toggle = rd_toggle; + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); + if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { + /* PCI might be offline */ + msleep(100); + wr_toggle = swab32(readl(&priv->mfunc.comm-> + slave_write)); + continue; + } + + if (rd_toggle >> 31 == wr_toggle >> 31) { + priv->cmd.comm_toggle = rd_toggle >> 31; return 0; } @@ -2129,18 +2337,22 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (mlx4_is_master(dev)) priv->mfunc.comm = - ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) + + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.comm_bar) + priv->fw.comm_base, MLX4_COMM_PAGESIZE); else priv->mfunc.comm = - ioremap(pci_resource_start(dev->pdev, 2) + + ioremap(pci_resource_start(dev->persist->pdev, 2) + MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE); if (!priv->mfunc.comm) { - mlx4_err(dev, "Couldn't map communication vector.\n"); + mlx4_err(dev, "Couldn't map communication vector\n"); goto err_vhcr; } if (mlx4_is_master(dev)) { + struct mlx4_vf_oper_state *vf_oper; + struct mlx4_vf_admin_state *vf_admin; + priv->mfunc.master.slave_state = kzalloc(dev->num_slaves * sizeof(struct mlx4_slave_state), GFP_KERNEL); @@ -2160,8 +2372,11 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) goto err_comm_oper; for (i = 0; i < dev->num_slaves; ++i) { + vf_admin = &priv->mfunc.master.vf_admin[i]; + vf_oper = &priv->mfunc.master.vf_oper[i]; s_state = &priv->mfunc.master.slave_state[i]; 
s_state->last_cmd = MLX4_COMM_CMD_RESET; + s_state->vst_qinq_supported = false; mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]); for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j) s_state->event_eq[j].eqn = -1; @@ -2171,6 +2386,9 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) &priv->mfunc.comm[i].slave_read); mmiowb(); for (port = 1; port <= MLX4_MAX_PORTS; port++) { + struct mlx4_vport_state *admin_vport; + struct mlx4_vport_state *oper_vport; + s_state->vlan_filter[port] = kzalloc(sizeof(struct mlx4_vlan_fltr), GFP_KERNEL); @@ -2179,21 +2397,37 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) kfree(s_state->vlan_filter[port]); goto err_slaves; } + + admin_vport = &vf_admin->vport[port]; + oper_vport = &vf_oper->vport[port].state; INIT_LIST_HEAD(&s_state->mcast_filters[port]); - priv->mfunc.master.vf_admin[i].vport[port].default_vlan = MLX4_VGT; - priv->mfunc.master.vf_oper[i].vport[port].state.default_vlan = MLX4_VGT; - priv->mfunc.master.vf_oper[i].vport[port].vlan_idx = NO_INDX; - priv->mfunc.master.vf_oper[i].vport[port].mac_idx = NO_INDX; + admin_vport->default_vlan = MLX4_VGT; + oper_vport->default_vlan = MLX4_VGT; + admin_vport->qos_vport = + MLX4_VPP_DEFAULT_VPORT; + oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; + admin_vport->vlan_proto = htons(ETH_P_8021Q); + oper_vport->vlan_proto = htons(ETH_P_8021Q); + vf_oper->vport[port].vlan_idx = NO_INDX; + vf_oper->vport[port].mac_idx = NO_INDX; + mlx4_set_random_admin_guid(dev, i, port); } spin_lock_init(&s_state->lock); } - memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size); + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP) { + for (port = 1; port <= dev->caps.num_ports; port++) { + if (mlx4_is_eth(dev, port)) { + mlx4_set_default_port_qos(dev, port); + mlx4_allocate_port_vpps(dev, port); + } + } + } + + memset(&priv->mfunc.master.cmd_eqe, 0, sizeof(struct mlx4_eqe)); priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD; INIT_WORK(&priv->mfunc.master.comm_work, mlx4_master_comm_channel); - INIT_WORK(&priv->mfunc.master.arm_comm_work, - mlx4_master_arm_comm_channel); INIT_WORK(&priv->mfunc.master.slave_event_work, mlx4_gen_slave_eqe); INIT_WORK(&priv->mfunc.master.slave_flr_event_work, @@ -2208,13 +2442,6 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (mlx4_init_resource_tracker(dev)) goto err_thread; - err = mlx4_ARM_COMM_CHANNEL(dev); - if (err) { - mlx4_err(dev, " Failed to arm comm channel eq: %x\n", - err); - goto err_resource; - } - } else { err = sync_toggles(dev); if (err) { @@ -2224,13 +2451,11 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) } return 0; -err_resource: - mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL); err_thread: flush_workqueue(priv->mfunc.master.comm_wq); destroy_workqueue(priv->mfunc.master.comm_wq); err_slaves: - while (--i) { + while (i--) { for (port = 1; port <= MLX4_MAX_PORTS; port++) kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); } @@ -2241,10 +2466,11 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) kfree(priv->mfunc.master.slave_state); err_comm: iounmap(priv->mfunc.comm); + priv->mfunc.comm = NULL; err_vhcr: - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, - priv->mfunc.vhcr, - priv->mfunc.vhcr_dma); + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); priv->mfunc.vhcr = NULL; return -ENOMEM; } @@ -2252,55 +2478,85 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) int mlx4_cmd_init(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); + int flags = 0; - mutex_init(&priv->cmd.hcr_mutex); - 
mutex_init(&priv->cmd.slave_cmd_mutex); - sema_init(&priv->cmd.poll_sem, 1); - priv->cmd.use_events = 0; - priv->cmd.toggle = 1; - - priv->cmd.hcr = NULL; - priv->mfunc.vhcr = NULL; - - if (!mlx4_is_slave(dev)) { - priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + - MLX4_HCR_BASE, MLX4_HCR_SIZE); - if (!priv->cmd.hcr) { - mlx4_err(dev, "Couldn't map command register.\n"); - return -ENOMEM; - } + if (!priv->cmd.initialized) { + init_rwsem(&priv->cmd.switch_sem); + mutex_init(&priv->cmd.slave_cmd_mutex); + sema_init(&priv->cmd.poll_sem, 1); + priv->cmd.use_events = 0; + priv->cmd.toggle = 1; + priv->cmd.initialized = 1; + flags |= MLX4_CMD_CLEANUP_STRUCT; } - if (mlx4_is_mfunc(dev)) { - priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE, + if (!mlx4_is_slave(dev) && !priv->cmd.hcr) { + priv->cmd.hcr = ioremap(pci_resource_start(dev->persist->pdev, + 0) + MLX4_HCR_BASE, MLX4_HCR_SIZE); + if (!priv->cmd.hcr) { + mlx4_err(dev, "Couldn't map command register\n"); + goto err; + } + flags |= MLX4_CMD_CLEANUP_HCR; + } + + if (mlx4_is_mfunc(dev) && !priv->mfunc.vhcr) { + priv->mfunc.vhcr = dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, &priv->mfunc.vhcr_dma, GFP_KERNEL); - if (!priv->mfunc.vhcr) { - mlx4_err(dev, "Couldn't allocate VHCR.\n"); - goto err_hcr; - } + if (!priv->mfunc.vhcr) + goto err; + + flags |= MLX4_CMD_CLEANUP_VHCR; } - priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev, - MLX4_MAILBOX_SIZE, - MLX4_MAILBOX_SIZE, 0); - if (!priv->cmd.pool) - goto err_vhcr; + if (!priv->cmd.pool) { + priv->cmd.pool = pci_pool_create("mlx4_cmd", + dev->persist->pdev, + MLX4_MAILBOX_SIZE, + MLX4_MAILBOX_SIZE, 0); + if (!priv->cmd.pool) + goto err; + + flags |= MLX4_CMD_CLEANUP_POOL; + } return 0; -err_vhcr: - if (mlx4_is_mfunc(dev)) - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, - priv->mfunc.vhcr, priv->mfunc.vhcr_dma); - priv->mfunc.vhcr = NULL; - -err_hcr: - if (!mlx4_is_slave(dev)) - iounmap(priv->cmd.hcr); +err: + mlx4_cmd_cleanup(dev, flags); return -ENOMEM; } +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int slave; + u32 slave_read; + + /* If the comm channel has not yet been initialized, + * skip reporting the internal error event to all + * the communication channels. + */ + if (!priv->mfunc.comm) + return; + + /* Report an internal error event to all + * communication channels. + */ + for (slave = 0; slave < dev->num_slaves; slave++) { + slave_read = swab32(readl(&priv->mfunc.comm[slave].slave_read)); + slave_read |= (u32)COMM_CHAN_EVENT_INTERNAL_ERR; + __raw_writel((__force u32)cpu_to_be32(slave_read), + &priv->mfunc.comm[slave].slave_read); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. 
+ */ + mmiowb(); + } +} + void mlx4_multi_func_cleanup(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -2316,23 +2572,35 @@ void mlx4_multi_func_cleanup(struct mlx4_dev *dev) kfree(priv->mfunc.master.slave_state); kfree(priv->mfunc.master.vf_admin); kfree(priv->mfunc.master.vf_oper); + dev->num_slaves = 0; } iounmap(priv->mfunc.comm); + priv->mfunc.comm = NULL; } -void mlx4_cmd_cleanup(struct mlx4_dev *dev) +void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask) { struct mlx4_priv *priv = mlx4_priv(dev); - pci_pool_destroy(priv->cmd.pool); + if (priv->cmd.pool && (cleanup_mask & MLX4_CMD_CLEANUP_POOL)) { + pci_pool_destroy(priv->cmd.pool); + priv->cmd.pool = NULL; + } - if (!mlx4_is_slave(dev)) + if (!mlx4_is_slave(dev) && priv->cmd.hcr && + (cleanup_mask & MLX4_CMD_CLEANUP_HCR)) { iounmap(priv->cmd.hcr); - if (mlx4_is_mfunc(dev)) - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->cmd.hcr = NULL; + } + if (mlx4_is_mfunc(dev) && priv->mfunc.vhcr && + (cleanup_mask & MLX4_CMD_CLEANUP_VHCR)) { + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, priv->mfunc.vhcr, priv->mfunc.vhcr_dma); - priv->mfunc.vhcr = NULL; + priv->mfunc.vhcr = NULL; + } + if (priv->cmd.initialized && (cleanup_mask & MLX4_CMD_CLEANUP_STRUCT)) + priv->cmd.initialized = 0; } /* @@ -2351,16 +2619,21 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) if (!priv->cmd.context) return -ENOMEM; + down_write(&priv->cmd.switch_sem); for (i = 0; i < priv->cmd.max_cmds; ++i) { priv->cmd.context[i].token = i; priv->cmd.context[i].next = i + 1; + /* To support fatal error flow, initialize all + * cmd contexts to allow simulating completions + * with complete() at any time. + */ + init_completion(&priv->cmd.context[i].done); } priv->cmd.context[priv->cmd.max_cmds - 1].next = -1; priv->cmd.free_head = 0; sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds); - spin_lock_init(&priv->cmd.context_lock); for (priv->cmd.token_mask = 1; priv->cmd.token_mask < priv->cmd.max_cmds; @@ -2370,6 +2643,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) down(&priv->cmd.poll_sem); priv->cmd.use_events = 1; + up_write(&priv->cmd.switch_sem); return err; } @@ -2382,6 +2656,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int i; + down_write(&priv->cmd.switch_sem); priv->cmd.use_events = 0; for (i = 0; i < priv->cmd.max_cmds; ++i) @@ -2390,6 +2665,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev) kfree(priv->cmd.context); up(&priv->cmd.poll_sem); + up_write(&priv->cmd.switch_sem); } struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev) @@ -2431,14 +2707,254 @@ u32 mlx4_comm_get_version(void) static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) { - if ((vf < 0) || (vf >= dev->num_vfs)) { - mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", vf, dev->num_vfs); + if ((vf < 0) || (vf >= dev->persist->num_vfs)) { + mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", + vf, dev->persist->num_vfs); return -EINVAL; } - return (vf+1); + + return vf+1; } -int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac) +int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) +{ + if (slave < 1 || slave > dev->persist->num_vfs) { + mlx4_err(dev, + "Bad slave number:%d (number of activated slaves: %lu)\n", + slave, dev->num_slaves); + return -EINVAL; + } + return slave - 1; +} + +void mlx4_cmd_wake_completions(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_context *context; + int i; + + 
spin_lock(&priv->cmd.context_lock); + if (priv->cmd.context) { + for (i = 0; i < priv->cmd.max_cmds; ++i) { + context = &priv->cmd.context[i]; + context->fw_status = CMD_STAT_INTERNAL_ERR; + context->result = + mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + complete(&context->done); + } + } + spin_unlock(&priv->cmd.context_lock); +} + +struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave) +{ + struct mlx4_active_ports actv_ports; + int vf; + + bitmap_zero(actv_ports.ports, MLX4_MAX_PORTS); + + if (slave == 0) { + bitmap_fill(actv_ports.ports, dev->caps.num_ports); + return actv_ports; + } + + vf = mlx4_get_vf_indx(dev, slave); + if (vf < 0) + return actv_ports; + + bitmap_set(actv_ports.ports, dev->dev_vfs[vf].min_port - 1, + min((int)dev->dev_vfs[mlx4_get_vf_indx(dev, slave)].n_ports, + dev->caps.num_ports)); + + return actv_ports; +} +EXPORT_SYMBOL_GPL(mlx4_get_active_ports); + +int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port) +{ + unsigned n; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + unsigned m = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + + if (port <= 0 || port > m) + return -EINVAL; + + n = find_first_bit(actv_ports.ports, dev->caps.num_ports); + if (port <= n) + port = n + 1; + + return port; +} +EXPORT_SYMBOL_GPL(mlx4_slave_convert_port); + +int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + if (test_bit(port - 1, actv_ports.ports)) + return port - + find_first_bit(actv_ports.ports, dev->caps.num_ports); + + return -1; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slave_port); + +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, + int port) +{ + unsigned i; + struct mlx4_slaves_pport slaves_pport; + + bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); + + if (port <= 0 || port > dev->caps.num_ports) + return slaves_pport; + + for (i = 0; i < dev->persist->num_vfs + 1; i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, i); + if (test_bit(port - 1, actv_ports.ports)) + set_bit(i, slaves_pport.slaves); + } + + return slaves_pport; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport); + +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( + struct mlx4_dev *dev, + const struct mlx4_active_ports *crit_ports) +{ + unsigned i; + struct mlx4_slaves_pport slaves_pport; + + bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); + + for (i = 0; i < dev->persist->num_vfs + 1; i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, i); + if (bitmap_equal(crit_ports->ports, actv_ports.ports, + dev->caps.num_ports)) + set_bit(i, slaves_pport.slaves); + } + + return slaves_pport; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport_actv); + +static int mlx4_slaves_closest_port(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + int min_port = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + 1; + int max_port = min_port + + bitmap_weight(actv_ports.ports, dev->caps.num_ports); + + if (port < min_port) + port = min_port; + else if (port >= max_port) + port = max_port - 1; + + return port; +} + +static int mlx4_set_vport_qos(struct mlx4_priv *priv, int slave, int port, + int max_tx_rate) +{ + int i; + int err; + struct mlx4_qos_manager *port_qos; + struct mlx4_dev *dev = &priv->dev; + struct mlx4_vport_qos_param vpp_qos[MLX4_NUM_UP]; + + port_qos = &priv->mfunc.master.qos_ctl[port]; + 
memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP); + + if (slave > port_qos->num_of_qos_vfs) { + mlx4_info(dev, "No availible VPP resources for this VF\n"); + return -EINVAL; + } + + /* Query for default QoS values from Vport 0 is needed */ + err = mlx4_SET_VPORT_QOS_get(dev, port, 0, vpp_qos); + if (err) { + mlx4_info(dev, "Failed to query Vport 0 QoS values\n"); + return err; + } + + for (i = 0; i < MLX4_NUM_UP; i++) { + if (test_bit(i, port_qos->priority_bm) && max_tx_rate) { + vpp_qos[i].max_avg_bw = max_tx_rate; + vpp_qos[i].enable = 1; + } else { + /* if user supplied tx_rate == 0, meaning no rate limit + * configuration is required. so we are leaving the + * value of max_avg_bw as queried from Vport 0. + */ + vpp_qos[i].enable = 0; + } + } + + err = mlx4_SET_VPORT_QOS_set(dev, port, slave, vpp_qos); + if (err) { + mlx4_info(dev, "Failed to set Vport %d QoS values\n", slave); + return err; + } + + return 0; +} + +static bool mlx4_is_vf_vst_and_prio_qos(struct mlx4_dev *dev, int port, + struct mlx4_vport_state *vf_admin) +{ + struct mlx4_qos_manager *info; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP)) + return false; + + info = &priv->mfunc.master.qos_ctl[port]; + + if (vf_admin->default_vlan != MLX4_VGT && + test_bit(vf_admin->default_qos, info->priority_bm)) + return true; + + return false; +} + +static bool mlx4_valid_vf_state_change(struct mlx4_dev *dev, int port, + struct mlx4_vport_state *vf_admin, + int vlan, int qos) +{ + struct mlx4_vport_state dummy_admin = {0}; + + if (!mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin) || + !vf_admin->tx_rate) + return true; + + dummy_admin.default_qos = qos; + dummy_admin.default_vlan = vlan; + + /* VF wants to move to other VST state which is valid with current + * rate limit. Either differnt default vlan in VST or other + * supported QoS priority. Otherwise we don't allow this change when + * the TX rate is still configured. + */ + if (mlx4_is_vf_vst_and_prio_qos(dev, port, &dummy_admin)) + return true; + + mlx4_info(dev, "Cannot change VF state to %s while rate is set\n", + (vlan == MLX4_VGT) ? 
"VGT" : "VST"); + + if (vlan != MLX4_VGT) + mlx4_info(dev, "VST priority %d not supported for QoS\n", qos); + + mlx4_info(dev, "Please set rate to 0 prior to this VF state change\n"); + + return false; +} + +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_vport_state *s_info; @@ -2451,19 +2967,23 @@ int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac) if (slave < 0) return -EINVAL; + port = mlx4_slaves_closest_port(dev, slave, port); s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; - s_info->mac = mlx4_mac_to_u64(mac); - mlx4_info(dev, "default mac on vf %d port %d to %llX will take afect only after vf restart\n", - vf, port, (unsigned long long) s_info->mac); + s_info->mac = mac; + mlx4_info(dev, "default mac on vf %d port %d to %llX will take effect only after vf restart\n", + vf, port, (unsigned long long)s_info->mac); return 0; } EXPORT_SYMBOL_GPL(mlx4_set_vf_mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) + +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos, + __be16 proto) { struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_vport_oper_state *vf_oper; struct mlx4_vport_state *vf_admin; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_oper_state *vf_oper; int slave; if ((!mlx4_is_master(dev)) || @@ -2473,38 +2993,137 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) if ((vlan > 4095) || (qos > 7)) return -EINVAL; + if (proto == htons(ETH_P_8021AD) && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP)) + return -EPROTONOSUPPORT; + + if (proto != htons(ETH_P_8021Q) && + proto != htons(ETH_P_8021AD)) + return -EINVAL; + + if ((proto == htons(ETH_P_8021AD)) && + ((vlan == 0) || (vlan == MLX4_VGT))) + return -EINVAL; + slave = mlx4_get_slave_indx(dev, vf); if (slave < 0) return -EINVAL; + slave_state = &priv->mfunc.master.slave_state[slave]; + if ((proto == htons(ETH_P_8021AD)) && (slave_state->active) && + (!slave_state->vst_qinq_supported)) { + mlx4_err(dev, "vf %d does not support VST QinQ mode\n", vf); + return -EPROTONOSUPPORT; + } + port = mlx4_slaves_closest_port(dev, slave, port); vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; vf_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + if (!mlx4_valid_vf_state_change(dev, port, vf_admin, vlan, qos)) + return -EPERM; + if ((0 == vlan) && (0 == qos)) vf_admin->default_vlan = MLX4_VGT; else vf_admin->default_vlan = vlan; vf_admin->default_qos = qos; + vf_admin->vlan_proto = proto; - if (priv->mfunc.master.slave_state[slave].active && - dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) { - mlx4_info(dev, "updating vf %d port %d config params immediately\n", + /* If rate was configured prior to VST, we saved the configured rate + * in vf_admin->rate and now, if priority supported we enforce the QoS + */ + if (mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin) && + vf_admin->tx_rate) + vf_admin->qos_vport = slave; + + /* Try to activate new vf state without restart, + * this option is not supported while moving to VST QinQ mode. 
+ */ + if ((proto == htons(ETH_P_8021AD) && + vf_oper->state.vlan_proto != proto) || + mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + mlx4_info(dev, + "updating vf %d port %d config will take effect on next VF restart\n", vf, port); - mlx4_master_immediate_activate_vlan_qos(priv, slave, port); - } return 0; } EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan); +int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, + int max_tx_rate) +{ + int err; + int slave; + struct mlx4_vport_state *vf_admin; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP)) + return -EPROTONOSUPPORT; + + if (min_tx_rate) { + mlx4_info(dev, "Minimum BW share not supported\n"); + return -EPROTONOSUPPORT; + } + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + err = mlx4_set_vport_qos(priv, slave, port, max_tx_rate); + if (err) { + mlx4_info(dev, "vf %d failed to set rate %d\n", vf, + max_tx_rate); + return err; + } + + vf_admin->tx_rate = max_tx_rate; + /* if VF is not in supported mode (VST with supported prio), + * we do not change vport configuration for its QPs, but save + * the rate, so it will be enforced when it moves to supported + * mode next time. + */ + if (!mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin)) { + mlx4_info(dev, + "rate set for VF %d when not in valid state\n", vf); + + if (vf_admin->default_vlan != MLX4_VGT) + mlx4_info(dev, "VST priority not supported by QoS\n"); + else + mlx4_info(dev, "VF in VGT mode (needed VST)\n"); + + mlx4_info(dev, + "rate %d take affect when VF moves to valid state\n", + max_tx_rate); + return 0; + } + + /* If user sets rate 0 assigning default vport for its QPs */ + vf_admin->qos_vport = max_tx_rate ? 
slave : MLX4_VPP_DEFAULT_VPORT; + + if (priv->mfunc.master.slave_state[slave].active && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) + mlx4_master_immediate_activate_vlan_qos(priv, slave, port); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_rate); + /* mlx4_get_slave_default_vlan - - * retrun true if VST ( default vlan) - * if VST will fill vlan & qos (if not NULL) */ -bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos) + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos) { struct mlx4_vport_oper_state *vp_oper; struct mlx4_priv *priv; priv = mlx4_priv(dev); + port = mlx4_slaves_closest_port(dev, slave, port); vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (MLX4_VGT != vp_oper->state.default_vlan) { @@ -2532,6 +3151,7 @@ int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting) if (slave < 0) return -EINVAL; + port = mlx4_slaves_closest_port(dev, slave, port); s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; s_info->spoofchk = setting; @@ -2539,68 +3159,117 @@ int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting) } EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk); -int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state) +int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index, + struct mlx4_counter *counter_stats, int reset) +{ + struct mlx4_cmd_mailbox *mailbox = NULL; + struct mlx4_counter *tmp_counter; + int err; + u32 if_stat_in_mod; + + if (!counter_stats) + return -EINVAL; + + if (counter_index == MLX4_SINK_COUNTER_INDEX(dev)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memset(mailbox->buf, 0, sizeof(struct mlx4_counter)); + if_stat_in_mod = counter_index; + if (reset) + if_stat_in_mod |= MLX4_QUERY_IF_STAT_RESET; + err = mlx4_cmd_box(dev, 0, mailbox->dma, + if_stat_in_mod, 0, + MLX4_CMD_QUERY_IF_STAT, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) { + mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n", + __func__, counter_index); + goto if_stat_out; + } + tmp_counter = (struct mlx4_counter *)mailbox->buf; + counter_stats->counter_mode = tmp_counter->counter_mode; + if (counter_stats->counter_mode == 0) { + counter_stats->rx_frames = + cpu_to_be64(be64_to_cpu(counter_stats->rx_frames) + + be64_to_cpu(tmp_counter->rx_frames)); + counter_stats->tx_frames = + cpu_to_be64(be64_to_cpu(counter_stats->tx_frames) + + be64_to_cpu(tmp_counter->tx_frames)); + counter_stats->rx_bytes = + cpu_to_be64(be64_to_cpu(counter_stats->rx_bytes) + + be64_to_cpu(tmp_counter->rx_bytes)); + counter_stats->tx_bytes = + cpu_to_be64(be64_to_cpu(counter_stats->tx_bytes) + + be64_to_cpu(tmp_counter->tx_bytes)); + } + +if_stat_out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_get_counter_stats); + +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) { struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_vport_state *s_info; - struct mlx4_vport_oper_state *vp_oper; - int slave; - u8 link_stat_event; - slave = mlx4_get_slave_indx(dev, vf); - if (slave < 0) + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_oper[slave].smi_enabled[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); + +int 
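mlx4_get_counter_stats() above folds each QUERY_IF_STAT sample into caller-supplied counters that stay in the big-endian layout of the firmware mailbox, so every addition is a convert, add, convert-back sequence. A minimal sketch of that step using the FreeBSD <sys/endian.h> helpers (the function name is illustrative):

#include <stdint.h>
#include <sys/endian.h>         /* be64toh()/htobe64() */

/* acc and sample are both stored big-endian, as in the QUERY_IF_STAT mailbox */
static void accumulate_be64(uint64_t *acc, uint64_t sample)
{
        *acc = htobe64(be64toh(*acc) + be64toh(sample));
}

int main(void)
{
        uint64_t acc = 0;

        accumulate_be64(&acc, htobe64(1000));
        accumulate_be64(&acc, htobe64(234));
        return be64toh(acc) == 1234 ? 0 : 1;
}

Keeping the accumulator in mailbox byte order lets callers treat the result exactly like a freshly queried struct mlx4_counter.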
mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 1; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_admin[slave].enable_smi[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin); + +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enabled) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); + + if (slave == mlx4_master_func_num(dev)) + return 0; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS || + enabled < 0 || enabled > 1) return -EINVAL; - switch (link_state) { - case IFLA_VF_LINK_STATE_AUTO: - /* get link curent state */ - if (!priv->sense.do_sense_port[port]) - link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE; - else - link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN; - break; - - case IFLA_VF_LINK_STATE_ENABLE: - link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE; - break; - - case IFLA_VF_LINK_STATE_DISABLE: - link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN; - break; - - default: - mlx4_warn(dev, "unknown value for link_state %02x on slave %d port %d\n", - link_state, slave, port); - return -EINVAL; + if (min_port == max_port && dev->caps.num_ports > 1) { + mlx4_info(dev, "SMI access disallowed for single ported VFs\n"); + return -EPROTONOSUPPORT; } - /* update the admin & oper state on the link state */ - s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; - vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; - s_info->link_state = link_state; - vp_oper->state.link_state = link_state; - /* send event */ - mlx4_gen_port_state_change_eqe(dev, slave, port, link_stat_event); + priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled; return 0; } -EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); - -int mlx4_get_vf_link_state(struct mlx4_dev *dev, int port, int vf) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_vport_state *s_info; - int slave; - - if (!mlx4_is_master(dev)) - return -EPROTONOSUPPORT; - - slave = mlx4_get_slave_indx(dev, vf); - if (slave < 0) - return -EINVAL; - - s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; - - return s_info->link_state; -} -EXPORT_SYMBOL_GPL(mlx4_get_vf_link_state); +EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin); diff --git a/sys/dev/mlx4/mlx4_core/mlx4_cq.c b/sys/dev/mlx4/mlx4_core/mlx4_cq.c index 9c13820bfa96..4905c8d0a657 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_cq.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_cq.c @@ -53,18 +53,10 @@ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) { - struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; struct mlx4_cq *cq; - read_lock(&cq_table->cq_table_lock); - cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree, cqn & (dev->caps.num_cqs - 1)); - if (cq) - atomic_inc(&cq->refcount); - - read_unlock(&cq_table->cq_table_lock); - if (!cq) { mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn); return; @@ -73,9 +65,6 @@ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) ++cq->arm_sn; cq->comp(cq); - - if (atomic_dec_and_test(&cq->refcount)) - complete(&cq->free); } void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, 
int event_type) @@ -83,13 +72,13 @@ void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type) struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; struct mlx4_cq *cq; - read_lock(&cq_table->cq_table_lock); + spin_lock(&cq_table->lock); cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1)); if (cq) atomic_inc(&cq->refcount); - read_unlock(&cq_table->cq_table_lock); + spin_unlock(&cq_table->lock); if (!cq) { mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn); @@ -137,8 +126,6 @@ int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, return PTR_ERR(mailbox); cq_context = mailbox->buf; - memset(cq_context, 0, sizeof *cq_context); - cq_context->cq_max_count = cpu_to_be16(count); cq_context->cq_period = cpu_to_be16(period); @@ -162,8 +149,6 @@ int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, return PTR_ERR(mailbox); cq_context = mailbox->buf; - memset(cq_context, 0, sizeof *cq_context); - cq_context->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24); cq_context->log_page_size = mtt->page_shift - 12; mtt_addr = mlx4_mtt_addr(dev, mtt); @@ -177,28 +162,6 @@ int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, } EXPORT_SYMBOL_GPL(mlx4_cq_resize); -int mlx4_cq_ignore_overrun(struct mlx4_dev *dev, struct mlx4_cq *cq) -{ - struct mlx4_cmd_mailbox *mailbox; - struct mlx4_cq_context *cq_context; - int err; - - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - - cq_context = mailbox->buf; - memset(cq_context, 0, sizeof *cq_context); - - cq_context->flags |= cpu_to_be32(MLX4_CQ_FLAG_OI); - - err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 3); - - mlx4_free_cmd_mailbox(dev, mailbox); - return err; -} -EXPORT_SYMBOL_GPL(mlx4_cq_ignore_overrun); - int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -209,11 +172,11 @@ int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) if (*cqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &cq_table->table, *cqn); + err = mlx4_table_get(dev, &cq_table->table, *cqn, GFP_KERNEL); if (err) goto err_out; - err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn); + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, GFP_KERNEL); if (err) goto err_put; return 0; @@ -283,7 +246,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, u64 mtt_addr; int err; - if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool) + if (vector >= dev->caps.num_comp_vectors) return -EINVAL; cq->vector = vector; @@ -305,14 +268,14 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, } cq_context = mailbox->buf; - memset(cq_context, 0, sizeof *cq_context); - cq_context->flags = cpu_to_be32(!!collapsed << 18); if (timestamp_en) cq_context->flags |= cpu_to_be32(1 << 19); - cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index); - cq_context->comp_eqn = priv->eq_table.eq[vector].eqn; + cq_context->logsize_usrpage = + cpu_to_be32((ilog2(nent) << 24) | + mlx4_to_hw_uar_index(dev, uar->index)); + cq_context->comp_eqn = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn; cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; mtt_addr = mlx4_mtt_addr(dev, mtt); @@ -331,9 +294,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, atomic_set(&cq->refcount, 1); init_completion(&cq->free); - cq->eqn = priv->eq_table.eq[cq->vector].eqn; - cq->irq = priv->eq_table.eq[cq->vector].irq; - + cq->irq = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].irq; return 0; err_radix: @@ -358,7 +319,10 @@ void mlx4_cq_free(struct 
mlx4_dev *dev, struct mlx4_cq *cq) if (err) mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn); - synchronize_irq(priv->eq_table.eq[cq->vector].irq); + synchronize_irq(priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq); + if (priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq != + priv->eq_table.eq[MLX4_EQ_ASYNC].irq) + synchronize_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq); spin_lock_irq(&cq_table->lock); radix_tree_delete(&cq_table->tree, cq->cqn); @@ -378,7 +342,6 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) int err; spin_lock_init(&cq_table->lock); - rwlock_init(&cq_table->cq_table_lock); INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); if (mlx4_is_slave(dev)) return 0; diff --git a/sys/dev/mlx4/mlx4_core/mlx4_eq.c b/sys/dev/mlx4/mlx4_core/mlx4_eq.c index 4765ea1d2437..5b331f9a34b9 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_eq.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_eq.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -102,21 +103,24 @@ static void eq_set_ci(struct mlx4_eq *eq, int req_not) mb(); } -static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor) +static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor, + u8 eqe_size) { /* (entry & (eq->nent - 1)) gives us a cyclic array */ - unsigned long offset = (entry & (eq->nent - 1)) * (MLX4_EQ_ENTRY_SIZE << eqe_factor); - /* CX3 is capable of extending the EQE from 32 to 64 bytes. - * When this feature is enabled, the first (in the lower addresses) + unsigned long offset = (entry & (eq->nent - 1)) * eqe_size; + /* CX3 is capable of extending the EQE from 32 to 64 bytes with + * strides of 64B,128B and 256B. + * When 64B EQE is used, the first (in the lower addresses) * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes * contain the legacy EQE information. + * In all other cases, the first 32B contains the legacy EQE info. */ return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE; } -static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor) +static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor, u8 size) { - struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor); + struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor, size); return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? 
NULL : eqe; } @@ -141,24 +145,47 @@ void mlx4_gen_slave_eqe(struct work_struct *work) struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq; struct mlx4_eqe *eqe; u8 slave; - int i; + int i, phys_port, slave_port; for (eqe = next_slave_event_eqe(slave_eq); eqe; eqe = next_slave_event_eqe(slave_eq)) { slave = eqe->slave_id; + if (eqe->type == MLX4_EVENT_TYPE_PORT_CHANGE && + eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN && + mlx4_is_bonded(dev)) { + struct mlx4_port_cap port_cap; + + if (!mlx4_QUERY_PORT(dev, 1, &port_cap) && port_cap.link_state) + goto consume; + + if (!mlx4_QUERY_PORT(dev, 2, &port_cap) && port_cap.link_state) + goto consume; + } /* All active slaves need to receive the event */ if (slave == ALL_SLAVES) { - for (i = 0; i < dev->num_slaves; i++) { + for (i = 0; i <= dev->persist->num_vfs; i++) { + phys_port = 0; + if (eqe->type == MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT && + eqe->subtype == MLX4_DEV_PMC_SUBTYPE_PORT_INFO) { + phys_port = eqe->event.port_mgmt_change.port; + slave_port = mlx4_phys_to_slave_port(dev, i, phys_port); + if (slave_port < 0) /* VF doesn't have this port */ + continue; + eqe->event.port_mgmt_change.port = slave_port; + } if (mlx4_GEN_EQE(dev, i, eqe)) - mlx4_warn(dev, "Failed to generate " - "event for slave %d\n", i); + mlx4_warn(dev, "Failed to generate event for slave %d\n", + i); + if (phys_port) + eqe->event.port_mgmt_change.port = phys_port; } } else { if (mlx4_GEN_EQE(dev, slave, eqe)) - mlx4_warn(dev, "Failed to generate event " - "for slave %d\n", slave); + mlx4_warn(dev, "Failed to generate event for slave %d\n", + slave); } +consume: ++slave_eq->cons; } } @@ -175,13 +202,13 @@ static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe) s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)]; if ((!!(s_eqe->owner & 0x80)) ^ (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) { - mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. " - "No free EQE on slave events queue\n", slave); + mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. 
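next_eqe_sw() above, which now takes the EQE size explicitly, reports an entry as valid only when the EQE ownership bit agrees with the parity of the current pass over the ring, taken from cons_index & nent with nent a power of two. The test itself reduces to the following sketch (names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/*
 * nent is a power of two; cons_index counts forever, so (cons_index & nent)
 * flips every time the consumer wraps around the ring.  The driver treats the
 * entry as software-owned exactly when that parity agrees with the top bit of
 * the owner byte written by hardware.
 */
static bool eqe_owned_by_sw(uint8_t owner, uint32_t cons_index, uint32_t nent)
{
        const bool hw_parity = (owner & 0x80) != 0;
        const bool sw_parity = (cons_index & nent) != 0;

        return hw_parity == sw_parity;
}

int main(void)
{
        /* consumer on its second pass (cons_index & nent != 0) accepts a set owner bit */
        return eqe_owned_by_sw(0x80, 1024, 1024) ? 0 : 1;
}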
No free EQE on slave events queue\n", + slave); spin_unlock_irqrestore(&slave_eq->event_lock, flags); return; } - memcpy(s_eqe, eqe, dev->caps.eqe_size - 1); + memcpy(s_eqe, eqe, sizeof(struct mlx4_eqe) - 1); s_eqe->slave_id = slave; /* ensure all information is written before setting the ownersip bit */ wmb(); @@ -198,16 +225,26 @@ static void mlx4_slave_event(struct mlx4_dev *dev, int slave, { struct mlx4_priv *priv = mlx4_priv(dev); - if (slave < 0 || slave >= dev->num_slaves || - slave == dev->caps.function) - return; - - if (!priv->mfunc.master.slave_state[slave].active) + if (slave < 0 || slave > dev->persist->num_vfs || + slave == dev->caps.function || + !priv->mfunc.master.slave_state[slave].active) return; slave_event(dev, slave, eqe); } +static void mlx4_set_eq_affinity_hint(struct mlx4_priv *priv, int vec) +{ + int hint_err; + struct mlx4_dev *dev = &priv->dev; + struct mlx4_eq *eq = &priv->eq_table.eq[vec]; + + hint_err = bind_irq_to_cpu(eq->irq, eq->affinity_cpu_id); + + if (hint_err) + mlx4_warn(dev, "bind_irq_to_cpu failed, err %d\n", hint_err); +} + int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port) { struct mlx4_eqe eqe; @@ -222,7 +259,7 @@ int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port) eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE; - eqe.event.port_mgmt_change.port = port; + eqe.event.port_mgmt_change.port = mlx4_phys_to_slave_port(dev, slave, port); return mlx4_GEN_EQE(dev, slave, &eqe); } @@ -233,13 +270,13 @@ int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) struct mlx4_eqe eqe; /*don't send if we don't have the that slave */ - if (dev->num_vfs < slave) + if (dev->persist->num_vfs < slave) return 0; memset(&eqe, 0, sizeof eqe); eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO; - eqe.event.port_mgmt_change.port = port; + eqe.event.port_mgmt_change.port = mlx4_phys_to_slave_port(dev, slave, port); return mlx4_GEN_EQE(dev, slave, &eqe); } @@ -249,15 +286,16 @@ int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change) { struct mlx4_eqe eqe; + u8 slave_port = mlx4_phys_to_slave_port(dev, slave, port); /*don't send if we don't have the that slave */ - if (dev->num_vfs < slave) + if (dev->persist->num_vfs < slave) return 0; memset(&eqe, 0, sizeof eqe); eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE; eqe.subtype = port_subtype_change; - eqe.event.port_change.port = cpu_to_be32(port << 28); + eqe.event.port_change.port = cpu_to_be32(slave_port << 28); mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__, port_subtype_change, slave, port); @@ -269,7 +307,10 @@ enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; - if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS) { + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { pr_err("%s: Error: asking for slave:%d, port:%d\n", __func__, slave, port); return SLAVE_PORT_DOWN; @@ -283,8 +324,10 @@ static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); - if (slave >= 
dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) { + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { pr_err("%s: Error: asking for slave:%d, port:%d\n", __func__, slave, port); return -1; @@ -298,9 +341,13 @@ static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) { int i; enum slave_port_gen_event gen_event; + struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev, + port); - for (i = 0; i < dev->num_slaves; i++) - set_and_calc_slave_port_state(dev, i, port, event, &gen_event); + for (i = 0; i < dev->persist->num_vfs + 1; i++) + if (test_bit(i, slaves_pport.slaves)) + set_and_calc_slave_port_state(dev, i, port, + event, &gen_event); } /************************************************************************** The function get as input the new event to that port, @@ -319,12 +366,14 @@ int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, struct mlx4_slave_state *ctx = NULL; unsigned long flags; int ret = -1; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); enum slave_port_state cur_state = mlx4_get_slave_port_state(dev, slave, port); *gen_event = SLAVE_PORT_GEN_EVENT_NONE; - if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) { + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { pr_err("%s: Error: asking for slave:%d, port:%d\n", __func__, slave, port); return ret; @@ -362,9 +411,9 @@ int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, } break; default: - pr_err("%s: BUG!!! UNKNOWN state: " - "slave:%d, port:%d\n", __func__, slave, port); - goto out; + pr_err("%s: BUG!!! UNKNOWN state: slave:%d, port:%d\n", + __func__, slave, port); + goto out; } ret = mlx4_get_slave_port_state(dev, slave, port); @@ -375,7 +424,7 @@ int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, EXPORT_SYMBOL(set_and_calc_slave_port_state); -int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr, u16 sm_lid, u8 sm_sl) +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr) { struct mlx4_eqe eqe; @@ -386,12 +435,6 @@ int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr, u16 sm_ eqe.event.port_mgmt_change.port = port; eqe.event.port_mgmt_change.params.port_info.changed_attr = cpu_to_be32((u32) attr); - if (attr & MSTR_SM_CHANGE_MASK) { - eqe.event.port_mgmt_change.params.port_info.mstr_sm_lid = - cpu_to_be16(sm_lid); - eqe.event.port_mgmt_change.params.port_info.mstr_sm_sl = - sm_sl; - } slave_event(dev, ALL_SLAVES, &eqe); return 0; @@ -418,10 +461,16 @@ void mlx4_master_handle_slave_flr(struct work_struct *work) for (i = 0 ; i < dev->num_slaves; i++) { if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { - mlx4_dbg(dev, "mlx4_handle_slave_flr: " - "clean slave: %d\n", i); - - mlx4_delete_all_resources_for_slave(dev, i); + mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n", + i); + /* In case of 'Reset flow' FLR can be generated for + * a slave before mlx4_load_one is done. + * make sure interface is up before trying to delete + * slave resources which weren't allocated yet. 
+ */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, i); /*return the slave to running mode*/ spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; @@ -431,8 +480,8 @@ void mlx4_master_handle_slave_flr(struct work_struct *work) err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) - mlx4_warn(dev, "Failed to notify FW on " - "FLR done (slave:%d)\n", i); + mlx4_warn(dev, "Failed to notify FW on FLR done (slave:%d)\n", + i); } } } @@ -441,7 +490,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_eqe *eqe; - int cqn; + int cqn = -1; int eqes_found = 0; int set_ci = 0; int port; @@ -453,8 +502,9 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) enum slave_port_gen_event gen_event; unsigned long flags; struct mlx4_vport_state *s_info; + int eqe_size = dev->caps.eqe_size; - while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor))) { + while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor, eqe_size))) { /* * Make sure we read EQ entry contents after we've * checked the ownership bit. @@ -483,9 +533,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, &slave); if (ret && ret != -ENOENT) { - mlx4_dbg(dev, "QP event %02x(%02x) on " - "EQ %d at index %u: could " - "not get slave id (%d)\n", + mlx4_dbg(dev, "QP event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", eqe->type, eqe->subtype, eq->eqn, eq->cons_index, ret); break; @@ -504,7 +552,6 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_SRQ_LIMIT: mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n", __func__); - /* fall through */ case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: if (mlx4_is_master(dev)) { /* forward only to slave owning the SRQ */ @@ -514,22 +561,20 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) & 0xffffff, &slave); if (ret && ret != -ENOENT) { - mlx4_warn(dev, "SRQ event %02x(%02x) " - "on EQ %d at index %u: could" - " not get slave id (%d)\n", + mlx4_warn(dev, "SRQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", eqe->type, eqe->subtype, eq->eqn, eq->cons_index, ret); break; } - mlx4_dbg(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n", - __func__, slave, - be32_to_cpu(eqe->event.srq.srqn), - eqe->type, eqe->subtype); + mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n", + __func__, slave, + be32_to_cpu(eqe->event.srq.srqn), + eqe->type, eqe->subtype); if (!ret && slave != dev->caps.function) { - mlx4_dbg(dev, "%s: sending event %02x(%02x) to slave:%d\n", - __func__, eqe->type, - eqe->subtype, slave); + mlx4_warn(dev, "%s: sending event %02x(%02x) to slave:%d\n", + __func__, eqe->type, + eqe->subtype, slave); mlx4_slave_event(dev, slave, eqe); break; } @@ -545,24 +590,35 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) be64_to_cpu(eqe->event.cmd.out_param)); break; - case MLX4_EVENT_TYPE_PORT_CHANGE: + case MLX4_EVENT_TYPE_PORT_CHANGE: { + struct mlx4_slaves_pport slaves_port; port = be32_to_cpu(eqe->event.port_change.port) >> 28; + slaves_port = mlx4_phys_to_slaves_pport(dev, port); if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) { mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN, port); mlx4_priv(dev)->sense.do_sense_port[port] = 1; if (!mlx4_is_master(dev)) break; - for (i = 0; i < 
dev->num_slaves; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; + i++) { + int reported_port = mlx4_is_bonded(dev) ? 1 : mlx4_phys_to_slave_port(dev, i, port); + + if (!test_bit(i, slaves_port.slaves) && !mlx4_is_bonded(dev)) + continue; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { if (i == mlx4_master_func_num(dev)) continue; - mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN" - " to slave: %d, port:%d\n", + mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN to slave: %d, port:%d\n", __func__, i, port); - s_info = &priv->mfunc.master.vf_oper[slave].vport[port].state; - if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) + s_info = &priv->mfunc.master.vf_oper[i].vport[port].state; + if (0 /*IFLA_VF_LINK_STATE_AUTO == s_info->link_state*/) { + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (reported_port << 28)); mlx4_slave_event(dev, i, eqe); + } } else { /* IB port */ set_and_calc_slave_port_state(dev, i, port, MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, @@ -571,6 +627,10 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) if (SLAVE_PORT_GEN_EVENT_DOWN == gen_event) { if (i == mlx4_master_func_num(dev)) continue; + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (mlx4_phys_to_slave_port(dev, i, port) << 28)); mlx4_slave_event(dev, i, eqe); } } @@ -583,12 +643,23 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) if (!mlx4_is_master(dev)) break; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) - for (i = 0; i < dev->num_slaves; i++) { + for (i = 0; + i < dev->persist->num_vfs + 1; + i++) { + int reported_port = mlx4_is_bonded(dev) ? 1 : mlx4_phys_to_slave_port(dev, i, port); + + if (!test_bit(i, slaves_port.slaves) && !mlx4_is_bonded(dev)) + continue; if (i == mlx4_master_func_num(dev)) continue; - s_info = &priv->mfunc.master.vf_oper[slave].vport[port].state; - if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) + s_info = &priv->mfunc.master.vf_oper[i].vport[port].state; + if (0 /*IFLA_VF_LINK_STATE_AUTO == s_info->link_state*/) { + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (reported_port << 28)); mlx4_slave_event(dev, i, eqe); + } } else /* IB port */ /* port-up event will be sent to a slave when the @@ -597,6 +668,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP); } break; + } case MLX4_EVENT_TYPE_CQ_ERROR: mlx4_warn(dev, "CQ %s on CQN %06x\n", @@ -609,11 +681,9 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff, &slave); if (ret && ret != -ENOENT) { - mlx4_dbg(dev, "CQ event %02x(%02x) on " - "EQ %d at index %u: could " - "not get slave id (%d)\n", - eqe->type, eqe->subtype, - eq->eqn, eq->cons_index, ret); + mlx4_dbg(dev, "CQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); break; } @@ -635,35 +705,27 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_OP_REQUIRED: atomic_inc(&priv->opreq_count); /* FW commands can't be executed from interrupt context - working in deferred task */ + * working in deferred task + */ queue_work(mlx4_wq, &priv->opreq_task); break; case MLX4_EVENT_TYPE_COMM_CHANNEL: if (!mlx4_is_master(dev)) { - mlx4_warn(dev, "Received comm channel event " - "for non master device\n"); 
+ mlx4_warn(dev, "Received comm channel event for non master device\n"); break; } - memcpy(&priv->mfunc.master.comm_arm_bit_vector, eqe->event.comm_channel_arm.bit_vec, sizeof eqe->event.comm_channel_arm.bit_vec); - - if (!queue_work(priv->mfunc.master.comm_wq, - &priv->mfunc.master.comm_work)) - mlx4_warn(dev, "Failed to queue comm channel work\n"); - - if (!queue_work(priv->mfunc.master.comm_wq, - &priv->mfunc.master.arm_comm_work)) - mlx4_warn(dev, "Failed to queue arm comm channel work\n"); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.comm_work); break; case MLX4_EVENT_TYPE_FLR_EVENT: flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id); if (!mlx4_is_master(dev)) { - mlx4_warn(dev, "Non-master function received" - "FLR event\n"); + mlx4_warn(dev, "Non-master function received FLR event\n"); break; } @@ -684,6 +746,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; } spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, + flr_slave); queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.slave_flr_event_work); break; @@ -692,22 +756,17 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) { if (mlx4_is_master(dev)) for (i = 0; i < dev->num_slaves; i++) { - mlx4_dbg(dev, "%s: Sending " - "MLX4_FATAL_WARNING_SUBTYPE_WARMING" - " to slave: %d\n", __func__, i); + mlx4_dbg(dev, "%s: Sending MLX4_FATAL_WARNING_SUBTYPE_WARMING to slave: %d\n", + __func__, i); if (i == dev->caps.function) continue; mlx4_slave_event(dev, i, eqe); } - mlx4_err(dev, "Temperature Threshold was reached! " - "Threshold: %d celsius degrees; " - "Current Temperature: %d\n", - be16_to_cpu(eqe->event.warming.warning_threshold), - be16_to_cpu(eqe->event.warming.current_temperature)); + mlx4_err(dev, "Temperature Threshold was reached! Threshold: %d celsius degrees; Current Temperature: %d\n", + be16_to_cpu(eqe->event.warming.warning_threshold), + be16_to_cpu(eqe->event.warming.current_temperature)); } else - mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), " - "subtype %02x on EQ %d at index %u. owner=%x, " - "nent=0x%x, slave=%x, ownership=%s\n", + mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), subtype %02x on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n", eqe->type, eqe->subtype, eq->eqn, eq->cons_index, eqe->owner, eq->nent, eqe->slave_id, @@ -731,9 +790,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_warn(dev, "Unsupported cable detected\n"); break; default: - mlx4_dbg(dev, "Unhandled recoverable error event " - "detected: %02x(%02x) on EQ %d at index %u. " - "owner=%x, nent=0x%x, ownership=%s\n", + mlx4_dbg(dev, + "Unhandled recoverable error event detected: %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, ownership=%s\n", eqe->type, eqe->subtype, eq->eqn, eq->cons_index, eqe->owner, eq->nent, !!(eqe->owner & 0x80) ^ @@ -745,9 +803,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: case MLX4_EVENT_TYPE_ECC_DETECT: default: - mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at " - "index %u. owner=%x, nent=0x%x, slave=%x, " - "ownership=%s\n", + mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u. 
owner=%x, nent=0x%x, slave=%x, ownership=%s\n", eqe->type, eqe->subtype, eq->eqn, eq->cons_index, eqe->owner, eq->nent, eqe->slave_id, @@ -847,12 +903,10 @@ static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, MLX4_CMD_WRAPPED); } -static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, - int eq_num) +static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, int eq_num) { - return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, - 0, MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A, - MLX4_CMD_WRAPPED); + return mlx4_cmd(dev, 0, eq_num, 1, MLX4_CMD_HW2SW_EQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } static int mlx4_num_eq_uar(struct mlx4_dev *dev) @@ -862,8 +916,8 @@ static int mlx4_num_eq_uar(struct mlx4_dev *dev) * we need to map, take the difference of highest index and * the lowest index we'll use and add 1. */ - return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs + - dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1; + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 - + dev->caps.reserved_eqs / 4 + 1; } static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) @@ -875,9 +929,9 @@ static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) if (!priv->eq_table.uar_map[index]) { priv->eq_table.uar_map[index] = - ioremap(pci_resource_start(dev->pdev, 2) + - ((eq->eqn / 4) << PAGE_SHIFT), - PAGE_SIZE); + ioremap(pci_resource_start(dev->persist->pdev, 2) + + ((eq->eqn / 4) << (dev->uar_page_shift)), + (1 << (dev->uar_page_shift))); if (!priv->eq_table.uar_map[index]) { mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n", eq->eqn); @@ -915,8 +969,10 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, eq->dev = dev; eq->nent = roundup_pow_of_two(max(nent, 2)); - /* CX3 is capable of extending the CQE\EQE from 32 to 64 bytes */ - npages = PAGE_ALIGN(eq->nent * (MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor)) / PAGE_SIZE; + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with + * strides of 64B,128B and 256B. 
+ */ + npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE; eq->page_list = kmalloc(npages * sizeof *eq->page_list, GFP_KERNEL); @@ -936,8 +992,10 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, eq_context = mailbox->buf; for (i = 0; i < npages; ++i) { - eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, - PAGE_SIZE, &t, GFP_KERNEL); + eq->page_list[i].buf = dma_alloc_coherent(&dev->persist-> + pdev->dev, + PAGE_SIZE, &t, + GFP_KERNEL); if (!eq->page_list[i].buf) goto err_out_free_pages; @@ -965,7 +1023,6 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, if (err) goto err_out_free_mtt; - memset(eq_context, 0, sizeof *eq_context); eq_context->flags = cpu_to_be32(MLX4_EQ_STATUS_OK | MLX4_EQ_STATE_ARMED); eq_context->log_eq_size = ilog2(eq->nent); @@ -998,7 +1055,7 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, err_out_free_pages: for (i = 0; i < npages; ++i) if (eq->page_list[i].buf) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, eq->page_list[i].buf, eq->page_list[i].map); @@ -1016,71 +1073,44 @@ static void mlx4_free_eq(struct mlx4_dev *dev, struct mlx4_eq *eq) { struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_cmd_mailbox *mailbox; int err; int i; - /* CX3 is capable of extending the CQE\EQE from 32 to 64 bytes */ - int npages = PAGE_ALIGN((MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor) * eq->nent) / PAGE_SIZE; + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with + * strides of 64B,128B and 256B + */ + int npages = PAGE_ALIGN(dev->caps.eqe_size * eq->nent) / PAGE_SIZE; - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return; - - err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn); + err = mlx4_HW2SW_EQ(dev, eq->eqn); if (err) mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err); - if (0) { - mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); - for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) { - if (i % 4 == 0) - pr_cont("[%02x] ", i * 4); - pr_cont(" %08x", be32_to_cpup(mailbox->buf + i * 4)); - if ((i + 1) % 4 == 0) - pr_cont("\n"); - } - } + synchronize_irq(eq->irq); mlx4_mtt_cleanup(dev, &eq->mtt); for (i = 0; i < npages; ++i) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, - eq->page_list[i].buf, - eq->page_list[i].map); + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); kfree(eq->page_list); mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); - mlx4_free_cmd_mailbox(dev, mailbox); } static void mlx4_free_irqs(struct mlx4_dev *dev) { struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table; - struct mlx4_priv *priv = mlx4_priv(dev); - int i, vec; + int i; if (eq_table->have_irq) - free_irq(dev->pdev->irq, dev); + free_irq(dev->persist->pdev->irq, dev); for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) { + eq_table->eq[i].affinity_cpu_id = NOCPU; free_irq(eq_table->eq[i].irq, eq_table->eq + i); eq_table->eq[i].have_irq = 0; } - for (i = 0; i < dev->caps.comp_pool; i++) { - /* - * Freeing the assigned irq's - * all bits should be 0, but we need to validate - */ - if (priv->msix_ctl.pool_bm & 1ULL << i) { - /* NO need protecting*/ - vec = dev->caps.num_comp_vectors + 1 + i; - free_irq(priv->eq_table.eq[vec].irq, - &priv->eq_table.eq[vec]); - } - } - - kfree(eq_table->irq_names); } @@ -1088,10 +1118,11 @@ static int mlx4_map_clr_int(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - priv->clr_base = 
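With eqe_factor gone, mlx4_create_eq() and mlx4_free_eq() above size the queue directly from dev->caps.eqe_size (32 or 64 bytes, larger with the EQE-stride support mentioned in the hunk). The number of DMA pages is simply the rounded-up entry count times the entry size, rounded up to whole pages; a quick model with the page size fixed at 4 KiB for illustration:

#include <stdio.h>

#define PAGE_SIZE_MODEL 4096u

static unsigned int roundup_pow_of_two_u(unsigned int v)
{
        unsigned int r = 1;

        while (r < v)
                r <<= 1;
        return r;
}

static unsigned int eq_npages(unsigned int nent, unsigned int eqe_size)
{
        unsigned int bytes = roundup_pow_of_two_u(nent) * eqe_size;

        return (bytes + PAGE_SIZE_MODEL - 1) / PAGE_SIZE_MODEL;
}

int main(void)
{
        /* 1024 entries: 8 pages with 32-byte EQEs, 16 pages with 64-byte EQEs */
        printf("%u %u\n", eq_npages(1024, 32), eq_npages(1024, 64));
        return 0;
}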
ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) + + priv->clr_base = ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clr_int_bar) + priv->fw.clr_int_base, MLX4_CLR_INT_SIZE); if (!priv->clr_base) { - mlx4_err(dev, "Couldn't map interrupt clear register, aborting.\n"); + mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n"); return -ENOMEM; } @@ -1137,11 +1168,11 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) } err = mlx4_bitmap_init(&priv->eq_table.bitmap, - roundup_pow_of_two(dev->caps.num_eqs), - dev->caps.num_eqs - 1, - dev->caps.reserved_eqs, - roundup_pow_of_two(dev->caps.num_eqs) - - dev->caps.num_eqs); + roundup_pow_of_two(dev->caps.num_eqs), + dev->caps.num_eqs - 1, + dev->caps.reserved_eqs, + roundup_pow_of_two(dev->caps.num_eqs) - + dev->caps.num_eqs); if (err) goto err_out_free; @@ -1160,109 +1191,107 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) } priv->eq_table.irq_names = - kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 + - dev->caps.comp_pool), + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1), GFP_KERNEL); if (!priv->eq_table.irq_names) { err = -ENOMEM; goto err_out_clr_int; } - for (i = 0; i < dev->caps.num_comp_vectors; ++i) { - err = mlx4_create_eq(dev, dev->quotas.cq + - MLX4_NUM_SPARE_EQE, - (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, - &priv->eq_table.eq[i]); - if (err) { - --i; - goto err_out_unmap; + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { + if (i == MLX4_EQ_ASYNC) { + err = mlx4_create_eq(dev, + MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE, + 0, &priv->eq_table.eq[MLX4_EQ_ASYNC]); + } else { + struct mlx4_eq *eq = &priv->eq_table.eq[i]; +#ifdef CONFIG_RFS_ACCEL + int port = find_first_bit(eq->actv_ports.ports, + dev->caps.num_ports) + 1; + + if (port <= dev->caps.num_ports) { + struct mlx4_port_info *info = + &mlx4_priv(dev)->port[port]; + + if (!info->rmap) { + info->rmap = alloc_irq_cpu_rmap( + mlx4_get_eqs_per_port(dev, port)); + if (!info->rmap) { + mlx4_warn(dev, "Failed to allocate cpu rmap\n"); + err = -ENOMEM; + goto err_out_unmap; + } + } + + err = irq_cpu_rmap_add( + info->rmap, eq->irq); + if (err) + mlx4_warn(dev, "Failed adding irq rmap\n"); + } +#endif + err = mlx4_create_eq(dev, dev->quotas.cq + + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? + i + 1 - !!(i > MLX4_EQ_ASYNC) : 0, + eq); } - } - - err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE, - (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0, - &priv->eq_table.eq[dev->caps.num_comp_vectors]); - if (err) - goto err_out_comp; - - /*if additional completion vectors poolsize is 0 this loop will not run*/ - for (i = dev->caps.num_comp_vectors + 1; - i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) { - - err = mlx4_create_eq(dev, dev->quotas.cq + - MLX4_NUM_SPARE_EQE, - (dev->flags & MLX4_FLAG_MSI_X) ? 
i : 0, - &priv->eq_table.eq[i]); - if (err) { - --i; + if (err) goto err_out_unmap; - } } - if (dev->flags & MLX4_FLAG_MSI_X) { const char *eq_name; - for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { - if (i < dev->caps.num_comp_vectors) { - snprintf(priv->eq_table.irq_names + - i * MLX4_IRQNAME_SIZE, - MLX4_IRQNAME_SIZE, - "mlx4-comp-%d@pci:%s", i, - pci_name(dev->pdev)); - } else { - snprintf(priv->eq_table.irq_names + - i * MLX4_IRQNAME_SIZE, - MLX4_IRQNAME_SIZE, - "mlx4-async@pci:%s", - pci_name(dev->pdev)); - } + snprintf(priv->eq_table.irq_names + + MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-async@pci:%s", + pci_name(dev->persist->pdev)); + eq_name = priv->eq_table.irq_names + + MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE; - eq_name = priv->eq_table.irq_names + - i * MLX4_IRQNAME_SIZE; - err = request_irq(priv->eq_table.eq[i].irq, - mlx4_msi_x_interrupt, 0, eq_name, - priv->eq_table.eq + i); - if (err) - goto err_out_async; + err = request_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq, + mlx4_msi_x_interrupt, 0, eq_name, + priv->eq_table.eq + MLX4_EQ_ASYNC); + if (err) + goto err_out_unmap; - priv->eq_table.eq[i].have_irq = 1; - } + priv->eq_table.eq[MLX4_EQ_ASYNC].have_irq = 1; } else { snprintf(priv->eq_table.irq_names, MLX4_IRQNAME_SIZE, DRV_NAME "@pci:%s", - pci_name(dev->pdev)); - err = request_irq(dev->pdev->irq, mlx4_interrupt, + pci_name(dev->persist->pdev)); + err = request_irq(dev->persist->pdev->irq, mlx4_interrupt, IRQF_SHARED, priv->eq_table.irq_names, dev); if (err) - goto err_out_async; + goto err_out_unmap; priv->eq_table.have_irq = 1; } err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); if (err) mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err); + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err); - for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) - eq_set_ci(&priv->eq_table.eq[i], 1); + /* arm ASYNC eq */ + eq_set_ci(&priv->eq_table.eq[MLX4_EQ_ASYNC], 1); return 0; -err_out_async: - mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]); - -err_out_comp: - i = dev->caps.num_comp_vectors - 1; - err_out_unmap: - while (i >= 0) { - mlx4_free_eq(dev, &priv->eq_table.eq[i]); - --i; + while (i > 0) + mlx4_free_eq(dev, &priv->eq_table.eq[--i]); +#ifdef CONFIG_RFS_ACCEL + for (i = 1; i <= dev->caps.num_ports; i++) { + if (mlx4_priv(dev)->port[i].rmap) { + free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap); + mlx4_priv(dev)->port[i].rmap = NULL; + } } +#endif mlx4_free_irqs(dev); err_out_clr_int: @@ -1285,11 +1314,19 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) int i; mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1, - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); +#ifdef CONFIG_RFS_ACCEL + for (i = 1; i <= dev->caps.num_ports; i++) { + if (mlx4_priv(dev)->port[i].rmap) { + free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap); + mlx4_priv(dev)->port[i].rmap = NULL; + } + } +#endif mlx4_free_irqs(dev); - for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) mlx4_free_eq(dev, &priv->eq_table.eq[i]); if (!mlx4_is_slave(dev)) @@ -1301,108 +1338,200 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) kfree(priv->eq_table.uar_map); } -/* A test that verifies that we can accept interrupts on all - * the irq vectors of the device. 
+/* A test that verifies that we can accept interrupts + * on the vector allocated for asynchronous events + */ +int mlx4_test_async(struct mlx4_dev *dev) +{ + return mlx4_NOP(dev); +} +EXPORT_SYMBOL(mlx4_test_async); + +/* A test that verifies that we can accept interrupts + * on the given irq vector of the tested port. * Interrupts are checked using the NOP command. */ -int mlx4_test_interrupts(struct mlx4_dev *dev) +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector) { struct mlx4_priv *priv = mlx4_priv(dev); - int i; int err; - err = mlx4_NOP(dev); - /* When not in MSI_X, there is only one irq to check */ - if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev)) - return err; + /* Temporary use polling for command completions */ + mlx4_cmd_use_polling(dev); - /* A loop over all completion vectors, for each vector we will check - * whether it works by mapping command completions to that vector - * and performing a NOP command - */ - for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) { - /* Temporary use polling for command completions */ - mlx4_cmd_use_polling(dev); - - /* Map the new eq to handle all asyncronous events */ - err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, - priv->eq_table.eq[i].eqn); - if (err) { - mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); - mlx4_cmd_use_events(dev); - break; - } - - /* Go back to using events */ - mlx4_cmd_use_events(dev); - err = mlx4_NOP(dev); + /* Map the new eq to handle all asynchronous events */ + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn); + if (err) { + mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); + goto out; } + /* Go back to using events */ + mlx4_cmd_use_events(dev); + err = mlx4_NOP(dev); + /* Return to default */ + mlx4_cmd_use_polling(dev); +out: mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + mlx4_cmd_use_events(dev); + return err; } -EXPORT_SYMBOL(mlx4_test_interrupts); +EXPORT_SYMBOL(mlx4_test_interrupt); -int mlx4_assign_eq(struct mlx4_dev *dev, char* name, int * vector) +bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector) { - struct mlx4_priv *priv = mlx4_priv(dev); - int vec = 0, err = 0, i; + + vector = MLX4_CQ_TO_EQ_VECTOR(vector); + if (vector < 0 || (vector >= dev->caps.num_comp_vectors + 1) || + (vector == MLX4_EQ_ASYNC)) + return false; + + return test_bit(port - 1, priv->eq_table.eq[vector].actv_ports.ports); +} +EXPORT_SYMBOL(mlx4_is_eq_vector_valid); + +u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned int i; + unsigned int sum = 0; + + for (i = 0; i < dev->caps.num_comp_vectors + 1; i++) + sum += !!test_bit(port - 1, + priv->eq_table.eq[i].actv_ports.ports); + + return sum; +} +EXPORT_SYMBOL(mlx4_get_eqs_per_port); + +int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + vector = MLX4_CQ_TO_EQ_VECTOR(vector); + if (vector <= 0 || (vector >= dev->caps.num_comp_vectors + 1)) + return -EINVAL; + + return !!(bitmap_weight(priv->eq_table.eq[vector].actv_ports.ports, + dev->caps.num_ports) > 1); +} +EXPORT_SYMBOL(mlx4_is_eq_shared); + +int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err = 0, i = 0; + u32 min_ref_count_val = (u32)-1; + int requested_vector = MLX4_CQ_TO_EQ_VECTOR(*vector); + int *prequested_vector = NULL; + 
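When the caller of mlx4_assign_eq() did not ask for a specific usable vector, the body that follows picks, among the completion EQs serving the given port, the one with the fewest CQs already attached (smallest ref_count). That selection is a plain argmin, sketched here with fixed-size arrays; the structure and names are illustrative only:

#include <stdio.h>

#define NUM_EQS_MODEL 4

struct eq_model {
        int serves_port;        /* does this EQ serve the port in question? */
        unsigned int ref_count; /* CQs already bound to this EQ */
};

/* returns the index of the least-loaded EQ serving the port, or -1 */
static int pick_least_loaded_eq(const struct eq_model eqs[NUM_EQS_MODEL])
{
        unsigned int best_ref = (unsigned int)-1;
        int best = -1;
        int i;

        for (i = 0; i < NUM_EQS_MODEL; i++) {
                if (eqs[i].serves_port && eqs[i].ref_count < best_ref) {
                        best_ref = eqs[i].ref_count;
                        best = i;
                }
        }
        return best;
}

int main(void)
{
        struct eq_model eqs[NUM_EQS_MODEL] = {
                { 1, 3 }, { 1, 1 }, { 0, 0 }, { 1, 2 },
        };

        printf("selected EQ %d\n", pick_least_loaded_eq(eqs));  /* prints 1 */
        return 0;
}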
mutex_lock(&priv->msix_ctl.pool_lock); - for (i = 0; !vec && i < dev->caps.comp_pool; i++) { - if (~priv->msix_ctl.pool_bm & 1ULL << i) { - priv->msix_ctl.pool_bm |= 1ULL << i; - vec = dev->caps.num_comp_vectors + 1 + i; - snprintf(priv->eq_table.irq_names + - vec * MLX4_IRQNAME_SIZE, - MLX4_IRQNAME_SIZE, "%s", name); - err = request_irq(priv->eq_table.eq[vec].irq, - mlx4_msi_x_interrupt, 0, - &priv->eq_table.irq_names[vec<<5], - priv->eq_table.eq + vec); - if (err) { - /*zero out bit by fliping it*/ - priv->msix_ctl.pool_bm ^= 1 << i; - vec = 0; - continue; - /*we dont want to break here*/ + if (requested_vector < (dev->caps.num_comp_vectors + 1) && + (requested_vector >= 0) && + (requested_vector != MLX4_EQ_ASYNC)) { + if (test_bit(port - 1, + priv->eq_table.eq[requested_vector].actv_ports.ports)) { + prequested_vector = &requested_vector; + } else { + struct mlx4_eq *eq; + + for (i = 1; i < port; + requested_vector += mlx4_get_eqs_per_port(dev, i++)) + ; + + eq = &priv->eq_table.eq[requested_vector]; + if (requested_vector < dev->caps.num_comp_vectors + 1 && + test_bit(port - 1, eq->actv_ports.ports)) { + prequested_vector = &requested_vector; } - eq_set_ci(&priv->eq_table.eq[vec], 1); } } + + if (!prequested_vector) { + requested_vector = -1; + for (i = 0; min_ref_count_val && i < dev->caps.num_comp_vectors + 1; + i++) { + struct mlx4_eq *eq = &priv->eq_table.eq[i]; + + if (min_ref_count_val > eq->ref_count && + test_bit(port - 1, eq->actv_ports.ports)) { + min_ref_count_val = eq->ref_count; + requested_vector = i; + } + } + + if (requested_vector < 0) { + err = -ENOSPC; + goto err_unlock; + } + + prequested_vector = &requested_vector; + } + + if (!test_bit(*prequested_vector, priv->msix_ctl.pool_bm) && + dev->flags & MLX4_FLAG_MSI_X) { + set_bit(*prequested_vector, priv->msix_ctl.pool_bm); + snprintf(priv->eq_table.irq_names + + *prequested_vector * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, "mlx4-%d@%s", + *prequested_vector, dev_name(&dev->persist->pdev->dev)); + + err = request_irq(priv->eq_table.eq[*prequested_vector].irq, + mlx4_msi_x_interrupt, 0, + &priv->eq_table.irq_names[*prequested_vector << 5], + priv->eq_table.eq + *prequested_vector); + + if (err) { + clear_bit(*prequested_vector, priv->msix_ctl.pool_bm); + *prequested_vector = -1; + } else { + mlx4_set_eq_affinity_hint(priv, *prequested_vector); + eq_set_ci(&priv->eq_table.eq[*prequested_vector], 1); + priv->eq_table.eq[*prequested_vector].have_irq = 1; + } + } + + if (!err && *prequested_vector >= 0) + priv->eq_table.eq[*prequested_vector].ref_count++; + +err_unlock: mutex_unlock(&priv->msix_ctl.pool_lock); - if (vec) { - *vector = vec; - } else { + if (!err && *prequested_vector >= 0) + *vector = MLX4_EQ_TO_CQ_VECTOR(*prequested_vector); + else *vector = 0; - err = (i == dev->caps.comp_pool) ? 
-ENOSPC : err; - } + return err; } EXPORT_SYMBOL(mlx4_assign_eq); +int mlx4_eq_get_irq(struct mlx4_dev *dev, int cq_vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq_vec)].irq; +} +EXPORT_SYMBOL(mlx4_eq_get_irq); + void mlx4_release_eq(struct mlx4_dev *dev, int vec) { struct mlx4_priv *priv = mlx4_priv(dev); - /*bm index*/ - int i = vec - dev->caps.num_comp_vectors - 1; + int eq_vec = MLX4_CQ_TO_EQ_VECTOR(vec); - if (likely(i >= 0)) { - /*sanity check , making sure were not trying to free irq's - Belonging to a legacy EQ*/ - mutex_lock(&priv->msix_ctl.pool_lock); - if (priv->msix_ctl.pool_bm & 1ULL << i) { - free_irq(priv->eq_table.eq[vec].irq, - &priv->eq_table.eq[vec]); - priv->msix_ctl.pool_bm &= ~(1ULL << i); - } - mutex_unlock(&priv->msix_ctl.pool_lock); - } + mutex_lock(&priv->msix_ctl.pool_lock); + priv->eq_table.eq[eq_vec].ref_count--; + /* once we allocated EQ, we don't release it because it might be binded + * to cpu_rmap. + */ + mutex_unlock(&priv->msix_ctl.pool_lock); } EXPORT_SYMBOL(mlx4_release_eq); diff --git a/sys/dev/mlx4/mlx4_core/mlx4_fw.c b/sys/dev/mlx4/mlx4_core/mlx4_fw.c index 5d137d1637f4..edd76e924e2b 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_fw.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_fw.c @@ -39,6 +39,8 @@ #include #include +#include + #include "fw.h" #include "icm.h" @@ -51,18 +53,21 @@ enum { extern void __buggy_use_of_MLX4_GET(void); extern void __buggy_use_of_MLX4_PUT(void); -static u8 enable_qos; -module_param(enable_qos, byte, 0444); -MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)"); +static bool enable_qos; +module_param(enable_qos, bool, 0444); +MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)"); #define MLX4_GET(dest, source, offset) \ do { \ void *__p = (char *) (source) + (offset); \ + typedef struct { u64 value; } __packed u64_p_t; \ + u64 val; \ switch (sizeof (dest)) { \ case 1: (dest) = *(u8 *) __p; break; \ case 2: (dest) = be16_to_cpup(__p); break; \ case 4: (dest) = be32_to_cpup(__p); break; \ - case 8: (dest) = be64_to_cpup(__p); break; \ + case 8: val = ((u64_p_t *)__p)->value; \ + (dest) = be64_to_cpu(val); break; \ default: __buggy_use_of_MLX4_GET(); \ } \ } while (0) @@ -86,14 +91,11 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [ 1] = "UC transport", [ 2] = "UD transport", [ 3] = "XRC transport", - [ 4] = "reliable multicast", - [ 5] = "FCoIB support", [ 6] = "SRQ support", [ 7] = "IPoIB checksum offload", [ 8] = "P_Key violation counter", [ 9] = "Q_Key violation counter", - [10] = "VMM", - [12] = "DPDP", + [12] = "Dual Port Different Protocol (DPDP) support", [15] = "Big LSO headers", [16] = "MW support", [17] = "APM support", @@ -101,19 +103,19 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [19] = "Raw multicast support", [20] = "Address vector port checking support", [21] = "UD multicast support", - [24] = "Demand paging support", - [25] = "Router support", [30] = "IBoE support", [32] = "Unicast loopback support", [34] = "FCS header control", - [38] = "Wake On LAN support", + [37] = "Wake On LAN (port1) support", + [38] = "Wake On LAN (port2) support", [40] = "UDP RSS support", [41] = "Unicast VEP steering support", [42] = "Multicast VEP steering support", - [44] = "Cross-channel (sync_qp) operations support", [48] = "Counters support", + [52] = "RSS IP fragments support", + [53] = "Port ETS Scheduler support", + [55] = "Port link type sensing support", [59] = "Port management change 
event support", - [60] = "eSwitch support", [61] = "64 byte EQE support", [62] = "64 byte CQE support", }; @@ -131,18 +133,39 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [0] = "RSS support", [1] = "RSS Toeplitz Hash Function support", [2] = "RSS XOR Hash Function support", - [3] = "Device manage flow steering support", - [4] = "FSM (MAC unti-spoofing) support", - [5] = "VST (control vlan insertion/stripping) support", - [6] = "Dynamic QP updates support", - [7] = "Loopback source checks support", - [8] = "Device managed flow steering IPoIB support", - [9] = "ETS configuration support", - [10] = "ETH backplane autoneg report", - [11] = "Ethernet Flow control statistics support", - [12] = "Recoverable error events support", - [13] = "Time stamping support", - [14] = "Report driver version to FW support" + [3] = "Device managed flow steering support", + [4] = "Automatic MAC reassignment support", + [5] = "Time stamping support", + [6] = "VST (control vlan insertion/stripping) support", + [7] = "FSM (MAC anti-spoofing) support", + [8] = "Dynamic QP updates support", + [9] = "Device managed flow steering IPoIB support", + [10] = "TCP/IP offloads/flow-steering for VXLAN support", + [11] = "MAD DEMUX (Secure-Host) support", + [12] = "Large cache line (>64B) CQE stride support", + [13] = "Large cache line (>64B) EQE stride support", + [14] = "Ethernet protocol control support", + [15] = "Ethernet Backplane autoneg support", + [16] = "CONFIG DEV support", + [17] = "Asymmetric EQs support", + [18] = "More than 80 VFs support", + [19] = "Performance optimized for limited rule configuration flow steering support", + [20] = "Recoverable error events support", + [21] = "Port Remap support", + [22] = "QCN support", + [23] = "QP rate limiting support", + [24] = "Ethernet Flow control statistics support", + [25] = "Granular QoS per VF support", + [26] = "Port ETS Scheduler support", + [27] = "Port beacon support", + [28] = "RX-ALL support", + [29] = "802.1ad offload support", + [31] = "Modifying loopback source checks using UPDATE_QP support", + [32] = "Loopback source checks support", + [33] = "RoCEv2 support", + [34] = "DMFS Sniffer support (UC & MC)", + [35] = "QinQ VST mode support", + [36] = "sl to vl mapping table change event support" }; int i; @@ -167,8 +190,6 @@ int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) return PTR_ERR(mailbox); inbox = mailbox->buf; - memset(inbox, 0, MOD_STAT_CFG_IN_SIZE); - MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET); MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET); @@ -204,11 +225,11 @@ int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave) in_modifier = slave; err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, 0, - MLX4_CMD_QUERY_FUNC, - MLX4_CMD_TIME_CLASS_A, - MLX4_CMD_NATIVE); + MLX4_CMD_QUERY_FUNC, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); if (err) - goto out; + goto out; MLX4_GET(field, outbox, QUERY_FUNC_BUS_OFFSET); func->bus = field & 0xf; @@ -226,13 +247,80 @@ int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave) func->rsvd_uars = field & 0x0f; mlx4_dbg(dev, "Bus: %d, Device: %d, Function: %d, Physical function: %d, Max EQs: %d, Reserved EQs: %d, Reserved UARs: %d\n", - func->bus, func->device, func->function, func->physical_function, - func->max_eq, func->rsvd_eqs, func->rsvd_uars); + func->bus, func->device, func->function, func->physical_function, + func->max_eq, func->rsvd_eqs, func->rsvd_uars); + out: mlx4_free_cmd_mailbox(dev, 
mailbox); return err; } +static int mlx4_activate_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (vp_admin->default_vlan != vp_oper->state.default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, + &vp_oper->vlan_idx); + if (err) { + vp_oper->vlan_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + + return 0; +} + +static int mlx4_handle_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + slave_state = &priv->mfunc.master.slave_state[slave]; + + if ((vp_admin->vlan_proto != htons(ETH_P_8021AD)) || + (!slave_state->active)) + return 0; + + if (vp_oper->state.vlan_proto == vp_admin->vlan_proto && + vp_oper->state.default_vlan == vp_admin->default_vlan && + vp_oper->state.default_qos == vp_admin->default_qos) + return 0; + + if (!slave_state->vst_qinq_supported) { + /* Warn and revert the request to set vst QinQ mode */ + vp_admin->vlan_proto = vp_oper->state.vlan_proto; + vp_admin->default_vlan = vp_oper->state.default_vlan; + vp_admin->default_qos = vp_oper->state.default_qos; + + mlx4_warn(&priv->dev, + "Slave %d does not support VST QinQ mode\n", slave); + return 0; + } + + err = mlx4_activate_vst_qinq(priv, slave, port); + return err; +} + int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -241,7 +329,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, { struct mlx4_priv *priv = mlx4_priv(dev); u8 field, port; - u32 size; + u32 size, proxy_qp, qkey; int err = 0; struct mlx4_func func; @@ -257,6 +345,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP 0x28 #define QUERY_FUNC_CAP_MAX_EQ_OFFSET 0x2c #define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET 0x30 +#define QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET 0x48 #define QUERY_FUNC_CAP_QP_QUOTA_OFFSET 0x50 #define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET 0x54 @@ -271,46 +360,66 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAG_RDMA 0x40 #define QUERY_FUNC_CAP_FLAG_ETH 0x80 #define QUERY_FUNC_CAP_FLAG_QUOTAS 0x10 +#define QUERY_FUNC_CAP_FLAG_RESD_LKEY 0x08 #define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX 0x04 #define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG (1UL << 31) +#define QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG (1UL << 30) /* when opcode modifier = 1 */ #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 +#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET 0x4 #define QUERY_FUNC_CAP_FLAGS0_OFFSET 0x8 #define QUERY_FUNC_CAP_FLAGS1_OFFSET 0xc -#define QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET 0xd #define QUERY_FUNC_CAP_QP0_TUNNEL 0x10 #define QUERY_FUNC_CAP_QP0_PROXY 0x14 #define QUERY_FUNC_CAP_QP1_TUNNEL 0x18 #define 
QUERY_FUNC_CAP_QP1_PROXY 0x1c +#define QUERY_FUNC_CAP_PHYS_PORT_ID 0x28 -#define QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC 0x40 -#define QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN 0x80 -#define QUERY_FUNC_CAP_PROPS_DEF_COUNTER 0x20 +#define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC 0x40 +#define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN 0x80 +#define QUERY_FUNC_CAP_FLAGS1_NIC_INFO 0x10 +#define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 -#define QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID 0x80 -#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31) +#define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 +#define QUERY_FUNC_CAP_PHV_BIT 0x40 +#define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE 0x20 + +#define QUERY_FUNC_CAP_SUPPORTS_VST_QINQ BIT(30) +#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS BIT(31) if (vhcr->op_modifier == 1) { - port = vhcr->in_modifier; /* phys-port = logical-port */ - MLX4_PUT(outbox->buf, port, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, slave); + int converted_port = mlx4_slave_convert_port( + dev, slave, vhcr->in_modifier); + struct mlx4_vport_oper_state *vp_oper; - field = 0; - /* ensure that phy_wqe_gid bit is not set */ - MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET); + if (converted_port < 0) + return -EINVAL; - /* ensure force vlan and force mac bits are not set - * and that default counter bit is set - */ - field = QUERY_FUNC_CAP_PROPS_DEF_COUNTER; /* def counter */ + vhcr->in_modifier = converted_port; + /* phys-port = logical-port */ + field = vhcr->in_modifier - + find_first_bit(actv_ports.ports, dev->caps.num_ports); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + + port = vhcr->in_modifier; + proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; + + /* Set nic_info bit to mark new fields support */ + field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; + + if (mlx4_vf_smi_enabled(dev, slave, port) && + !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) { + field |= QUERY_FUNC_CAP_VF_ENABLE_QP0; + MLX4_PUT(outbox->buf, qkey, + QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + } MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); - /* There is always default counter legal or sink counter */ - field = mlx4_get_default_counter_index(dev, slave, vhcr->in_modifier); - MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET); - /* size is now the QP number */ size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); @@ -318,19 +427,42 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, size += 2; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); - size = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; - MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_PROXY); + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY); + proxy_qp += 2; + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY); - size += 2; - MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_PROXY); + MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], + QUERY_FUNC_CAP_PHYS_PORT_ID); + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + err = mlx4_handle_vst_qinq(priv, slave, port); + if (err) + return err; + + field = 0; + if (dev->caps.phv_bit[port]) + field |= QUERY_FUNC_CAP_PHV_BIT; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + field |= QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE; + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET); } else if (vhcr->op_modifier == 0) { - /* enable rdma and ethernet interfaces, and 
new quota locations */ + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, slave); + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; + + /* enable rdma and ethernet interfaces, new quota locations, + * and reserved lkey + */ field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA | - QUERY_FUNC_CAP_FLAG_QUOTAS | QUERY_FUNC_CAP_FLAG_VALID_MAILBOX); + QUERY_FUNC_CAP_FLAG_QUOTAS | QUERY_FUNC_CAP_FLAG_VALID_MAILBOX | + QUERY_FUNC_CAP_FLAG_RESD_LKEY); MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET); - field = dev->caps.num_ports; + field = min( + bitmap_weight(actv_ports.ports, dev->caps.num_ports), + dev->caps.num_ports); MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); size = dev->caps.function_caps; /* set PF behaviours */ @@ -387,27 +519,37 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); - size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG; + size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG | + QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG; MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET); + + size = dev->caps.reserved_lkey + ((slave << 8) & 0xFF00); + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET); + + if (vhcr->in_modifier & QUERY_FUNC_CAP_SUPPORTS_VST_QINQ) + slave_state->vst_qinq_supported = true; + } else err = -EINVAL; return err; } -int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, struct mlx4_func_cap *func_cap) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field, op_modifier; - u32 size; + u32 size, qkey; int err = 0, quotas = 0; u32 in_modifier; + u32 slave_caps; op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ - in_modifier = op_modifier ? gen_or_port : + slave_caps = QUERY_FUNC_CAP_SUPPORTS_VST_QINQ | QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS; + in_modifier = op_modifier ? 
gen_or_port : slave_caps; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -481,6 +623,13 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); func_cap->reserved_eq = size & 0xFFFFFF; + if (func_cap->flags & QUERY_FUNC_CAP_FLAG_RESD_LKEY) { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET); + func_cap->reserved_lkey = size; + } else { + func_cap->reserved_lkey = 0; + } + func_cap->extra_flags = 0; /* Mailbox data from 0x6c and onward should only be treated if @@ -490,6 +639,8 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET); if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG) func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP; + if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG) + func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_A0_RES_QP; } goto out; @@ -501,24 +652,23 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, goto out; } + MLX4_GET(func_cap->flags1, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) { - MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); - if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_VLAN) { + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) { mlx4_err(dev, "VLAN is enforced on this port\n"); err = -EPROTONOSUPPORT; goto out; } - if (field & QUERY_FUNC_CAP_ETH_PROPS_FORCE_MAC) { + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_MAC) { mlx4_err(dev, "Force mac is enabled on this port\n"); err = -EPROTONOSUPPORT; goto out; } } else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) { MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); - if (field & QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID) { - mlx4_err(dev, "phy_wqe_gid is " - "enforced on this ib port\n"); + if (field & QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID) { + mlx4_err(dev, "phy_wqe_gid is enforced on this ib port\n"); err = -EPROTONOSUPPORT; goto out; } @@ -531,12 +681,11 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, goto out; } - MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); - if (field & QUERY_FUNC_CAP_PROPS_DEF_COUNTER) { - MLX4_GET(field, outbox, QUERY_FUNC_CAP_COUNTER_INDEX_OFFSET); - func_cap->def_counter_index = field; + if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) { + MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + func_cap->qp0_qkey = qkey; } else { - func_cap->def_counter_index = MLX4_SINK_COUNTER_INDEX; + func_cap->qp0_qkey = 0; } MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); @@ -551,6 +700,12 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY); func_cap->qp1_proxy_qpn = size & 0xFFFFFF; + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_NIC_INFO) + MLX4_GET(func_cap->phys_port_id, outbox, + QUERY_FUNC_CAP_PHYS_PORT_ID); + + MLX4_GET(func_cap->flags0, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); + /* All other resources are allocated by the master, but we still report * 'num' and 'reserved' capabilities as follows: * - num remains the maximum resource index @@ -564,6 +719,8 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, return err; } +static void disable_unsupported_roce_caps(void *buf); + int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { struct mlx4_cmd_mailbox *mailbox; @@ -602,6 +759,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define 
QUERY_DEV_CAP_RSS_OFFSET 0x2e #define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33 +#define QUERY_DEV_CAP_PORT_BEACON_OFFSET 0x34 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35 #define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36 #define QUERY_DEV_CAP_VL_PORT_OFFSET 0x37 @@ -611,7 +769,6 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e #define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 -#define QUERY_DEV_CAP_SYNC_QP_OFFSET 0x42 #define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 @@ -624,6 +781,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 #define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 #define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET 0x5D #define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 #define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 #define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 @@ -631,13 +789,15 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 #define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 #define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 -#define QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET 0x68 -#define QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET 0x6c +#define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET 0x68 #define QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET 0x70 -#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 -#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET 0x70 +#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET 0x70 #define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74 +#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 #define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 +#define QUERY_DEV_CAP_SL2VL_EVENT_OFFSET 0x78 +#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE 0x7a +#define QUERY_DEV_CAP_ECN_QCN_VER_OFFSET 0x7b #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 @@ -649,9 +809,21 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET 0x90 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 +#define QUERY_DEV_CAP_CONFIG_DEV_OFFSET 0x94 +#define QUERY_DEV_CAP_PHV_EN_OFFSET 0x96 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 -#define QUERY_DEV_CAP_ETS_CFG_OFFSET 0x9c #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 +#define QUERY_DEV_CAP_ETH_BACKPL_OFFSET 0x9c +#define QUERY_DEV_CAP_DIAG_RPRT_PER_PORT 0x9c +#define QUERY_DEV_CAP_FW_REASSIGN_MAC 0x9d +#define QUERY_DEV_CAP_VXLAN 0x9e +#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 +#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET 0xa8 +#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET 0xac +#define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET 0xcc +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET 0xd0 +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET 0xd2 + dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -664,6 +836,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (err) goto out; + if (mlx4_is_mfunc(dev)) + disable_unsupported_roce_caps(outbox); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET); dev_cap->reserved_qps = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET); @@ -681,17 +855,13 @@ int mlx4_QUERY_DEV_CAP(struct 
mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); dev_cap->max_mpts = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); - dev_cap->reserved_eqs = field & 0xf; + dev_cap->reserved_eqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); dev_cap->max_eqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); dev_cap->reserved_mtts = 1 << (field >> 4); - MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET); - dev_cap->max_mrw_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET); dev_cap->reserved_mrws = 1 << (field & 0xf); - MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET); - dev_cap->max_mtt_seg = 1 << (field & 0x3f); MLX4_GET(size, outbox, QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET); dev_cap->num_sys_eqs = size & 0xfff; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET); @@ -730,12 +900,23 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; + dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER; + MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_BEACON_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_BEACON; MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_IPOIB; - dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET); dev_cap->fs_max_num_qp_per_entry = field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SL2VL_EVENT_OFFSET); + if (field & (1 << 5)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT; + MLX4_GET(field, outbox, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + if (field & 0x1) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QCN; MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET); dev_cap->stat_rate_support = stat_rate; MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); @@ -744,8 +925,6 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); dev_cap->flags = flags | (u64)ext_flags << 32; - MLX4_GET(field, outbox, QUERY_DEV_CAP_SYNC_QP_OFFSET); - dev_cap->sync_qp = field & 0x10; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); @@ -761,11 +940,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) field = 3; dev_cap->bf_regs_per_page = 1 << (field & 0x3f); - mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", - dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); } else { dev_cap->bf_reg_size = 0; - mlx4_dbg(dev, "BlueFlame not available\n"); } MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET); @@ -773,6 +949,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); dev_cap->max_sq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET); + if (field & 0x1) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET); dev_cap->max_qp_per_mcg = 
1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET); @@ -819,54 +998,202 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->max_rq_sg = field; MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); dev_cap->max_rq_desc_sz = size; - + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + if (field & (1 << 4)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QOS_VPP; + if (field & (1 << 5)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL; + if (field & (1 << 6)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + if (field & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2; + if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV; + if (field & (1 << 2)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN; + if (field & 0x40) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN; + MLX4_GET(dev_cap->reserved_lkey, outbox, QUERY_DEV_CAP_RSVD_LKEY_OFFSET); - MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETS_CFG_OFFSET); + MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET); if (field32 & (1 << 0)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP; if (field32 & (1 << 7)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT; - if (field32 & (1 << 8)) - dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW; - if (field32 & (1 << 13)) + MLX4_GET(field32, outbox, QUERY_DEV_CAP_DIAG_RPRT_PER_PORT); + if (field32 & (1 << 17)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC); + if (field & (1 << 6)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN; + MLX4_GET(field, outbox, QUERY_DEV_CAP_VXLAN); + if (field & (1 << 3)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS; + if (field & (1 << 5)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG; - MLX4_GET(dev_cap->max_icm_sz, outbox, QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET); if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS) - MLX4_GET(dev_cap->max_basic_counters, outbox, - QUERY_DEV_CAP_MAX_BASIC_COUNTERS_OFFSET); - /* FW reports 256 however real value is 255 */ - dev_cap->max_basic_counters = min_t(u32, dev_cap->max_basic_counters, 255); - if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) - MLX4_GET(dev_cap->max_extended_counters, outbox, - QUERY_DEV_CAP_MAX_EXTENDED_COUNTERS_OFFSET); + MLX4_GET(dev_cap->max_counters, outbox, + QUERY_DEV_CAP_MAX_COUNTERS_OFFSET); + + MLX4_GET(field32, outbox, + QUERY_DEV_CAP_MAD_DEMUX_OFFSET); + if (field32 & (1 << 0)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX; + + MLX4_GET(dev_cap->dmfs_high_rate_qpn_base, outbox, + QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET); + dev_cap->dmfs_high_rate_qpn_base &= MGM_QPN_MASK; + MLX4_GET(dev_cap->dmfs_high_rate_qpn_range, outbox, + QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET); + dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK; + + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET); + dev_cap->rl_caps.num_rates = size; + if (dev_cap->rl_caps.num_rates) { + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT; + MLX4_GET(size, 
outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET); + dev_cap->rl_caps.max_val = size & 0xfff; + dev_cap->rl_caps.max_unit = size >> 14; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET); + dev_cap->rl_caps.min_val = size & 0xfff; + dev_cap->rl_caps.min_unit = size >> 14; + } MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); if (field32 & (1 << 16)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; + if (field32 & (1 << 18)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB; if (field32 & (1 << 19)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_LB_SRC_CHK; - if (field32 & (1 << 20)) - dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM; if (field32 & (1 << 26)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL; + if (field32 & (1 << 20)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM; + if (field32 & (1 << 21)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_80_VFS; + + for (i = 1; i <= dev_cap->num_ports; i++) { + err = mlx4_QUERY_PORT(dev, i, dev_cap->port_cap + i); + if (err) + goto out; + } + + /* + * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then + * we can't use any EQs whose doorbell falls on that page, + * even if the EQ itself isn't reserved. + */ + if (dev_cap->num_sys_eqs == 0) + dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4, + dev_cap->reserved_eqs); + else + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SYS_EQS; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + if (dev_cap->bf_reg_size > 0) + mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", + dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); + else + mlx4_dbg(dev, "BlueFlame not available\n"); + + mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n", + dev_cap->bmme_flags, dev_cap->reserved_lkey); + mlx4_dbg(dev, "Max ICM size %lld MB\n", + (unsigned long long) dev_cap->max_icm_sz >> 20); + mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", + dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz); + mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", + dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz); + mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", + dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz); + mlx4_dbg(dev, "Num sys EQs: %d, max EQs: %d, reserved EQs: %d, entry size: %d\n", + dev_cap->num_sys_eqs, dev_cap->max_eqs, dev_cap->reserved_eqs, + dev_cap->eqc_entry_sz); + mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", + dev_cap->reserved_mrws, dev_cap->reserved_mtts); + mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", + dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars); + mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", + dev_cap->max_pds, dev_cap->reserved_mgms); + mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", + dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz); + mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n", + dev_cap->local_ca_ack_delay, 128 << dev_cap->port_cap[1].ib_mtu, + dev_cap->port_cap[1].max_port_width); + mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n", + dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg); + mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n", + dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg); + mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz); + mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters); + mlx4_dbg(dev, "Max RSS Table size: %d\n", 
dev_cap->max_rss_tbl_sz); + mlx4_dbg(dev, "DMFS high rate steer QPn base: %d\n", + dev_cap->dmfs_high_rate_qpn_base); + mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n", + dev_cap->dmfs_high_rate_qpn_range); + + if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT) { + struct mlx4_rate_limit_caps *rl_caps = &dev_cap->rl_caps; + + mlx4_dbg(dev, "QP Rate-Limit: #rates %d, unit/val max %d/%d, min %d/%d\n", + rl_caps->num_rates, rl_caps->max_unit, rl_caps->max_val, + rl_caps->min_unit, rl_caps->min_val); + } + + dump_dev_cap_flags(dev, dev_cap->flags); + dump_dev_cap_flags2(dev, dev_cap->flags2); +} + +int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field; + u32 field32; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { - for (i = 1; i <= dev_cap->num_ports; ++i) { - MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); - dev_cap->max_vl[i] = field >> 4; - MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); - dev_cap->ib_mtu[i] = field >> 4; - dev_cap->max_port_width[i] = field & 0xf; - MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET); - dev_cap->max_gids[i] = 1 << (field & 0xf); - MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET); - dev_cap->max_pkeys[i] = 1 << (field & 0xf); - } + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (err) + goto out; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); + port_cap->max_vl = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); + port_cap->ib_mtu = field >> 4; + port_cap->max_port_width = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET); + port_cap->max_gids = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET); + port_cap->max_pkeys = 1 << (field & 0xf); } else { #define QUERY_PORT_SUPPORTED_TYPE_OFFSET 0x00 #define QUERY_PORT_MTU_OFFSET 0x01 @@ -880,91 +1207,49 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_PORT_WAVELENGTH_OFFSET 0x1c #define QUERY_PORT_TRANS_CODE_OFFSET 0x20 - for (i = 1; i <= dev_cap->num_ports; ++i) { - err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); - if (err) - goto out; + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; - MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET); - dev_cap->supported_port_types[i] = field & 3; - dev_cap->suggested_type[i] = (field >> 3) & 1; - dev_cap->default_sense[i] = (field >> 4) & 1; - MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET); - dev_cap->ib_mtu[i] = field & 0xf; - MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET); - dev_cap->max_port_width[i] = field & 0xf; - MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET); - dev_cap->max_gids[i] = 1 << (field >> 4); - dev_cap->max_pkeys[i] = 1 << (field & 0xf); - MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET); - dev_cap->max_vl[i] = field & 0xf; - MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET); - dev_cap->log_max_macs[i] = field & 0xf; - dev_cap->log_max_vlans[i] = field >> 4; - MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET); - MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET); - MLX4_GET(field32, outbox, 
QUERY_PORT_TRANS_VENDOR_OFFSET); - dev_cap->trans_type[i] = field32 >> 24; - dev_cap->vendor_oui[i] = field32 & 0xffffff; - MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET); - MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET); - } + MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET); + port_cap->link_state = (field & 0x80) >> 7; + port_cap->supported_port_types = field & 3; + port_cap->suggested_type = (field >> 3) & 1; + port_cap->default_sense = (field >> 4) & 1; + port_cap->dmfs_optimized_state = (field >> 5) & 1; + MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET); + port_cap->ib_mtu = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET); + port_cap->max_port_width = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET); + port_cap->max_gids = 1 << (field >> 4); + port_cap->max_pkeys = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET); + port_cap->max_vl = field & 0xf; + port_cap->max_tc_eth = field >> 4; + MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET); + port_cap->log_max_macs = field & 0xf; + port_cap->log_max_vlans = field >> 4; + MLX4_GET(port_cap->eth_mtu, outbox, QUERY_PORT_ETH_MTU_OFFSET); + MLX4_GET(port_cap->def_mac, outbox, QUERY_PORT_MAC_OFFSET); + MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET); + port_cap->trans_type = field32 >> 24; + port_cap->vendor_oui = field32 & 0xffffff; + MLX4_GET(port_cap->wavelength, outbox, QUERY_PORT_WAVELENGTH_OFFSET); + MLX4_GET(port_cap->trans_code, outbox, QUERY_PORT_TRANS_CODE_OFFSET); } - mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n", - dev_cap->bmme_flags, dev_cap->reserved_lkey); - - /* - * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then - * we can't use any EQs whose doorbell falls on that page, - * even if the EQ itself isn't reserved. 
- */ - if (dev_cap->num_sys_eqs == 0) - dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4, - dev_cap->reserved_eqs); - else - dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SYS_EQS; - - mlx4_dbg(dev, "Max ICM size %lld MB\n", - (unsigned long long) dev_cap->max_icm_sz >> 20); - mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", - dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz); - mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", - dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz); - mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", - dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz); - mlx4_dbg(dev, "Num sys EQs: %d, max EQs: %d, reserved EQs: %d, entry size: %d\n", - dev_cap->num_sys_eqs, dev_cap->max_eqs, dev_cap->reserved_eqs, - dev_cap->eqc_entry_sz); - mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", - dev_cap->reserved_mrws, dev_cap->reserved_mtts); - mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", - dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars); - mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", - dev_cap->max_pds, dev_cap->reserved_mgms); - mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", - dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz); - mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n", - dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1], - dev_cap->max_port_width[1]); - mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n", - dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg); - mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n", - dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg); - mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz); - mlx4_dbg(dev, "Max basic counters: %d\n", dev_cap->max_basic_counters); - mlx4_dbg(dev, "Max extended counters: %d\n", dev_cap->max_extended_counters); - mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz); - - dump_dev_cap_flags(dev, dev_cap->flags); - dump_dev_cap_flags2(dev, dev_cap->flags2); - out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } +#define DEV_CAP_EXT_2_FLAG_PFC_COUNTERS (1 << 28) +#define DEV_CAP_EXT_2_FLAG_VLAN_CONTROL (1 << 26) +#define DEV_CAP_EXT_2_FLAG_80_VFS (1 << 21) +#define DEV_CAP_EXT_2_FLAG_FSM (1 << 20) + int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -974,22 +1259,74 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, u64 flags; int err = 0; u8 field; + u16 field16; + u32 bmme_flags, field32; + int real_port; + int slave_port; + int first_port; + struct mlx4_active_ports actv_ports; err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; - /* add port mng change event capability unconditionally to slaves */ + disable_unsupported_roce_caps(outbox->buf); + /* add port mng change event capability and disable mw type 1 + * unconditionally to slaves + */ MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV; + flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW; + actv_ports = mlx4_get_active_ports(dev, slave); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + for (slave_port = 0, real_port = first_port; + real_port < first_port + + bitmap_weight(actv_ports.ports, dev->caps.num_ports); + ++real_port, ++slave_port) { + if (flags & (MLX4_DEV_CAP_FLAG_WOL_PORT1 << real_port)) + flags |= 
MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port; + else + flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port); + } + for (; slave_port < dev->caps.num_ports; ++slave_port) + flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port); + + /* Not exposing RSS IP fragments to guests */ + flags &= ~MLX4_DEV_CAP_FLAG_RSS_IP_FRAG; MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VL_PORT_OFFSET); + field &= ~0x0F; + field |= bitmap_weight(actv_ports.ports, dev->caps.num_ports) & 0x0F; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VL_PORT_OFFSET); + + /* For guests, disable timestamp */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + + /* For guests, disable vxlan tunneling and QoS support */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VXLAN); + field &= 0xd7; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VXLAN); + + /* For guests, disable port BEACON */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_PORT_BEACON_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_PORT_BEACON_OFFSET); + /* For guests, report Blueflame disabled */ MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET); field &= 0x7f; MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); + /* For guests, disable mw type 2 and port remap*/ + MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; + bmme_flags &= ~MLX4_FLAG_PORT_REMAP; + MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + /* turn off device-managed steering capability if not enabled */ if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { MLX4_GET(field, outbox->buf, @@ -998,9 +1335,55 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); } + + /* turn off ipoib managed steering for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + field &= ~0x80; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + + /* turn off host side virt features (VST, FSM, etc) for guests */ + MLX4_GET(field32, outbox->buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + field32 &= ~(DEV_CAP_EXT_2_FLAG_VLAN_CONTROL | DEV_CAP_EXT_2_FLAG_80_VFS | + DEV_CAP_EXT_2_FLAG_FSM | DEV_CAP_EXT_2_FLAG_PFC_COUNTERS); + MLX4_PUT(outbox->buf, field32, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + + /* turn off QCN for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + field &= 0xfe; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + + /* turn off QP max-rate limiting for guests */ + field16 = 0; + MLX4_PUT(outbox->buf, field16, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET); + + /* turn off QoS per VF support for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + field &= 0xef; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + + /* turn off ignore FCS feature for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); + field &= 0xfb; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); + return 0; } +static void disable_unsupported_roce_caps(void *buf) +{ + u32 flags; + + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags &= ~(1UL << 31); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + flags &= ~(1UL << 24); + MLX4_PUT(buf, flags, 
QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + flags &= ~(MLX4_FLAG_ROCE_V1_V2); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); +} + int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -1012,19 +1395,30 @@ int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, u8 port_type; u16 short_field; int err; - int admin_link_state; + int port = mlx4_slave_convert_port(dev, slave, + vhcr->in_modifier & 0xFF); #define MLX4_VF_PORT_NO_LINK_SENSE_MASK 0xE0 #define MLX4_PORT_LINK_UP_MASK 0x80 #define QUERY_PORT_CUR_MAX_PKEY_OFFSET 0x0c #define QUERY_PORT_CUR_MAX_GID_OFFSET 0x0e + if (port < 0) + return -EINVAL; + + /* Protect against untrusted guests: enforce that this is the + * QUERY_PORT general query. + */ + if (vhcr->op_modifier || vhcr->in_modifier & ~0xFF) + return -EINVAL; + + vhcr->in_modifier = port; + err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (!err && dev->caps.function != slave) { - /* set slave default_mac address to be zero MAC */ def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac; MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET); @@ -1037,17 +1431,25 @@ int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, /* set port type to currently operating port type */ port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3); - admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state; - if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state) + if (0 /* IFLA_VF_LINK_STATE_ENABLE == admin_link_state */) port_type |= MLX4_PORT_LINK_UP_MASK; - else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state) + else if (1 /* IFLA_VF_LINK_STATE_DISABLE == admin_link_state */) port_type &= ~MLX4_PORT_LINK_UP_MASK; + else if (0 /* IFLA_VF_LINK_STATE_AUTO == admin_link_state && mlx4_is_bonded(dev) */) { + int other_port = (port == 1) ? 
2 : 1; + struct mlx4_port_cap port_cap; + + err = mlx4_QUERY_PORT(dev, other_port, &port_cap); + if (err) + goto out; + port_type |= (port_cap.link_state << 7); + } MLX4_PUT(outbox->buf, port_type, QUERY_PORT_SUPPORTED_TYPE_OFFSET); if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH) - short_field = mlx4_get_slave_num_gids(dev, slave); + short_field = mlx4_get_slave_num_gids(dev, slave, port); else short_field = 1; /* slave max gids */ MLX4_PUT(outbox->buf, short_field, @@ -1057,7 +1459,7 @@ int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, short_field, QUERY_PORT_CUR_MAX_PKEY_OFFSET); } - +out: return err; } @@ -1107,7 +1509,6 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE); pages = mailbox->buf; for (mlx4_icm_first(icm, &iter); @@ -1120,10 +1521,10 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) */ lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1; if (lg < MLX4_ICM_PAGE_SHIFT) { - mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n", - MLX4_ICM_PAGE_SIZE, - (unsigned long long) mlx4_icm_addr(&iter), - mlx4_icm_size(&iter)); + mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx)\n", + MLX4_ICM_PAGE_SIZE, + (unsigned long long) mlx4_icm_addr(&iter), + mlx4_icm_size(&iter)); err = -EINVAL; goto out; } @@ -1159,14 +1560,14 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) switch (op) { case MLX4_CMD_MAP_FA: - mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts); + mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW\n", tc, ts); break; case MLX4_CMD_MAP_ICM_AUX: - mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts); + mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux\n", tc, ts); break; case MLX4_CMD_MAP_ICM: - mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n", - tc, ts, (unsigned long long) virt - (ts << 10)); + mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM\n", + tc, ts, (unsigned long long) virt - (ts << 10)); break; } @@ -1252,14 +1653,13 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV || cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) { - mlx4_err(dev, "Installed FW has unsupported " - "command interface revision %d.\n", + mlx4_err(dev, "Installed FW has unsupported command interface revision %d\n", cmd_if_rev); mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n", (int) (dev->caps.fw_ver >> 32), (int) (dev->caps.fw_ver >> 16) & 0xffff, (int) dev->caps.fw_ver & 0xffff); - mlx4_err(dev, "This driver version supports only revisions %d to %d.\n", + mlx4_err(dev, "This driver version supports only revisions %d to %d\n", MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV); err = -ENODEV; goto out; @@ -1301,7 +1701,7 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) MLX4_GET(fw->clock_bar, outbox, QUERY_FW_CLOCK_BAR); fw->clock_bar = (fw->clock_bar >> 6) * 2; mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n", - fw->comm_bar, (unsigned long long)fw->comm_base); + fw->clock_bar, (unsigned long long)fw->clock_offset); /* * Round up number of system pages needed in case @@ -1343,7 +1743,7 @@ int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, return 0; } -static void get_board_id(void *vsd, char *board_id, char *vsdstr) +static void get_board_id(void *vsd, char 
*board_id) { int i; @@ -1351,16 +1751,9 @@ static void get_board_id(void *vsd, char *board_id, char *vsdstr) #define VSD_OFFSET_SIG2 0xde #define VSD_OFFSET_MLX_BOARD_ID 0xd0 #define VSD_OFFSET_TS_BOARD_ID 0x20 -#define VSD_LEN 0xd0 #define VSD_SIGNATURE_TOPSPIN 0x5ad - memset(vsdstr, 0, MLX4_VSD_LEN); - - for (i = 0; i < VSD_LEN / 4; i++) - ((u32 *)vsdstr)[i] = - swab32(*(u32 *)(vsd + i * 4)); - memset(board_id, 0, MLX4_BOARD_ID_LEN); if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && @@ -1372,9 +1765,19 @@ static void get_board_id(void *vsd, char *board_id, char *vsdstr) * swaps each 4-byte word before passing it back to * us. Therefore we need to swab it before printing. */ - for (i = 0; i < 4; ++i) - ((u32 *) board_id)[i] = - swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); + u32 *bid_u32 = (u32 *)board_id; + + for (i = 0; i < 4; ++i) { + typedef struct { u32 value; } __packed u64_p_t; + + u32 *addr; + u32 val; + + addr = (u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4); + val = ((u64_p_t *)addr)->value; + val = swab32(val); + ((u64_p_t *)&bid_u32[i])->value = val; + } } } @@ -1387,7 +1790,6 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) #define QUERY_ADAPTER_OUT_SIZE 0x100 #define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 #define QUERY_ADAPTER_VSD_OFFSET 0x20 -#define QUERY_ADAPTER_VSD_VENDOR_ID_OFFSET 0x1e mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -1401,11 +1803,8 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) MLX4_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); - adapter->vsd_vendor_id = be16_to_cpup((u16 *)outbox + - QUERY_ADAPTER_VSD_VENDOR_ID_OFFSET / 2); - get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, - adapter->board_id, adapter->vsd); + adapter->board_id); out: mlx4_free_cmd_mailbox(dev, mailbox); @@ -1416,13 +1815,18 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) { struct mlx4_cmd_mailbox *mailbox; __be32 *inbox; - u32 mw_enable; int err; + static const u8 a0_dmfs_hw_steering[] = { + [MLX4_STEERING_DMFS_A0_DEFAULT] = 0, + [MLX4_STEERING_DMFS_A0_DYNAMIC] = 1, + [MLX4_STEERING_DMFS_A0_STATIC] = 2, + [MLX4_STEERING_DMFS_A0_DISABLE] = 3 + }; #define INIT_HCA_IN_SIZE 0x200 -#define INIT_HCA_DRV_NAME_FOR_FW_MAX_SIZE 64 #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION 2 +#define INIT_HCA_VXLAN_OFFSET 0x0c #define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018 @@ -1434,6 +1838,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) #define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) #define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) +#define INIT_HCA_EQE_CQE_STRIDE_OFFSET (INIT_HCA_QPC_OFFSET + 0x3b) #define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) #define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) #define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) @@ -1448,10 +1853,10 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_UC_STEERING_OFFSET (INIT_HCA_MCAST_OFFSET + 0x18) #define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) #define INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN 0x6 -#define INIT_HCA_DRIVER_VERSION_OFFSET 0x140 #define INIT_HCA_FS_PARAM_OFFSET 0x1d0 #define INIT_HCA_FS_BASE_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x00) #define 
INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x12) +#define INIT_HCA_FS_A0_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x18) #define INIT_HCA_FS_LOG_TABLE_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x1b) #define INIT_HCA_FS_ETH_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x21) #define INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22) @@ -1460,7 +1865,6 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_TPT_OFFSET 0x0f0 #define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) #define INIT_HCA_TPT_MW_OFFSET (INIT_HCA_TPT_OFFSET + 0x08) -#define INIT_HCA_TPT_MW_ENABLE (1 << 31) #define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) #define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) #define INIT_HCA_CMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x18) @@ -1473,12 +1877,10 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) return PTR_ERR(mailbox); inbox = mailbox->buf; - memset(inbox, 0, INIT_HCA_IN_SIZE); - *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = - ((ilog2(cache_line_size()) - 4) << 5) | (1 << 4); + (ilog2(cache_line_size()) - 4) << 5; #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1); @@ -1495,18 +1897,18 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3); /* Enable QoS support if module parameter set */ - if (enable_qos) + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG && enable_qos) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); - /* Enable fast drop performance optimization */ - if (dev->caps.fast_drop) - *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 7); - /* enable counters */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4); - /* CX3 is capable of extending CQEs\EQEs from 32 to 64 bytes */ + /* Enable RSS spread to fragmented IP packets when supported */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_RSS_IP_FRAG) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 13); + + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) { *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29); dev->caps.eqe_size = 64; @@ -1519,22 +1921,31 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); dev->caps.cqe_size = 64; - dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } +#if 0 + /* XXX not currently supported by the FreeBSD's mlxen */ + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) && + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) { + dev->caps.eqe_size = cache_line_size(); + dev->caps.cqe_size = cache_line_size(); + dev->caps.eqe_factor = 0; + MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 | + (ilog2(dev->caps.eqe_size) - 5)), + INIT_HCA_EQE_CQE_STRIDE_OFFSET); + + /* User still need to know to support CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } +#endif + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) *(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 
/ 4) |= cpu_to_be32(1 << 31); - if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW) { - strncpy((u8 *)mailbox->buf + INIT_HCA_DRIVER_VERSION_OFFSET, - DRV_NAME_FOR_FW, - INIT_HCA_DRV_NAME_FOR_FW_MAX_SIZE - 1); - mlx4_dbg(dev, "Reporting Driver Version to FW: %s\n", - (u8 *)mailbox->buf + INIT_HCA_DRIVER_VERSION_OFFSET); - } - /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); @@ -1566,8 +1977,11 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) /* Enable Ethernet flow steering * with udp unicast and tcp unicast */ - MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), - INIT_HCA_FS_ETH_BITS_OFFSET); + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_STATIC) + MLX4_PUT(inbox, + (u8)(MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_ETH_BITS_OFFSET); MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET); /* Enable IPoIB flow steering @@ -1577,6 +1991,13 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) INIT_HCA_FS_IB_BITS_OFFSET); MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, INIT_HCA_FS_IB_NUM_ADDRS_OFFSET); + + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + MLX4_PUT(inbox, + ((u8)(a0_dmfs_hw_steering[dev->caps.dmfs_high_steer_mode] + << 6)), + INIT_HCA_FS_A0_OFFSET); } else { MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); MLX4_PUT(inbox, param->log_mc_entry_sz, @@ -1593,8 +2014,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) /* TPT attributes */ MLX4_PUT(inbox, param->dmpt_base, INIT_HCA_DMPT_BASE_OFFSET); - mw_enable = param->mw_enable ? INIT_HCA_TPT_MW_ENABLE : 0; - MLX4_PUT(inbox, mw_enable, INIT_HCA_TPT_MW_OFFSET); + MLX4_PUT(inbox, param->mw_enabled, INIT_HCA_TPT_MW_OFFSET); MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); MLX4_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); MLX4_PUT(inbox, param->cmpt_base, INIT_HCA_CMPT_BASE_OFFSET); @@ -1604,8 +2024,14 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, param->uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); - err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, - MLX4_CMD_NATIVE); + /* set parser VXLAN attributes */ + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) { + u8 parser_params = 0; + MLX4_PUT(inbox, parser_params, INIT_HCA_VXLAN_OFFSET); + } + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); @@ -1620,9 +2046,14 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox; __be32 *outbox; u32 dword_field; - u32 mw_enable; int err; u8 byte_field; + static const u8 a0_dmfs_query_hw_steering[] = { + [0] = MLX4_STEERING_DMFS_A0_DEFAULT, + [1] = MLX4_STEERING_DMFS_A0_DYNAMIC, + [2] = MLX4_STEERING_DMFS_A0_STATIC, + [3] = MLX4_STEERING_DMFS_A0_DISABLE + }; #define QUERY_HCA_GLOBAL_CAPS_OFFSET 0x04 #define QUERY_HCA_CORE_CLOCK_OFFSET 0x0c @@ -1668,6 +2099,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, else param->steering_mode = MLX4_STEERING_MODE_A0; } + + if (dword_field & (1 << 13)) + param->rss_ip_frags = 1; + /* steering attributes */ if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET); @@ -1675,6 +2110,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, 
INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); MLX4_GET(param->log_mc_table_sz, outbox, INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + MLX4_GET(byte_field, outbox, + INIT_HCA_FS_A0_OFFSET); + param->dmfs_high_steer_mode = + a0_dmfs_query_hw_steering[(byte_field >> 6) & 3]; } else { MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET); MLX4_GET(param->log_mc_entry_sz, outbox, @@ -1685,19 +2124,28 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); } - /* CX3 is capable of extending CQEs\EQEs from 32 to 64 bytes */ + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS); if (byte_field & 0x20) /* 64-bytes eqe enabled */ param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; if (byte_field & 0x40) /* 64-bytes cqe enabled */ param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET); + if (byte_field) { + param->dev_cap_enabled |= MLX4_DEV_CAP_EQE_STRIDE_ENABLED; + param->dev_cap_enabled |= MLX4_DEV_CAP_CQE_STRIDE_ENABLED; + param->cqe_size = 1 << ((byte_field & + MLX4_CQE_SIZE_MASK_STRIDE) + 5); + param->eqe_size = 1 << (((byte_field & + MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5); + } + /* TPT attributes */ MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); - MLX4_GET(mw_enable, outbox, INIT_HCA_TPT_MW_OFFSET); - param->mw_enable = (mw_enable & INIT_HCA_TPT_MW_ENABLE) == - INIT_HCA_TPT_MW_ENABLE; + MLX4_GET(param->mw_enabled, outbox, INIT_HCA_TPT_MW_OFFSET); MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET); MLX4_GET(param->mtt_base, outbox, INIT_HCA_MTT_BASE_OFFSET); MLX4_GET(param->cmpt_base, outbox, INIT_HCA_CMPT_BASE_OFFSET); @@ -1707,6 +2155,40 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); + /* phv_check enable */ + MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET); + if (byte_field & 0x2) + param->phv_check_en = 1; +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +static int mlx4_hca_core_clock_update(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *outbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "hca_core_clock mailbox allocation failed\n"); + return PTR_ERR(mailbox); + } + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_QUERY_HCA, + MLX4_CMD_TIME_CLASS_B, + !mlx4_is_slave(dev)); + if (err) { + mlx4_warn(dev, "hca_core_clock update failed\n"); + goto out; + } + + MLX4_GET(dev->caps.hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET); + out: mlx4_free_cmd_mailbox(dev, mailbox); @@ -1733,9 +2215,12 @@ int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); - int port = vhcr->in_modifier; + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier); int err; + if (port < 0) + return -EINVAL; + if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port)) return 0; @@ -1794,8 +2279,6 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) return PTR_ERR(mailbox); inbox = mailbox->buf; - memset(inbox, 0, INIT_PORT_IN_SIZE); - flags = 0; flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT; flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT; @@ -1816,6 +2299,9 @@ int 
mlx4_INIT_PORT(struct mlx4_dev *dev, int port) err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + mlx4_hca_core_clock_update(dev); + return err; } EXPORT_SYMBOL_GPL(mlx4_INIT_PORT); @@ -1827,9 +2313,12 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { struct mlx4_priv *priv = mlx4_priv(dev); - int port = vhcr->in_modifier; + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier); int err; + if (port < 0) + return -EINVAL; + if (!(priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))) return 0; @@ -1837,7 +2326,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { if (priv->mfunc.master.init_port_ref[port] == 1) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, - 1000, MLX4_CMD_NATIVE); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; } @@ -1848,7 +2337,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, if (!priv->mfunc.master.qp0_state[port].qp0_active && priv->mfunc.master.qp0_state[port].port_active) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, - 1000, MLX4_CMD_NATIVE); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); @@ -1863,17 +2352,188 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { - return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, - MLX4_CMD_WRAPPED); + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) { - return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, - MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } +struct mlx4_config_dev { + __be32 update_flags; + __be32 rsvd1[3]; + __be16 vxlan_udp_dport; + __be16 rsvd2; + __be16 roce_v2_entropy; + __be16 roce_v2_udp_dport; + __be32 roce_flags; + __be32 rsvd4[25]; + __be16 rsvd5; + u8 rsvd6; + u8 rx_checksum_val; +}; + +#define MLX4_VXLAN_UDP_DPORT (1 << 0) +#define MLX4_ROCE_V2_UDP_DPORT BIT(3) +#define MLX4_DISABLE_RX_PORT BIT(18) + +static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, config_dev, sizeof(*config_dev)); + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_CONFIG_DEV, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_CONFIG_DEV_get(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 1, MLX4_CMD_CONFIG_DEV, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + if (!err) + memcpy(config_dev, mailbox->buf, sizeof(*config_dev)); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +/* Conversion between the HW values and the actual functionality. + * The value represented by the array index, + * and the functionality determined by the flags. 
+ */ +static const u8 config_dev_csum_flags[] = { + [0] = 0, + [1] = MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP, + [2] = MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP | + MLX4_RX_CSUM_MODE_L4, + [3] = MLX4_RX_CSUM_MODE_L4 | + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP | + MLX4_RX_CSUM_MODE_MULTI_VLAN +}; + +int mlx4_config_dev_retrieval(struct mlx4_dev *dev, + struct mlx4_config_dev_params *params) +{ + struct mlx4_config_dev config_dev = {0}; + int err; + u8 csum_mask; + +#define CONFIG_DEV_RX_CSUM_MODE_MASK 0x7 +#define CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET 0 +#define CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET 4 + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CONFIG_DEV)) + return -ENOTSUPP; + + err = mlx4_CONFIG_DEV_get(dev, &config_dev); + if (err) + return err; + + csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET) & + CONFIG_DEV_RX_CSUM_MODE_MASK; + + if (csum_mask >= sizeof(config_dev_csum_flags)/sizeof(config_dev_csum_flags[0])) + return -EINVAL; + params->rx_csum_flags_port_1 = config_dev_csum_flags[csum_mask]; + + csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET) & + CONFIG_DEV_RX_CSUM_MODE_MASK; + + if (csum_mask >= sizeof(config_dev_csum_flags)/sizeof(config_dev_csum_flags[0])) + return -EINVAL; + params->rx_csum_flags_port_2 = config_dev_csum_flags[csum_mask]; + + params->vxlan_udp_dport = be16_to_cpu(config_dev.vxlan_udp_dport); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_config_dev_retrieval); + +int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_VXLAN_UDP_DPORT); + config_dev.vxlan_udp_dport = udp_port; + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port); + +#define CONFIG_DISABLE_RX_PORT BIT(15) +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_DISABLE_RX_PORT); + if (dis) + config_dev.roce_flags = + cpu_to_be32(CONFIG_DISABLE_RX_PORT); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} + +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT); + config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port); + +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2) +{ + struct mlx4_cmd_mailbox *mailbox; + struct { + __be32 v_port1; + __be32 v_port2; + } *v2p; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + v2p = mailbox->buf; + v2p->v_port1 = cpu_to_be32(port1); + v2p->v_port2 = cpu_to_be32(port2); + + err = mlx4_cmd(dev, mailbox->dma, 0, + MLX4_SET_PORT_VIRT2PHY, MLX4_CMD_VIRT_PORT_MAP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + + int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) { int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, @@ -1895,51 +2555,81 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) int mlx4_NOP(struct mlx4_dev *dev) { /* Input modifier of 0x1f means "finish as soon as possible." 
*/ - return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); } -int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length, - u8 op_modifier, u32 in_offset[], - u32 counter_out[]) +int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, + const u32 offset[], + u32 value[], size_t array_len, u8 port) { struct mlx4_cmd_mailbox *mailbox; u32 *outbox; + size_t i; int ret; - int i; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); + outbox = mailbox->buf; - ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier, + ret = mlx4_cmd_box(dev, 0, mailbox->dma, port, op_modifier, MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret) goto out; - for (i = 0; i < array_length; i++) { - if (in_offset[i] > MLX4_MAILBOX_SIZE) { + for (i = 0; i < array_len; i++) { + if (offset[i] > MLX4_MAILBOX_SIZE) { ret = -EINVAL; goto out; } - MLX4_GET(counter_out[i], outbox, in_offset[i]); + MLX4_GET(value[i], outbox, offset[i]); } out: mlx4_free_cmd_mailbox(dev, mailbox); return ret; } -EXPORT_SYMBOL_GPL(mlx4_query_diag_counters); +EXPORT_SYMBOL(mlx4_query_diag_counters); -int mlx4_MOD_STAT_CFG_wrapper(struct mlx4_dev *dev, int slave, - struct mlx4_vhcr *vhcr, - struct mlx4_cmd_mailbox *inbox, - struct mlx4_cmd_mailbox *outbox, - struct mlx4_cmd_info *cmd) +int mlx4_get_phys_port_id(struct mlx4_dev *dev) { - return -EPERM; + u8 port; + u32 *outbox; + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + u32 guid_hi, guid_lo; + int err, ret = 0; +#define MOD_STAT_CFG_PORT_OFFSET 8 +#define MOD_STAT_CFG_GUID_H 0X14 +#define MOD_STAT_CFG_GUID_L 0X1c + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + for (port = 1; port <= dev->caps.num_ports; port++) { + in_mod = port << MOD_STAT_CFG_PORT_OFFSET; + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 0x2, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Fail to get port %d uplink guid\n", + port); + ret = err; + } else { + MLX4_GET(guid_hi, outbox, MOD_STAT_CFG_GUID_H); + MLX4_GET(guid_lo, outbox, MOD_STAT_CFG_GUID_L); + dev->caps.phys_port_id[port] = (u64)guid_lo | + (u64)guid_hi << 32; + } + } + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; } #define MLX4_WOL_SETUP_MODE (5 << 28) @@ -1969,7 +2659,8 @@ enum { void mlx4_opreq_action(struct work_struct *work) { - struct mlx4_priv *priv = container_of(work, struct mlx4_priv, opreq_task); + struct mlx4_priv *priv = container_of(work, struct mlx4_priv, + opreq_task); struct mlx4_dev *dev = &priv->dev; int num_tasks = atomic_read(&priv->opreq_count); struct mlx4_cmd_mailbox *mailbox; @@ -1977,7 +2668,6 @@ void mlx4_opreq_action(struct work_struct *work) u32 *outbox; u32 modifier; u16 token; - u16 type_m; u16 type; int err; u32 num_qps; @@ -2003,34 +2693,42 @@ void mlx4_opreq_action(struct work_struct *work) MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) { - mlx4_err(dev, "Failed to retrieve required operation: %d\n", err); + mlx4_err(dev, "Failed to retrieve required operation: %d\n", + err); return; } MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET); - type_m = type >> 12; type &= 0xfff; switch (type) { case ADD_TO_MCG: - if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { - 
mlx4_warn(dev, "ADD MCG operation is not supported in " - "DEVICE_MANAGED steerign mode\n"); + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "ADD MCG operation is not supported in DEVICE_MANAGED steering mode\n"); err = EPERM; break; } - mgm = (struct mlx4_mgm *) ((u8 *) (outbox) + GET_OP_REQ_DATA_OFFSET); - num_qps = be32_to_cpu(mgm->members_count) & MGM_QPN_MASK; - rem_mcg = ((u8 *) (&mgm->members_count))[0] & 1; - prot = ((u8 *) (&mgm->members_count))[0] >> 6; + mgm = (struct mlx4_mgm *)((u8 *)(outbox) + + GET_OP_REQ_DATA_OFFSET); + num_qps = be32_to_cpu(mgm->members_count) & + MGM_QPN_MASK; + rem_mcg = ((u8 *)(&mgm->members_count))[0] & 1; + prot = ((u8 *)(&mgm->members_count))[0] >> 6; for (i = 0; i < num_qps; i++) { qp.qpn = be32_to_cpu(mgm->qp[i]); if (rem_mcg) - err = mlx4_multicast_detach(dev, &qp, mgm->gid, prot, 0); + err = mlx4_multicast_detach(dev, &qp, + mgm->gid, + prot, 0); else - err = mlx4_multicast_attach(dev, &qp, mgm->gid, mgm->gid[5] ,0, prot, NULL); + err = mlx4_multicast_attach(dev, &qp, + mgm->gid, + mgm->gid[5] + , 0, prot, + NULL); if (err) break; } @@ -2040,11 +2738,13 @@ void mlx4_opreq_action(struct work_struct *work) err = EINVAL; break; } - err = mlx4_cmd(dev, 0, ((u32) err | cpu_to_be32(token) << 16), 1, - MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + err = mlx4_cmd(dev, 0, ((u32) err | + (__force u32)cpu_to_be32(token) << 16), + 1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) { - mlx4_err(dev, "Failed to acknowledge required request: %d\n", err); + mlx4_err(dev, "Failed to acknowledge required request: %d\n", + err); goto out; } memset(outbox, 0, 0xffc); @@ -2054,3 +2754,300 @@ void mlx4_opreq_action(struct work_struct *work) out: mlx4_free_cmd_mailbox(dev, mailbox); } + +static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ +#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET 0x10 +#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET 0x20 +#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET 0x40 +#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET 0x70 + + u32 set_attr_mask, getresp_attr_mask; + u32 trap_attr_mask, traprepress_attr_mask; + + MLX4_GET(set_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n", + set_attr_mask); + + MLX4_GET(getresp_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n", + getresp_attr_mask); + + MLX4_GET(trap_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n", + trap_attr_mask); + + MLX4_GET(traprepress_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n", + traprepress_attr_mask); + + if (set_attr_mask && getresp_attr_mask && trap_attr_mask && + traprepress_attr_mask) + return 1; + + return 0; +} + +int mlx4_config_mad_demux(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + /* Check if mad_demux is supported */ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX"); + return -ENOMEM; + } + + /* Query mad_demux to find out which MADs are handled by internal sma */ + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */, + 
MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n", + err); + goto out; + } + + if (mlx4_check_smp_firewall_active(dev, mailbox)) + dev->flags |= MLX4_FLAG_SECURE_HOST; + + /* Config mad_demux to handle all MADs returned by the query above */ + err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err); + goto out; + } + + if (dev->flags & MLX4_FLAG_SECURE_HOST) + mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n"); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +/* Access Reg commands */ +enum mlx4_access_reg_masks { + MLX4_ACCESS_REG_STATUS_MASK = 0x7f, + MLX4_ACCESS_REG_METHOD_MASK = 0x7f, + MLX4_ACCESS_REG_LEN_MASK = 0x7ff +}; + +struct mlx4_access_reg { + __be16 constant1; + u8 status; + u8 resrvd1; + __be16 reg_id; + u8 method; + u8 constant2; + __be32 resrvd2[2]; + __be16 len_const; + __be16 resrvd3; +#define MLX4_ACCESS_REG_HEADER_SIZE (20) + u8 reg_data[MLX4_MAILBOX_SIZE-MLX4_ACCESS_REG_HEADER_SIZE]; +} __attribute__((__packed__)); + +/** + * mlx4_ACCESS_REG - Generic access reg command. + * @dev: mlx4_dev. + * @reg_id: register ID to access. + * @method: Access method Read/Write. + * @reg_len: register length to Read/Write in bytes. + * @reg_data: reg_data pointer to Read/Write From/To. + * + * Access ConnectX registers FW command. + * Returns 0 on success and copies outbox mlx4_access_reg data + * field into reg_data or a negative error code. + */ +static int mlx4_ACCESS_REG(struct mlx4_dev *dev, u16 reg_id, + enum mlx4_access_reg_method method, + u16 reg_len, void *reg_data) +{ + struct mlx4_cmd_mailbox *inbox, *outbox; + struct mlx4_access_reg *inbuf, *outbuf; + int err; + + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) + return PTR_ERR(inbox); + + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + mlx4_free_cmd_mailbox(dev, inbox); + return PTR_ERR(outbox); + } + + inbuf = inbox->buf; + outbuf = outbox->buf; + + inbuf->constant1 = cpu_to_be16(0x1<<11 | 0x4); + inbuf->constant2 = 0x1; + inbuf->reg_id = cpu_to_be16(reg_id); + inbuf->method = method & MLX4_ACCESS_REG_METHOD_MASK; + + reg_len = min(reg_len, (u16)(sizeof(inbuf->reg_data))); + inbuf->len_const = + cpu_to_be16(((reg_len/4 + 1) & MLX4_ACCESS_REG_LEN_MASK) | + ((0x3) << 12)); + + memcpy(inbuf->reg_data, reg_data, reg_len); + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, 0, 0, + MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + if (outbuf->status & MLX4_ACCESS_REG_STATUS_MASK) { + err = outbuf->status & MLX4_ACCESS_REG_STATUS_MASK; + mlx4_err(dev, + "MLX4_CMD_ACCESS_REG(%x) returned REG status (%x)\n", + reg_id, err); + goto out; + } + + memcpy(reg_data, outbuf->reg_data, reg_len); +out: + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return err; +} + +/* ConnectX registers IDs */ +enum mlx4_reg_id { + MLX4_REG_ID_PTYS = 0x5004, +}; + +/** + * mlx4_ACCESS_PTYS_REG - Access PTYs (Port Type and Speed) + * register + * @dev: mlx4_dev. + * @method: Access method Read/Write. + * @ptys_reg: PTYS register data pointer. + * + * Access ConnectX PTYS register, to Read/Write Port Type/Speed + * configuration + * Returns 0 on success or a negative error code. 
+ */ +int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev, + enum mlx4_access_reg_method method, + struct mlx4_ptys_reg *ptys_reg) +{ + return mlx4_ACCESS_REG(dev, MLX4_REG_ID_PTYS, + method, sizeof(*ptys_reg), ptys_reg); +} +EXPORT_SYMBOL_GPL(mlx4_ACCESS_PTYS_REG); + +int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_access_reg *inbuf = inbox->buf; + u8 method = inbuf->method & MLX4_ACCESS_REG_METHOD_MASK; + u16 reg_id = be16_to_cpu(inbuf->reg_id); + + if (slave != mlx4_master_func_num(dev) && + method == MLX4_ACCESS_REG_WRITE) + return -EPERM; + + if (reg_id == MLX4_REG_ID_PTYS) { + struct mlx4_ptys_reg *ptys_reg = + (struct mlx4_ptys_reg *)inbuf->reg_data; + + ptys_reg->local_port = + mlx4_slave_convert_port(dev, slave, + ptys_reg->local_port); + } + + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, vhcr->in_modifier, + 0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); +} + +static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit) +{ +#define SET_PORT_GEN_PHV_VALID 0x10 +#define SET_PORT_GEN_PHV_EN 0x80 + + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + + context->v_ignore_fcs |= SET_PORT_GEN_PHV_VALID; + if (phv_bit) + context->phv_en |= SET_PORT_GEN_PHV_EN; + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv) +{ + int err; + struct mlx4_func_cap func_cap; + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap); + if (!err) + *phv = func_cap.flags0 & QUERY_FUNC_CAP_PHV_BIT; + return err; +} +EXPORT_SYMBOL(get_phv_bit); + +int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val) +{ + int ret; + + if (mlx4_is_slave(dev)) + return -EPERM; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) { + ret = mlx4_SET_PORT_phv_bit(dev, port, new_val); + if (!ret) + dev->caps.phv_bit[port] = new_val; + return ret; + } + + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(set_phv_bit); + +void mlx4_replace_zero_macs(struct mlx4_dev *dev) +{ + int i; + u8 mac_addr[ETH_ALEN]; + + dev->port_random_macs = 0; + for (i = 1; i <= dev->caps.num_ports; ++i) + if (!dev->caps.def_mac[i] && + dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) { + random_ether_addr(mac_addr); + dev->port_random_macs |= 1 << i; + dev->caps.def_mac[i] = mlx4_mac_to_u64(mac_addr); + } +} +EXPORT_SYMBOL_GPL(mlx4_replace_zero_macs); diff --git a/sys/dev/mlx4/mlx4_core/mlx4_fw_qos.c b/sys/dev/mlx4/mlx4_core/mlx4_fw_qos.c new file mode 100644 index 000000000000..63bdbfb69b90 --- /dev/null +++ b/sys/dev/mlx4/mlx4_core/mlx4_fw_qos.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "fw_qos.h" +#include "fw.h" + +enum { + /* allocate vpp opcode modifiers */ + MLX4_ALLOCATE_VPP_ALLOCATE = 0x0, + MLX4_ALLOCATE_VPP_QUERY = 0x1 +}; + +enum { + /* set vport qos opcode modifiers */ + MLX4_SET_VPORT_QOS_SET = 0x0, + MLX4_SET_VPORT_QOS_QUERY = 0x1 +}; + +struct mlx4_set_port_prio2tc_context { + u8 prio2tc[4]; +}; + +struct mlx4_port_scheduler_tc_cfg_be { + __be16 pg; + __be16 bw_precentage; + __be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */ + __be16 max_bw_value; +}; + +struct mlx4_set_port_scheduler_context { + struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC]; +}; + +/* Granular Qos (per VF) section */ +struct mlx4_alloc_vpp_param { + __be32 availible_vpp; + __be32 vpp_p_up[MLX4_NUM_UP]; +}; + +struct mlx4_prio_qos_param { + __be32 bw_share; + __be32 max_avg_bw; + __be32 reserved; + __be32 enable; + __be32 reserved1[4]; +}; + +struct mlx4_set_vport_context { + __be32 reserved[8]; + struct mlx4_prio_qos_param qos_p_up[MLX4_NUM_UP]; +}; + +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_prio2tc_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + context = mailbox->buf; + + for (i = 0; i < MLX4_NUM_UP; i += 2) + context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1]; + + in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC); + +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_scheduler_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + context = mailbox->buf; + + for (i = 0; i < MLX4_NUM_TC; i++) { + struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i]; + u16 r; + + if (ratelimit && ratelimit[i]) { + if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) { + r = ratelimit[i]; + tc->max_bw_units = + 
htons(MLX4_RATELIMIT_100M_UNITS); + } else { + r = ratelimit[i] / 10; + tc->max_bw_units = + htons(MLX4_RATELIMIT_1G_UNITS); + } + tc->max_bw_value = htons(r); + } else { + tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT); + tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS); + } + + tc->pg = htons(pg[i]); + tc->bw_precentage = htons(tc_tx_bw[i]); + } + + in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER); + +int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port, + u16 *availible_vpp, u8 *vpp_p_up) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_alloc_vpp_param *out_param; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + out_param = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, + MLX4_ALLOCATE_VPP_QUERY, + MLX4_CMD_ALLOCATE_VPP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto out; + + /* Total number of supported VPPs */ + *availible_vpp = (u16)be32_to_cpu(out_param->availible_vpp); + + for (i = 0; i < MLX4_NUM_UP; i++) + vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} +EXPORT_SYMBOL(mlx4_ALLOCATE_VPP_get); + +int mlx4_ALLOCATE_VPP_set(struct mlx4_dev *dev, u8 port, u8 *vpp_p_up) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_alloc_vpp_param *in_param; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + in_param = mailbox->buf; + + for (i = 0; i < MLX4_NUM_UP; i++) + in_param->vpp_p_up[i] = cpu_to_be32(vpp_p_up[i]); + + err = mlx4_cmd(dev, mailbox->dma, port, + MLX4_ALLOCATE_VPP_ALLOCATE, + MLX4_CMD_ALLOCATE_VPP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_ALLOCATE_VPP_set); + +int mlx4_SET_VPORT_QOS_get(struct mlx4_dev *dev, u8 port, u8 vport, + struct mlx4_vport_qos_param *out_param) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_vport_context *ctx; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ctx = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, (vport << 8) | port, + MLX4_SET_VPORT_QOS_QUERY, + MLX4_CMD_SET_VPORT_QOS, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto out; + + for (i = 0; i < MLX4_NUM_UP; i++) { + out_param[i].bw_share = be32_to_cpu(ctx->qos_p_up[i].bw_share); + out_param[i].max_avg_bw = + be32_to_cpu(ctx->qos_p_up[i].max_avg_bw); + out_param[i].enable = + !!(be32_to_cpu(ctx->qos_p_up[i].enable) & 31); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} +EXPORT_SYMBOL(mlx4_SET_VPORT_QOS_get); + +int mlx4_SET_VPORT_QOS_set(struct mlx4_dev *dev, u8 port, u8 vport, + struct mlx4_vport_qos_param *in_param) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_vport_context *ctx; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ctx = mailbox->buf; + + for (i = 0; i < MLX4_NUM_UP; i++) { + ctx->qos_p_up[i].bw_share = cpu_to_be32(in_param[i].bw_share); + ctx->qos_p_up[i].max_avg_bw = + cpu_to_be32(in_param[i].max_avg_bw); + ctx->qos_p_up[i].enable = + cpu_to_be32(in_param[i].enable << 31); + } + + err = mlx4_cmd(dev, mailbox->dma, (vport << 8) | port, + 
MLX4_SET_VPORT_QOS_SET, + MLX4_CMD_SET_VPORT_QOS, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_VPORT_QOS_set); diff --git a/sys/dev/mlx4/mlx4_core/mlx4_icm.c b/sys/dev/mlx4/mlx4_core/mlx4_icm.c index 02af1589ec00..3e3b25a0f6b6 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_icm.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_icm.c @@ -57,7 +57,7 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu int i; if (chunk->nsg > 0) - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + pci_unmap_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); for (i = 0; i < chunk->npages; ++i) @@ -70,7 +70,8 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk * int i; for (i = 0; i < chunk->npages; ++i) - dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + dma_free_coherent(&dev->persist->pdev->dev, + chunk->mem[i].length, lowmem_page_address(sg_page(&chunk->mem[i])), sg_dma_address(&chunk->mem[i])); } @@ -135,10 +136,12 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, /* We use sg_set_buf for coherent allocs, which assumes low memory */ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); - icm = kmalloc_node(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), + icm = kmalloc_node(sizeof(*icm), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), dev->numa_node); if (!icm) { - icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + icm = kmalloc(sizeof(*icm), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!icm) return NULL; } @@ -150,12 +153,14 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, while (npages > 0) { if (!chunk) { - chunk = kmalloc_node(sizeof *chunk, - gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), + chunk = kmalloc_node(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | + __GFP_NOWARN), dev->numa_node); if (!chunk) { - chunk = kmalloc(sizeof *chunk, - gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + chunk = kmalloc(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | + __GFP_NOWARN)); if (!chunk) goto fail; } @@ -170,7 +175,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, --cur_order; if (coherent) - ret = mlx4_alloc_icm_coherent(&dev->pdev->dev, + ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev, &chunk->mem[chunk->npages], cur_order, gfp_mask); else @@ -190,7 +195,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, if (coherent) ++chunk->nsg; else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -205,7 +210,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, } if (!coherent && chunk) { - chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -242,7 +247,8 @@ int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + gfp_t gfp) { u32 i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); @@ -256,7 +262,7 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) } table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, - (table->lowmem ? 
GFP_KERNEL : GFP_HIGHUSER) | + (table->lowmem ? gfp : GFP_HIGHUSER) | __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; @@ -289,14 +295,10 @@ void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) if (--table->icm[i]->refcount == 0) { offset = (u64) i * MLX4_TABLE_CHUNK_SIZE; - - if (!mlx4_UNMAP_ICM(dev, table->virt + offset, - MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE)) { - mlx4_free_icm(dev, table->icm[i], table->coherent); - table->icm[i] = NULL; - } else { - pr_warn("mlx4_core: mlx4_UNMAP_ICM failed.\n"); - } + mlx4_UNMAP_ICM(dev, table->virt + offset, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; } mutex_unlock(&table->mutex); @@ -357,7 +359,7 @@ int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 i; for (i = start; i <= end; i += inc) { - err = mlx4_table_get(dev, table, i); + err = mlx4_table_get(dev, table, i, GFP_KERNEL); if (err) goto fail; } @@ -383,7 +385,7 @@ void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, } int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, - u64 virt, int obj_size, u64 nobj, int reserved, + u64 virt, int obj_size, u32 nobj, int reserved, int use_lowmem, int use_coherent) { int obj_per_chunk; @@ -393,7 +395,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, u64 size; obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size; - num_icm = div_u64((nobj + obj_per_chunk - 1), obj_per_chunk); + num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk; table->icm = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL); if (!table->icm) @@ -436,15 +438,11 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, err: for (i = 0; i < num_icm; ++i) if (table->icm[i]) { - if (!mlx4_UNMAP_ICM(dev, - virt + i * MLX4_TABLE_CHUNK_SIZE, - MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE)) { - mlx4_free_icm(dev, table->icm[i], use_coherent); - } else { - pr_warn("mlx4_core: mlx4_UNMAP_ICM failed.\n"); - return -ENOMEM; - } + mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], use_coherent); } + kfree(table->icm); return -ENOMEM; @@ -452,22 +450,14 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table) { - int i, err = 0; + int i; for (i = 0; i < table->num_icm; ++i) if (table->icm[i]) { - err = mlx4_UNMAP_ICM(dev, - table->virt + i * MLX4_TABLE_CHUNK_SIZE, - MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - if (!err) { - mlx4_free_icm(dev, table->icm[i], - table->coherent); - } else { - pr_warn("mlx4_core: mlx4_UNMAP_ICM failed.\n"); - break; - } + mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent); } - if (!err) - kfree(table->icm); + kfree(table->icm); } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_intf.c b/sys/dev/mlx4/mlx4_core/mlx4_intf.c index 141629801d2f..2791cb1ce717 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_intf.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_intf.c @@ -38,6 +38,7 @@ struct mlx4_device_context { struct list_head list; + struct list_head bond_list; struct mlx4_interface *intf; void *context; }; @@ -61,6 +62,8 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv) spin_lock_irq(&priv->ctx_lock); 
list_add_tail(&dev_ctx->list, &priv->ctx_list); spin_unlock_irq(&priv->ctx_lock); + if (intf->activate) + intf->activate(&priv->dev, dev_ctx->context); } else kfree(dev_ctx); } @@ -91,8 +94,14 @@ int mlx4_register_interface(struct mlx4_interface *intf) mutex_lock(&intf_mutex); list_add_tail(&intf->list, &intf_list); - list_for_each_entry(priv, &dev_list, dev_list) + list_for_each_entry(priv, &dev_list, dev_list) { + if (mlx4_is_mfunc(&priv->dev) && (intf->flags & MLX4_INTFF_BONDING)) { + mlx4_dbg(&priv->dev, + "SRIOV, disabling HA mode for intf proto %d\n", intf->protocol); + intf->flags &= ~MLX4_INTFF_BONDING; + } mlx4_add_device(intf, priv); + } mutex_unlock(&intf_mutex); @@ -115,6 +124,58 @@ void mlx4_unregister_interface(struct mlx4_interface *intf) } EXPORT_SYMBOL_GPL(mlx4_unregister_interface); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx = NULL, *temp_dev_ctx; + unsigned long flags; + int ret; + LIST_HEAD(bond_list); + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -ENOTSUPP; + + ret = mlx4_disable_rx_port_check(dev, enable); + if (ret) { + mlx4_err(dev, "Fail to %s rx port check\n", + enable ? "enable" : "disable"); + return ret; + } + if (enable) { + dev->flags |= MLX4_FLAG_BONDED; + } else { + ret = mlx4_virt2phy_port_map(dev, 1, 2); + if (ret) { + mlx4_err(dev, "Fail to reset port map\n"); + return ret; + } + dev->flags &= ~MLX4_FLAG_BONDED; + } + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_for_each_entry_safe(dev_ctx, temp_dev_ctx, &priv->ctx_list, list) { + if (dev_ctx->intf->flags & MLX4_INTFF_BONDING) { + list_add_tail(&dev_ctx->bond_list, &bond_list); + list_del(&dev_ctx->list); + } + } + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &bond_list, bond_list) { + dev_ctx->intf->remove(dev, dev_ctx->context); + dev_ctx->context = dev_ctx->intf->add(dev); + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + mlx4_dbg(dev, "Inrerface for protocol %d restarted with when bonded mode is %s\n", + dev_ctx->intf->protocol, enable ? 
+ "enabled" : "disabled"); + } + return 0; +} + void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, unsigned long param) { @@ -138,13 +199,13 @@ int mlx4_register_device(struct mlx4_dev *dev) mutex_lock(&intf_mutex); + dev->persist->interface_state |= MLX4_INTERFACE_STATE_UP; list_add_tail(&priv->dev_list, &dev_list); list_for_each_entry(intf, &intf_list, list) mlx4_add_device(intf, priv); mutex_unlock(&intf_mutex); - if (!mlx4_is_slave(dev)) - mlx4_start_catas_poll(dev); + mlx4_start_catas_poll(dev); return 0; } @@ -154,14 +215,17 @@ void mlx4_unregister_device(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_interface *intf; - if (!mlx4_is_slave(dev)) - mlx4_stop_catas_poll(dev); + if (!(dev->persist->interface_state & MLX4_INTERFACE_STATE_UP)) + return; + + mlx4_stop_catas_poll(dev); mutex_lock(&intf_mutex); list_for_each_entry(intf, &intf_list, list) mlx4_remove_device(intf, priv); - list_del_init(&priv->dev_list); + list_del(&priv->dev_list); + dev->persist->interface_state &= ~MLX4_INTERFACE_STATE_UP; mutex_unlock(&intf_mutex); } @@ -186,3 +250,4 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int return result; } EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev); + diff --git a/sys/dev/mlx4/mlx4_core/mlx4_main.c b/sys/dev/mlx4/mlx4_core/mlx4_main.c index bd2e19680059..dd650033d135 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_main.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_main.c @@ -46,6 +46,8 @@ #include #include #include +#include +#include #include #include @@ -55,7 +57,9 @@ #include "icm.h" #include -/* Mellanox ConnectX HCA low-level driver */ +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver"); +MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *mlx4_wq; @@ -71,7 +75,7 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); static int msi_x = 1; module_param(msi_x, int, 0444); -MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x (non-SRIOV only)"); +MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #else /* CONFIG_PCI_MSI */ @@ -79,71 +83,19 @@ MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number o #endif /* CONFIG_PCI_MSI */ -static int enable_sys_tune = 0; -module_param(enable_sys_tune, int, 0444); -MODULE_PARM_DESC(enable_sys_tune, "Tune the cpu's for better performance (default 0)"); +static uint8_t num_vfs[3] = {0, 0, 0}; +static int num_vfs_argc; +module_param_array(num_vfs, byte , &num_vfs_argc, 0444); +MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0\n" + "num_vfs=port1,port2,port1+2"); -int mlx4_blck_lb = 1; -module_param_named(block_loopback, mlx4_blck_lb, int, 0644); -MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0 " - "(default: 1)"); -enum { - DEFAULT_DOMAIN = 0, - BDF_STR_SIZE = 8, /* bb:dd.f- */ - DBDF_STR_SIZE = 13 /* mmmm:bb:dd.f- */ -}; - -enum { - NUM_VFS, - PROBE_VF, - PORT_TYPE_ARRAY -}; - -enum { - VALID_DATA, - INVALID_DATA, - INVALID_STR -}; - -struct param_data { - int id; - struct mlx4_dbdf2val_lst dbdf2val; -}; - -static struct param_data num_vfs = { - .id = NUM_VFS, - .dbdf2val = { - .name = "num_vfs param", - .num_vals = 1, - .def_val = {0}, - .range = {0, MLX4_MAX_NUM_VF} - } -}; -module_param_string(num_vfs, num_vfs.dbdf2val.str, - sizeof(num_vfs.dbdf2val.str), 0444); -MODULE_PARM_DESC(num_vfs, - "Either single value (e.g. 
'5') to define uniform num_vfs value for all devices functions\n" - "\t\tor a string to map device function numbers to their num_vfs values (e.g. '0000:04:00.0-5,002b:1c:0b.a-15').\n" - "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for num_vfs value (e.g. 15)."); - -static struct param_data probe_vf = { - .id = PROBE_VF, - .dbdf2val = { - .name = "probe_vf param", - .num_vals = 1, - .def_val = {0}, - .range = {0, MLX4_MAX_NUM_VF} - } -}; -module_param_string(probe_vf, probe_vf.dbdf2val.str, - sizeof(probe_vf.dbdf2val.str), 0444); -MODULE_PARM_DESC(probe_vf, - "Either single value (e.g. '3') to define uniform number of VFs to probe by the pf driver for all devices functions\n" - "\t\tor a string to map device function numbers to their probe_vf values (e.g. '0000:04:00.0-3,002b:1c:0b.a-13').\n" - "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for probe_vf value (e.g. 13)."); +static uint8_t probe_vf[3] = {0, 0, 0}; +static int probe_vfs_argc; +module_param_array(probe_vf, byte, &probe_vfs_argc, 0444); +MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)\n" + "probe_vf=port1,port2,port1+2"); int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; - module_param_named(log_num_mgm_entry_size, mlx4_log_num_mgm_entry_size, int, 0444); MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" @@ -153,28 +105,45 @@ MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" " To activate device managed" " flow steering when available, set to -1"); -static int high_rate_steer; -module_param(high_rate_steer, int, 0444); -MODULE_PARM_DESC(high_rate_steer, "Enable steering mode for higher packet rate" - " (default off)"); - -static int fast_drop; -module_param_named(fast_drop, fast_drop, int, 0444); -MODULE_PARM_DESC(fast_drop, - "Enable fast packet drop when no receive WQEs are posted"); - -int mlx4_enable_64b_cqe_eqe = 1; -module_param_named(enable_64b_cqe_eqe, mlx4_enable_64b_cqe_eqe, int, 0644); +static bool enable_64b_cqe_eqe = true; +module_param(enable_64b_cqe_eqe, bool, 0444); MODULE_PARM_DESC(enable_64b_cqe_eqe, - "Enable 64 byte CQEs/EQEs when the FW supports this if non-zero (default: 1)"); + "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)"); -#define HCA_GLOBAL_CAP_MASK 0 +static bool enable_4k_uar; +module_param(enable_4k_uar, bool, 0444); +MODULE_PARM_DESC(enable_4k_uar, + "Enable using 4K UAR. 
Should not be enabled if have VFs which do not support 4K UARs (default: false)"); -#define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE +#define PF_CONTEXT_BEHAVIOUR_MASK (MLX4_FUNC_CAP_64B_EQE_CQE | \ + MLX4_FUNC_CAP_EQE_CQE_STRIDE | \ + MLX4_FUNC_CAP_DMFS_A0_STATIC) -static char mlx4_version[] __devinitdata = - DRV_NAME ": Mellanox ConnectX VPI driver v" - DRV_VERSION "\n"; +#define RESET_PERSIST_MASK_FLAGS (MLX4_FLAG_SRIOV) + +static char mlx4_version[] = + DRV_NAME ": Mellanox ConnectX core driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static struct mlx4_profile default_profile = { + .num_qp = 1 << 18, + .num_srq = 1 << 16, + .rdmarc_per_qp = 1 << 4, + .num_cq = 1 << 16, + .num_mcg = 1 << 13, + .num_mpt = 1 << 19, + .num_mtt = 1 << 20, /* It is really num mtt segements */ +}; + +static struct mlx4_profile low_mem_profile = { + .num_qp = 1 << 17, + .num_srq = 1 << 6, + .rdmarc_per_qp = 1 << 4, + .num_cq = 1 << 8, + .num_mcg = 1 << 8, + .num_mpt = 1 << 9, + .num_mtt = 1 << 7, +}; static int log_num_mac = 7; module_param_named(log_num_mac, log_num_mac, int, 0444); @@ -182,33 +151,21 @@ MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); static int log_num_vlan; module_param_named(log_num_vlan, log_num_vlan, int, 0444); -MODULE_PARM_DESC(log_num_vlan, - "(Obsolete) Log2 max number of VLANs per ETH port (0-7)"); +MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)"); /* Log2 max number of VLANs per ETH port (0-7) */ #define MLX4_LOG_NUM_VLANS 7 +#define MLX4_MIN_LOG_NUM_VLANS 0 +#define MLX4_MIN_LOG_NUM_MAC 1 -int log_mtts_per_seg = ilog2(1); +static bool use_prio; +module_param_named(use_prio, use_prio, bool, 0444); +MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports (deprecated)"); + +int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); -MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment " - "(0-7) (default: 0)"); - -static struct param_data port_type_array = { - .id = PORT_TYPE_ARRAY, - .dbdf2val = { - .name = "port_type_array param", - .num_vals = 2, - .def_val = {MLX4_PORT_TYPE_ETH, MLX4_PORT_TYPE_ETH}, - .range = {MLX4_PORT_TYPE_IB, MLX4_PORT_TYPE_NA} - } -}; -module_param_string(port_type_array, port_type_array.dbdf2val.str, - sizeof(port_type_array.dbdf2val.str), 0444); -MODULE_PARM_DESC(port_type_array, - "Either pair of values (e.g. '1,2') to define uniform port1/port2 types configuration for all devices functions\n" - "\t\tor a string to map device function numbers to their pair of port types values (e.g. '0000:04:00.0-1;2,002b:1c:0b.a-1;1').\n" - "\t\tValid port types: 1-ib, 2-eth, 3-auto, 4-N/A\n" - "\t\tIn case that only one port is available use the N/A port type for port2 (e.g '1,4')."); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); +static int port_type_array[2] = {MLX4_PORT_TYPE_NONE, MLX4_PORT_TYPE_NONE}; struct mlx4_port_config { struct list_head list; @@ -216,345 +173,20 @@ struct mlx4_port_config { struct pci_dev *pdev; }; -#define MLX4_LOG_NUM_MTT 20 -/* We limit to 30 as of a bit map issue which uses int and not uint. - see mlx4_buddy_init -> bitmap_zero which gets int. 
-*/ -#define MLX4_MAX_LOG_NUM_MTT 30 -static struct mlx4_profile mod_param_profile = { - .num_qp = 19, - .num_srq = 16, - .rdmarc_per_qp = 4, - .num_cq = 16, - .num_mcg = 13, - .num_mpt = 19, - .num_mtt_segs = 0, /* max(20, 2*MTTs for host memory)) */ -}; +static atomic_t pf_loading = ATOMIC_INIT(0); -module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444); -MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA (default: 19)"); - -module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444); -MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA " - "(default: 16)"); - -module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, - 0444); -MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP " - "(default: 4)"); - -module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444); -MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA (default: 16)"); - -module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444); -MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA " - "(default: 13)"); - -module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444); -MODULE_PARM_DESC(log_num_mpt, - "log maximum number of memory protection table entries per " - "HCA (default: 19)"); - -module_param_named(log_num_mtt, mod_param_profile.num_mtt_segs, int, 0444); -MODULE_PARM_DESC(log_num_mtt, - "log maximum number of memory translation table segments per " - "HCA (default: max(20, 2*MTTs for register all of the host memory limited to 30))"); - -enum { - MLX4_IF_STATE_BASIC, - MLX4_IF_STATE_EXTENDED -}; - -static inline u64 dbdf_to_u64(int domain, int bus, int dev, int fn) +static inline void mlx4_set_num_reserved_uars(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) { - return (domain << 20) | (bus << 12) | (dev << 4) | fn; -} - -static inline void pr_bdf_err(const char *dbdf, const char *pname) -{ - pr_warn("mlx4_core: '%s' is not valid bdf in '%s'\n", dbdf, pname); -} - -static inline void pr_val_err(const char *dbdf, const char *pname, - const char *val) -{ - pr_warn("mlx4_core: value '%s' of bdf '%s' in '%s' is not valid\n" - , val, dbdf, pname); -} - -static inline void pr_out_of_range_bdf(const char *dbdf, int val, - struct mlx4_dbdf2val_lst *dbdf2val) -{ - pr_warn("mlx4_core: value %d in bdf '%s' of '%s' is out of its valid range (%d,%d)\n" - , val, dbdf, dbdf2val->name , dbdf2val->range.min, - dbdf2val->range.max); -} - -static inline void pr_out_of_range(struct mlx4_dbdf2val_lst *dbdf2val) -{ - pr_warn("mlx4_core: value of '%s' is out of its valid range (%d,%d)\n" - , dbdf2val->name , dbdf2val->range.min, dbdf2val->range.max); -} - -static inline int is_in_range(int val, struct mlx4_range *r) -{ - return (val >= r->min && val <= r->max); -} - -static int update_defaults(struct param_data *pdata) -{ - long int val[MLX4_MAX_BDF_VALS]; - int ret; - char *t, *p = pdata->dbdf2val.str; - char sval[32]; - int val_len; - - if (!strlen(p) || strchr(p, ':') || strchr(p, '.') || strchr(p, ';')) - return INVALID_STR; - - switch (pdata->id) { - case PORT_TYPE_ARRAY: - t = strchr(p, ','); - if (!t || t == p || (t - p) > sizeof(sval)) - return INVALID_STR; - - val_len = t - p; - strncpy(sval, p, val_len); - sval[val_len] = 0; - - ret = kstrtol(sval, 0, &val[0]); - if (ret == -EINVAL) - return INVALID_STR; - if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) { - pr_out_of_range(&pdata->dbdf2val); - return INVALID_DATA; - } - - ret = kstrtol(t + 1, 0, &val[1]); - if (ret 
== -EINVAL) - return INVALID_STR; - if (ret || !is_in_range(val[1], &pdata->dbdf2val.range)) { - pr_out_of_range(&pdata->dbdf2val); - return INVALID_DATA; - } - - pdata->dbdf2val.tbl[0].val[0] = val[0]; - pdata->dbdf2val.tbl[0].val[1] = val[1]; - break; - - case NUM_VFS: - case PROBE_VF: - ret = kstrtol(p, 0, &val[0]); - if (ret == -EINVAL) - return INVALID_STR; - if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) { - pr_out_of_range(&pdata->dbdf2val); - return INVALID_DATA; - } - pdata->dbdf2val.tbl[0].val[0] = val[0]; - break; - } - pdata->dbdf2val.tbl[1].dbdf = MLX4_ENDOF_TBL; - - return VALID_DATA; -} - -int mlx4_fill_dbdf2val_tbl(struct mlx4_dbdf2val_lst *dbdf2val_lst) -{ - int domain, bus, dev, fn; - u64 dbdf; - char *p, *t, *v; - char tmp[32]; - char sbdf[32]; - char sep = ','; - int j, k, str_size, i = 1; - int prfx_size; - - p = dbdf2val_lst->str; - - for (j = 0; j < dbdf2val_lst->num_vals; j++) - dbdf2val_lst->tbl[0].val[j] = dbdf2val_lst->def_val[j]; - dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL; - - str_size = strlen(dbdf2val_lst->str); - - if (str_size == 0) - return 0; - - while (strlen(p)) { - prfx_size = BDF_STR_SIZE; - sbdf[prfx_size] = 0; - strncpy(sbdf, p, prfx_size); - domain = DEFAULT_DOMAIN; - if (sscanf(sbdf, "%02x:%02x.%x-", &bus, &dev, &fn) != 3) { - prfx_size = DBDF_STR_SIZE; - sbdf[prfx_size] = 0; - strncpy(sbdf, p, prfx_size); - if (sscanf(sbdf, "%04x:%02x:%02x.%x-", &domain, &bus, - &dev, &fn) != 4) { - pr_bdf_err(sbdf, dbdf2val_lst->name); - goto err; - } - sprintf(tmp, "%04x:%02x:%02x.%x-", domain, bus, dev, - fn); - } else { - sprintf(tmp, "%02x:%02x.%x-", bus, dev, fn); - } - - if (strnicmp(sbdf, tmp, sizeof(tmp))) { - pr_bdf_err(sbdf, dbdf2val_lst->name); - goto err; - } - - dbdf = dbdf_to_u64(domain, bus, dev, fn); - - for (j = 1; j < i; j++) - if (dbdf2val_lst->tbl[j].dbdf == dbdf) { - pr_warn("mlx4_core: in '%s', %s appears multiple times\n" - , dbdf2val_lst->name, sbdf); - goto err; - } - - if (i >= MLX4_DEVS_TBL_SIZE) { - pr_warn("mlx4_core: Too many devices in '%s'\n" - , dbdf2val_lst->name); - goto err; - } - - p += prfx_size; - t = strchr(p, sep); - t = t ? t : p + strlen(p); - if (p >= t) { - pr_val_err(sbdf, dbdf2val_lst->name, ""); - goto err; - } - - for (k = 0; k < dbdf2val_lst->num_vals; k++) { - char sval[32]; - long int val; - int ret, val_len; - char vsep = ';'; - - v = (k == dbdf2val_lst->num_vals - 1) ? t : strchr(p, vsep); - if (!v || v > t || v == p || (v - p) > sizeof(sval)) { - pr_val_err(sbdf, dbdf2val_lst->name, p); - goto err; - } - val_len = v - p; - strncpy(sval, p, val_len); - sval[val_len] = 0; - - ret = kstrtol(sval, 0, &val); - if (ret) { - if (strchr(p, vsep)) - pr_warn("mlx4_core: too many vals in bdf '%s' of '%s'\n" - , sbdf, dbdf2val_lst->name); - else - pr_val_err(sbdf, dbdf2val_lst->name, - sval); - goto err; - } - if (!is_in_range(val, &dbdf2val_lst->range)) { - pr_out_of_range_bdf(sbdf, val, dbdf2val_lst); - goto err; - } - - dbdf2val_lst->tbl[i].val[k] = val; - p = v; - if (p[0] == vsep) - p++; - } - - dbdf2val_lst->tbl[i].dbdf = dbdf; - if (strlen(p)) { - if (p[0] != sep) { - pr_warn("mlx4_core: expect separator '%c' before '%s' in '%s'\n" - , sep, p, dbdf2val_lst->name); - goto err; - } - p++; - } - i++; - if (i < MLX4_DEVS_TBL_SIZE) - dbdf2val_lst->tbl[i].dbdf = MLX4_ENDOF_TBL; - } - - return 0; - -err: - dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL; - pr_warn("mlx4_core: The value of '%s' is incorrect. 
The value is discarded!\n" - , dbdf2val_lst->name); - - return -EINVAL; -} -EXPORT_SYMBOL(mlx4_fill_dbdf2val_tbl); - -int mlx4_get_val(struct mlx4_dbdf2val *tbl, struct pci_dev *pdev, int idx, - int *val) -{ - u64 dbdf; - int i = 1; - - *val = tbl[0].val[idx]; - if (!pdev) - return -EINVAL; - - dbdf = dbdf_to_u64(pci_get_domain(pdev->dev.bsddev), pci_get_bus(pdev->dev.bsddev), - PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); - - while ((i < MLX4_DEVS_TBL_SIZE) && (tbl[i].dbdf != MLX4_ENDOF_TBL)) { - if (tbl[i].dbdf == dbdf) { - *val = tbl[i].val[idx]; - return 0; - } - i++; - } - - return 0; -} -EXPORT_SYMBOL(mlx4_get_val); - -static void process_mod_param_profile(struct mlx4_profile *profile) -{ - vm_size_t hwphyssz; - hwphyssz = 0; - TUNABLE_ULONG_FETCH("hw.realmem", (u_long *) &hwphyssz); - - profile->num_qp = 1 << mod_param_profile.num_qp; - profile->num_srq = 1 << mod_param_profile.num_srq; - profile->rdmarc_per_qp = 1 << mod_param_profile.rdmarc_per_qp; - profile->num_cq = 1 << mod_param_profile.num_cq; - profile->num_mcg = 1 << mod_param_profile.num_mcg; - profile->num_mpt = 1 << mod_param_profile.num_mpt; - /* - * We want to scale the number of MTTs with the size of the - * system memory, since it makes sense to register a lot of - * memory on a system with a lot of memory. As a heuristic, - * make sure we have enough MTTs to register twice the system - * memory (with PAGE_SIZE entries). - * - * This number has to be a power of two and fit into 32 bits - * due to device limitations. We cap this at 2^30 as of bit map - * limitation to work with int instead of uint (mlx4_buddy_init -> bitmap_zero) - * That limits us to 4TB of memory registration per HCA with - * 4KB pages, which is probably OK for the next few months. + /* The reserved_uars is calculated by system page size unit. 
+ * Therefore, adjustment is added when the uar page size is less + * than the system page size */ - if (mod_param_profile.num_mtt_segs) - profile->num_mtt_segs = 1 << mod_param_profile.num_mtt_segs; - else { - profile->num_mtt_segs = - roundup_pow_of_two(max_t(unsigned, - 1 << (MLX4_LOG_NUM_MTT - log_mtts_per_seg), - min(1UL << - (MLX4_MAX_LOG_NUM_MTT - - log_mtts_per_seg), - (hwphyssz << 1) - >> log_mtts_per_seg))); - /* set the actual value, so it will be reflected to the user - using the sysfs */ - mod_param_profile.num_mtt_segs = ilog2(profile->num_mtt_segs); - } + dev->caps.reserved_uars = + max_t(int, + mlx4_get_num_reserved_uar(dev), + dev_cap->reserved_uars / + (1 << (PAGE_SHIFT - dev->uar_page_shift))); } int mlx4_check_port_params(struct mlx4_dev *dev, @@ -562,11 +194,10 @@ int mlx4_check_port_params(struct mlx4_dev *dev, { int i; - for (i = 0; i < dev->caps.num_ports - 1; i++) { - if (port_type[i] != port_type[i + 1]) { - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { - mlx4_err(dev, "Only same port types supported " - "on this HCA, aborting.\n"); + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + for (i = 0; i < dev->caps.num_ports - 1; i++) { + if (port_type[i] != port_type[i + 1]) { + mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); return -EINVAL; } } @@ -574,8 +205,8 @@ int mlx4_check_port_params(struct mlx4_dev *dev, for (i = 0; i < dev->caps.num_ports; i++) { if (!(port_type[i] & dev->caps.supported_type[i+1])) { - mlx4_err(dev, "Requested port type for port %d is not " - "supported on this HCA\n", i + 1); + mlx4_err(dev, "Requested port type for port %d is not supported on this HCA\n", + i + 1); return -EINVAL; } } @@ -613,6 +244,101 @@ static int mlx4_query_func(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return err; } +static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev) +{ + struct mlx4_caps *dev_cap = &dev->caps; + + /* FW not supporting or cancelled by user */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) || + !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) + return; + + /* Must have 64B CQE_EQE enabled by FW to use bigger stride + * When FW has NCSI it may decide not to report 64B CQE/EQEs + */ + if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) || + !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) { + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + return; + } + + if (cache_line_size() == 128 || cache_line_size() == 256) { + mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n"); + /* Changing the real data inside CQE size to 32B */ + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + + if (mlx4_is_master(dev)) + dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE; + } else { + if (cache_line_size() != 32 && cache_line_size() != 64) + mlx4_dbg(dev, "Disabling CQE stride, cacheLine size unsupported\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } +} + +static int _mlx4_dev_port(struct mlx4_dev *dev, int port, + struct mlx4_port_cap *port_cap) +{ + dev->caps.vl_cap[port] = port_cap->max_vl; + dev->caps.ib_mtu_cap[port] = port_cap->ib_mtu; + dev->phys_caps.gid_phys_table_len[port] = port_cap->max_gids; + dev->phys_caps.pkey_phys_table_len[port] = port_cap->max_pkeys; + /* set gid and pkey table operating lengths by default + * to non-sriov values + */ + dev->caps.gid_table_len[port] = port_cap->max_gids; + 
dev->caps.pkey_table_len[port] = port_cap->max_pkeys; + dev->caps.port_width_cap[port] = port_cap->max_port_width; + dev->caps.eth_mtu_cap[port] = port_cap->eth_mtu; + dev->caps.max_tc_eth = port_cap->max_tc_eth; + dev->caps.def_mac[port] = port_cap->def_mac; + dev->caps.supported_type[port] = port_cap->supported_port_types; + dev->caps.suggested_type[port] = port_cap->suggested_type; + dev->caps.default_sense[port] = port_cap->default_sense; + dev->caps.trans_type[port] = port_cap->trans_type; + dev->caps.vendor_oui[port] = port_cap->vendor_oui; + dev->caps.wavelength[port] = port_cap->wavelength; + dev->caps.trans_code[port] = port_cap->trans_code; + + return 0; +} + +static int mlx4_dev_port(struct mlx4_dev *dev, int port, + struct mlx4_port_cap *port_cap) +{ + int err = 0; + + err = mlx4_QUERY_PORT(dev, port, port_cap); + + if (err) + mlx4_err(dev, "QUERY_PORT command failed.\n"); + + return err; +} + +static inline void mlx4_enable_ignore_fcs(struct mlx4_dev *dev) +{ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_IGNORE_FCS)) + return; + + if (mlx4_is_mfunc(dev)) { + mlx4_dbg(dev, "SRIOV mode - Disabling Ignore FCS"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + return; + } + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)) { + mlx4_dbg(dev, + "Keep FCS is not supported - Disabling Ignore FCS"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + return; + } +} + +#define MLX4_A0_STEERING_TABLE_SIZE 256 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; @@ -620,55 +346,41 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) err = mlx4_QUERY_DEV_CAP(dev, dev_cap); if (err) { - mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); return err; } + mlx4_dev_cap_dump(dev, dev_cap); if (dev_cap->min_page_sz > PAGE_SIZE) { - mlx4_err(dev, "HCA minimum page size of %d bigger than " - "kernel PAGE_SIZE of %d, aborting.\n", - dev_cap->min_page_sz, (int)PAGE_SIZE); + mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", + dev_cap->min_page_sz, (long)PAGE_SIZE); return -ENODEV; } if (dev_cap->num_ports > MLX4_MAX_PORTS) { - mlx4_err(dev, "HCA has %d ports, but we only support %d, " - "aborting.\n", + mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", dev_cap->num_ports, MLX4_MAX_PORTS); return -ENODEV; } - if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) { - mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than " - "PCI resource 2 size of 0x%llx, aborting.\n", + if (dev_cap->uar_size > pci_resource_len(dev->persist->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", dev_cap->uar_size, - (unsigned long long) pci_resource_len(dev->pdev, 2)); + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); return -ENODEV; } dev->caps.num_ports = dev_cap->num_ports; - dev->caps.num_sys_eqs = dev_cap->num_sys_eqs; - dev->phys_caps.num_phys_eqs = dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS ? - dev->caps.num_sys_eqs : - MLX4_MAX_EQ_NUM; + dev->caps.num_sys_eqs = dev_cap->num_sys_eqs; + dev->phys_caps.num_phys_eqs = dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS ? 
+ dev->caps.num_sys_eqs : + MLX4_MAX_EQ_NUM; for (i = 1; i <= dev->caps.num_ports; ++i) { - dev->caps.vl_cap[i] = dev_cap->max_vl[i]; - dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; - dev->phys_caps.gid_phys_table_len[i] = dev_cap->max_gids[i]; - dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i]; - /* set gid and pkey table operating lengths by default - * to non-sriov values */ - dev->caps.gid_table_len[i] = dev_cap->max_gids[i]; - dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i]; - dev->caps.port_width_cap[i] = dev_cap->max_port_width[i]; - dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i]; - dev->caps.def_mac[i] = dev_cap->def_mac[i]; - dev->caps.supported_type[i] = dev_cap->supported_port_types[i]; - dev->caps.suggested_type[i] = dev_cap->suggested_type[i]; - dev->caps.default_sense[i] = dev_cap->default_sense[i]; - dev->caps.trans_type[i] = dev_cap->trans_type[i]; - dev->caps.vendor_oui[i] = dev_cap->vendor_oui[i]; - dev->caps.wavelength[i] = dev_cap->wavelength[i]; - dev->caps.trans_code[i] = dev_cap->trans_code[i]; + err = _mlx4_dev_port(dev, i, dev_cap->port_cap + i); + if (err) { + mlx4_err(dev, "QUERY_PORT command failed, aborting\n"); + return err; + } } dev->caps.uar_page_size = PAGE_SIZE; @@ -687,7 +399,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; /* * Subtract 1 from the limit because we need to allocate a - * spare CQE to enable resizing the CQ + * spare CQE so the HCA HW can tell the difference between an + * empty CQ and a full CQ. */ dev->caps.max_cqes = dev_cap->max_cq_sz - 1; dev->caps.reserved_cqs = dev_cap->reserved_cqs; @@ -695,8 +408,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_mtts = dev_cap->reserved_mtts; dev->caps.reserved_mrws = dev_cap->reserved_mrws; - /* The first 128 UARs are used for EQ doorbells */ - dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); dev->caps.reserved_pds = dev_cap->reserved_pds; dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? dev_cap->reserved_xrcds : 0; @@ -711,10 +422,37 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.bmme_flags = dev_cap->bmme_flags; dev->caps.reserved_lkey = dev_cap->reserved_lkey; dev->caps.stat_rate_support = dev_cap->stat_rate_support; - dev->caps.cq_timestamp = dev_cap->timestamp_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + /* Save uar page shift */ + if (!mlx4_is_slave(dev)) { + /* Virtual PCI function needs to determine UAR page size from + * firmware. Only master PCI function can set the uar page size + */ + if (enable_4k_uar) + dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT; + else + dev->uar_page_shift = PAGE_SHIFT; + + mlx4_set_num_reserved_uars(dev, dev_cap); + } + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) { + struct mlx4_init_hca_param hca_param; + + memset(&hca_param, 0, sizeof(hca_param)); + err = mlx4_QUERY_HCA(dev, &hca_param); + /* Turn off PHV_EN flag in case phv_check_en is set. + * phv_check_en is a HW check that parse the packet and verify + * phv bit was reported correctly in the wqe. To allow QinQ + * PHV_EN flag should be set and phv_check_en must be cleared + * otherwise QinQ packets will be drop by the HW. 
+ */ + if (err || hca_param.phv_check_en) + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_PHV_EN; + } + /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; @@ -722,12 +460,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (mlx4_is_mfunc(dev)) dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; - dev->caps.log_num_macs = log_num_mac; - dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; - - dev->caps.fast_drop = fast_drop ? - !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FAST_DROP) : - 0; + if (mlx4_low_memory_profile()) { + dev->caps.log_num_macs = MLX4_MIN_LOG_NUM_MAC; + dev->caps.log_num_vlans = MLX4_MIN_LOG_NUM_VLANS; + } else { + dev->caps.log_num_macs = log_num_mac; + dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; + } for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; @@ -740,26 +479,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_PORT_TYPE_IB) dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; else { - /* - * if IB and ETH are supported, we set the port + /* if IB and ETH are supported, we set the port * type according to user selection of port type; - * if there is no user selection, take the FW hint - */ - int pta; - mlx4_get_val(port_type_array.dbdf2val.tbl, - pci_physfn(dev->pdev), i - 1, - &pta); - if (pta == MLX4_PORT_TYPE_NONE) { + * if user selected none, take the FW hint */ + if (port_type_array[i - 1] == MLX4_PORT_TYPE_NONE) dev->caps.port_type[i] = dev->caps.suggested_type[i] ? MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB; - } else if (pta == MLX4_PORT_TYPE_NA) { - mlx4_err(dev, "Port %d is valid port. " - "It is not allowed to configure its type to N/A(%d)\n", - i, MLX4_PORT_TYPE_NA); - return -EINVAL; - } else { - dev->caps.port_type[i] = pta; - } + else + dev->caps.port_type[i] = port_type_array[i - 1]; } } /* @@ -773,9 +500,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)); - /* Disablling auto sense for default Eth ports support */ - mlx4_priv(dev)->sense.sense_allowed[i] = 0; - /* * If "default_sense" bit is set, we move the port to "AUTO" mode * and perform sense_port FW command to try and set the correct @@ -791,27 +515,27 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.possible_type[i] = dev->caps.port_type[i]; } - if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) { - dev->caps.log_num_macs = dev_cap->log_max_macs[i]; - mlx4_warn(dev, "Requested number of MACs is too much " - "for port %d, reducing to %d.\n", + if (dev->caps.log_num_macs > dev_cap->port_cap[i].log_max_macs) { + dev->caps.log_num_macs = dev_cap->port_cap[i].log_max_macs; + mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n", i, 1 << dev->caps.log_num_macs); } - if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) { - dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; - mlx4_warn(dev, "Requested number of VLANs is too much " - "for port %d, reducing to %d.\n", + if (dev->caps.log_num_vlans > dev_cap->port_cap[i].log_max_vlans) { + dev->caps.log_num_vlans = dev_cap->port_cap[i].log_max_vlans; + mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n", i, 1 << dev->caps.log_num_vlans); } } - dev->caps.max_basic_counters = 
dev_cap->max_basic_counters; - dev->caps.max_extended_counters = dev_cap->max_extended_counters; - /* support extended counters if available */ - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) - dev->caps.max_counters = dev->caps.max_extended_counters; - else - dev->caps.max_counters = dev->caps.max_basic_counters; + if (mlx4_is_master(dev) && (dev->caps.num_ports == 2) && + (port_type_array[0] == MLX4_PORT_TYPE_IB) && + (port_type_array[1] == MLX4_PORT_TYPE_ETH)) { + mlx4_warn(dev, + "Granular QoS per VF not supported with IB/Eth configuration\n"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_QOS_VPP; + } + + dev->caps.max_counters = dev_cap->max_counters; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = @@ -821,24 +545,52 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.num_ports; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; + if (dev_cap->dmfs_high_rate_qpn_base > 0 && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) + dev->caps.dmfs_high_rate_qpn_base = dev_cap->dmfs_high_rate_qpn_base; + else + dev->caps.dmfs_high_rate_qpn_base = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + + if (dev_cap->dmfs_high_rate_qpn_range > 0 && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) { + dev->caps.dmfs_high_rate_qpn_range = dev_cap->dmfs_high_rate_qpn_range; + dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DEFAULT; + dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_FS_A0; + } else { + dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_NOT_SUPPORTED; + dev->caps.dmfs_high_rate_qpn_base = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE; + } + + dev->caps.rl_caps = dev_cap->rl_caps; + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] = + dev->caps.dmfs_high_rate_qpn_range; + dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; - dev->caps.sync_qp = dev_cap->sync_qp; - if (dev->pdev->device == 0x1003) - dev->caps.cq_flags |= MLX4_DEV_CAP_CQ_FLAG_IO; - dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0; - if (!mlx4_enable_64b_cqe_eqe && !mlx4_is_slave(dev)) { + if (!enable_64b_cqe_eqe && !mlx4_is_slave(dev)) { if (dev_cap->flags & (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; } + + if (dev_cap->flags2 & + (MLX4_DEV_CAP_FLAG2_CQE_STRIDE | + MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) { + mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } } if ((dev->caps.flags & @@ -847,17 +599,106 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; if (!mlx4_is_slave(dev)) { - for (i = 0; i < dev->caps.num_ports; ++i) - dev->caps.def_counter_index[i] = i << 1; - + mlx4_enable_cqe_eqe_stride(dev); dev->caps.alloc_res_qp_mask = - (dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0); + (dev->caps.bf_reg_size ? 
MLX4_RESERVE_ETH_BF_QP : 0) | + MLX4_RESERVE_A0_QP; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) && + dev->caps.flags & MLX4_DEV_CAP_FLAG_SET_ETH_SCHED) { + mlx4_warn(dev, "Old device ETS support detected\n"); + mlx4_warn(dev, "Consider upgrading device FW.\n"); + dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG; + } + } else { dev->caps.alloc_res_qp_mask = 0; } + mlx4_enable_ignore_fcs(dev); + return 0; } + +static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u32 lnkcap1, lnkcap2; + int err1, err2; + +#define PCIE_MLW_CAP_SHIFT 4 /* start of MLW mask in link capabilities */ + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + err1 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP, + &lnkcap1); + err2 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP2, + &lnkcap2); + if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + *speed = PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + if (!err1) { + *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT; + if (!lnkcap2) { /* pre-r3.0 */ + if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + } + + if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) { + return err1 ? err1 : + err2 ? err2 : -EINVAL; + } + return 0; +} + +static void mlx4_check_pcie_caps(struct mlx4_dev *dev) +{ + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + int err; + +#define PCIE_SPEED_STR(speed) \ + (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \ + speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \ + speed == PCIE_SPEED_2_5GT ? 
"2.5GT/s" : \ + "Unknown") + + err = mlx4_get_pcie_dev_link_caps(dev, &speed_cap, &width_cap); + if (err) { + mlx4_warn(dev, + "Unable to determine PCIe device BW capabilities\n"); + return; + } + + err = pcie_get_minimum_link(dev->persist->pdev, &speed, &width); + if (err || speed == PCI_SPEED_UNKNOWN || + width == PCIE_LNK_WIDTH_UNKNOWN) { + mlx4_warn(dev, + "Unable to determine PCI device chain minimum BW\n"); + return; + } + + if (width != width_cap || speed != speed_cap) + mlx4_warn(dev, + "PCIe BW is different than device's capability\n"); + + mlx4_info(dev, "PCIe link speed is %s, device supports %s\n", + PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap)); + mlx4_info(dev, "PCIe link width is x%d, device supports x%d\n", + width, width_cap); + return; +} + /*The function checks if there are live vf, return the num of them*/ static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) { @@ -947,9 +788,11 @@ static void slave_adjust_steering_mode(struct mlx4_dev *dev, struct mlx4_init_hca_param *hca_param) { dev->caps.steering_mode = hca_param->steering_mode; - if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; - else + dev->caps.fs_log_max_ucast_qp_range_size = + dev_cap->fs_log_max_ucast_qp_range_size; + } else dev->caps.num_qp_per_mgm = 4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2); @@ -964,18 +807,19 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) struct mlx4_dev_cap dev_cap; struct mlx4_func_cap func_cap; struct mlx4_init_hca_param hca_param; - int i; + u8 i; memset(&hca_param, 0, sizeof(hca_param)); err = mlx4_QUERY_HCA(dev, &hca_param); if (err) { - mlx4_err(dev, "QUERY_HCA command failed, aborting.\n"); + mlx4_err(dev, "QUERY_HCA command failed, aborting\n"); return err; } - /*fail if the hca has an unknown capability */ - if ((hca_param.global_caps | HCA_GLOBAL_CAP_MASK) != - HCA_GLOBAL_CAP_MASK) { + /* fail if the hca has an unknown global capability + * at this time global_caps should be always zeroed + */ + if (hca_param.global_caps) { mlx4_err(dev, "Unknown hca global capabilities\n"); return -ENOSYS; } @@ -988,49 +832,53 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; err = mlx4_dev_cap(dev, &dev_cap); if (err) { - mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); return err; } err = mlx4_QUERY_FW(dev); if (err) - mlx4_err(dev, "QUERY_FW command failed: could not get FW version.\n"); - - if (!hca_param.mw_enable) { - dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW; - dev->caps.bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; - } + mlx4_err(dev, "QUERY_FW command failed: could not get FW version\n"); page_size = ~dev->caps.page_size_cap + 1; mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); if (page_size > PAGE_SIZE) { - mlx4_err(dev, "HCA minimum page size of %d bigger than " - "kernel PAGE_SIZE of %d, aborting.\n", - page_size, (int)PAGE_SIZE); + mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", + page_size, (long)PAGE_SIZE); return -ENODEV; } - /* slave gets uar page size from QUERY_HCA fw command */ - dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12); + /* Set uar_page_shift for VF */ + dev->uar_page_shift = hca_param.uar_page_sz + 12; - /* TODO: relax this assumption */ - if (dev->caps.uar_page_size != PAGE_SIZE) { - mlx4_err(dev, "UAR 
size:%d != kernel PAGE_SIZE of %d\n", - dev->caps.uar_page_size, (int)PAGE_SIZE); - return -ENODEV; + /* Make sure the master uar page size is valid */ + if (dev->uar_page_shift > PAGE_SHIFT) { + mlx4_err(dev, + "Invalid configuration: uar page size is larger than system page size\n"); + return -ENODEV; } + /* Set reserved_uars based on the uar_page_shift */ + mlx4_set_num_reserved_uars(dev, &dev_cap); + + /* Although uar page size in FW differs from system page size, + * upper software layers (mlx4_ib, mlx4_en and part of mlx4_core) + * still works with assumption that uar page size == system page size + */ + dev->caps.uar_page_size = PAGE_SIZE; + memset(&func_cap, 0, sizeof(func_cap)); err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap); if (err) { - mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d).\n", - err); + mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d)\n", + err); return err; } if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != PF_CONTEXT_BEHAVIOUR_MASK) { - mlx4_err(dev, "Unknown pf context behaviour\n"); + mlx4_err(dev, "Unknown pf context behaviour %x known flags %x\n", + func_cap.pf_context_behaviour, PF_CONTEXT_BEHAVIOUR_MASK); return -ENOSYS; } @@ -1046,41 +894,46 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) dev->caps.num_mpts = 1 << hca_param.log_mpt_sz; dev->caps.num_eqs = func_cap.max_eq; dev->caps.reserved_eqs = func_cap.reserved_eq; + dev->caps.reserved_lkey = func_cap.reserved_lkey; dev->caps.num_pds = MLX4_NUM_PDS; dev->caps.num_mgms = 0; dev->caps.num_amgms = 0; if (dev->caps.num_ports > MLX4_MAX_PORTS) { - mlx4_err(dev, "HCA has %d ports, but we only support %d, " - "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); + mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", + dev->caps.num_ports, MLX4_MAX_PORTS); return -ENODEV; } + mlx4_replace_zero_macs(dev); + + dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || - !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy || + !dev->caps.qp0_qkey) { err = -ENOMEM; goto err_mem; } for (i = 1; i <= dev->caps.num_ports; ++i) { - err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap); + err = mlx4_QUERY_FUNC_CAP(dev, i, &func_cap); if (err) { - mlx4_err(dev, "QUERY_FUNC_CAP port command failed for" - " port %d, aborting (%d).\n", i, err); + mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n", + i, err); goto err_mem; } + dev->caps.qp0_qkey[i - 1] = func_cap.qp0_qkey; dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn; - dev->caps.def_counter_index[i - 1] = func_cap.def_counter_index; - dev->caps.port_mask[i] = dev->caps.port_type[i]; + dev->caps.phys_port_id[i] = func_cap.phys_port_id; err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, &dev->caps.gid_table_len[i], &dev->caps.pkey_table_len[i]); @@ -1090,11 +943,12 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (dev->caps.uar_page_size * (dev->caps.num_uars - dev->caps.reserved_uars) > - 
pci_resource_len(dev->pdev, 2)) { - mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than " - "PCI resource 2 size of 0x%llx, aborting.\n", + pci_resource_len(dev->persist->pdev, + 2)) { + mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", dev->caps.uar_page_size * dev->caps.num_uars, - (unsigned long long) pci_resource_len(dev->pdev, 2)); + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); err = -ENOMEM; goto err_mem; } @@ -1109,29 +963,49 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { dev->caps.cqe_size = 64; - dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { + dev->caps.eqe_size = hca_param.eqe_size; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { + dev->caps.cqe_size = hca_param.cqe_size; + /* User still need to know when CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; - mlx4_warn(dev, "Timestamping is not supported in slave mode.\n"); + mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); slave_adjust_steering_mode(dev, &dev_cap, &hca_param); + mlx4_dbg(dev, "RSS support for IP fragments is %s\n", + hca_param.rss_ip_frags ? "on" : "off"); if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP && dev->caps.bf_reg_size) dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP; + if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP) + dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_A0_QP; + return 0; err_mem: + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); - dev->caps.qp0_tunnel = dev->caps.qp0_proxy = - dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + dev->caps.qp0_qkey = NULL; + dev->caps.qp0_tunnel = NULL; + dev->caps.qp0_proxy = NULL; + dev->caps.qp1_tunnel = NULL; + dev->caps.qp1_proxy = NULL; return err; } @@ -1151,10 +1025,10 @@ static void mlx4_request_modules(struct mlx4_dev *dev) has_eth_port = true; } - if (has_ib_port) - request_module_nowait(IB_DRV_NAME); if (has_eth_port) request_module_nowait(EN_DRV_NAME); + if (has_ib_port || (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + request_module_nowait(IB_DRV_NAME); } /* @@ -1181,8 +1055,8 @@ int mlx4_change_port_types(struct mlx4_dev *dev, dev->caps.port_type[port] = port_types[port - 1]; err = mlx4_SET_PORT(dev, port, -1); if (err) { - mlx4_err(dev, "Failed to set port %d, " - "aborting\n", port); + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); goto out; } } @@ -1219,12 +1093,9 @@ static ssize_t show_port_type(struct device *dev, return strlen(buf); } -static ssize_t set_port_type(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static int __set_port_type(struct mlx4_port_info *info, + enum mlx4_port_type port_type) { - struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, - port_attr); struct mlx4_dev *mdev = info->dev; struct mlx4_priv *priv = mlx4_priv(mdev); enum mlx4_port_type types[MLX4_MAX_PORTS]; @@ -1232,26 +1103,18 @@ static ssize_t set_port_type(struct device *dev, int i; int err = 0; - if (!strcmp(buf, "ib\n")) - info->tmp_type = MLX4_PORT_TYPE_IB; - else if (!strcmp(buf, "eth\n")) - 
info->tmp_type = MLX4_PORT_TYPE_ETH; - else if (!strcmp(buf, "auto\n")) - info->tmp_type = MLX4_PORT_TYPE_AUTO; - else { - mlx4_err(mdev, "%s is not supported port type\n", buf); - return -EINVAL; - } - - if ((info->tmp_type & mdev->caps.supported_type[info->port]) != - info->tmp_type) { - mlx4_err(mdev, "Requested port type for port %d is not supported on this HCA\n", + if ((port_type & mdev->caps.supported_type[info->port]) != port_type) { + mlx4_err(mdev, + "Requested port type for port %d is not supported on this HCA\n", info->port); - return -EINVAL; + err = -EINVAL; + goto err_sup; } mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); + info->tmp_type = port_type; + /* Possible type is always the one that was delivered */ mdev->caps.possible_type[info->port] = info->tmp_type; @@ -1272,9 +1135,7 @@ static ssize_t set_port_type(struct device *dev, } } if (err) { - mlx4_err(mdev, "Auto sensing is not supported on this HCA. " - "Set only 'eth' or 'ib' for both ports " - "(should be the same)\n"); + mlx4_err(mdev, "Auto sensing is not supported on this HCA. Set only 'eth' or 'ib' for both ports (should be the same)\n"); goto out; } @@ -1293,8 +1154,42 @@ static ssize_t set_port_type(struct device *dev, err = mlx4_change_port_types(mdev, new_types); out: - mlx4_start_sense(mdev); mutex_unlock(&priv->port_mutex); + mlx4_start_sense(mdev); +err_sup: + return err; +} + +static ssize_t set_port_type(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_attr); + struct mlx4_dev *mdev = info->dev; + enum mlx4_port_type port_type; + static DEFINE_MUTEX(set_port_type_mutex); + int err; + + mutex_lock(&set_port_type_mutex); + + if (!strcmp(buf, "ib\n")) { + port_type = MLX4_PORT_TYPE_IB; + } else if (!strcmp(buf, "eth\n")) { + port_type = MLX4_PORT_TYPE_ETH; + } else if (!strcmp(buf, "auto\n")) { + port_type = MLX4_PORT_TYPE_AUTO; + } else { + mlx4_err(mdev, "%s is not supported port type\n", buf); + err = -EINVAL; + goto err_out; + } + + err = __set_port_type(info, port_type); + +err_out: + mutex_unlock(&set_port_type_mutex); + return err ? err : count; } @@ -1330,43 +1225,6 @@ static inline int ibta_mtu_to_int(enum ibta_mtu mtu) } } -static ssize_t -show_board(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info, - board_attr); - struct mlx4_dev *mdev = info->dev; - - return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, - mdev->board_id); -} - -static ssize_t -show_hca(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info, - hca_attr); - struct mlx4_dev *mdev = info->dev; - - return sprintf(buf, "MT%d\n", mdev->pdev->device); -} - -static ssize_t -show_firmware_version(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct mlx4_hca_info *info = container_of(attr, struct mlx4_hca_info, - firmware_attr); - struct mlx4_dev *mdev = info->dev; - - return sprintf(buf, "%d.%d.%d\n", (int)(mdev->caps.fw_ver >> 32), - (int)(mdev->caps.fw_ver >> 16) & 0xffff, - (int)mdev->caps.fw_ver & 0xffff); -} - static ssize_t show_port_ib_mtu(struct device *dev, struct device_attribute *attr, char *buf) @@ -1375,9 +1233,8 @@ static ssize_t show_port_ib_mtu(struct device *dev, port_mtu_attr); struct mlx4_dev *mdev = info->dev; - /* When port type is eth, port mtu value isn't used. 
*/ if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) - return -EINVAL; + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); sprintf(buf, "%d\n", ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port])); @@ -1399,10 +1256,11 @@ static ssize_t set_port_ib_mtu(struct device *dev, return -EINVAL; } - mtu = (int) simple_strtol(buf, NULL, 0); - ibta_mtu = int_to_ibta_mtu(mtu); + err = kstrtoint(buf, 0, &mtu); + if (!err) + ibta_mtu = int_to_ibta_mtu(mtu); - if (ibta_mtu < 0) { + if (err || ibta_mtu < 0) { mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf); return -EINVAL; } @@ -1416,8 +1274,8 @@ static ssize_t set_port_ib_mtu(struct device *dev, mlx4_CLOSE_PORT(mdev, port); err = mlx4_SET_PORT(mdev, port, -1); if (err) { - mlx4_err(mdev, "Failed to set port %d, " - "aborting\n", port); + mlx4_err(mdev, "Failed to set port %d, aborting\n", + port); goto err_set_port; } } @@ -1428,40 +1286,217 @@ static ssize_t set_port_ib_mtu(struct device *dev, return err ? err : count; } +/* bond for multi-function device */ +#define MAX_MF_BOND_ALLOWED_SLAVES 63 +static int mlx4_mf_bond(struct mlx4_dev *dev) +{ + int err = 0; + int nvfs; + struct mlx4_slaves_pport slaves_port1; + struct mlx4_slaves_pport slaves_port2; + DECLARE_BITMAP(slaves_port_1_2, MLX4_MFUNC_MAX); + + slaves_port1 = mlx4_phys_to_slaves_pport(dev, 1); + slaves_port2 = mlx4_phys_to_slaves_pport(dev, 2); + bitmap_and(slaves_port_1_2, + slaves_port1.slaves, slaves_port2.slaves, + dev->persist->num_vfs + 1); + + /* only single port vfs are allowed */ + if (bitmap_weight(slaves_port_1_2, dev->persist->num_vfs + 1) > 1) { + mlx4_warn(dev, "HA mode unsupported for dual ported VFs\n"); + return -EINVAL; + } + + /* number of virtual functions is number of total functions minus one + * physical function for each port. 
+ */ + nvfs = bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) + + bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1) - 2; + + /* limit on maximum allowed VFs */ + if (nvfs > MAX_MF_BOND_ALLOWED_SLAVES) { + mlx4_warn(dev, "HA mode is not supported for %d VFs (max %d are allowed)\n", + nvfs, MAX_MF_BOND_ALLOWED_SLAVES); + return -EINVAL; + } + + if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "HA mode unsupported for NON DMFS steering\n"); + return -EINVAL; + } + + err = mlx4_bond_mac_table(dev); + if (err) + return err; + err = mlx4_bond_vlan_table(dev); + if (err) + goto err1; + err = mlx4_bond_fs_rules(dev); + if (err) + goto err2; + + return 0; +err2: + (void)mlx4_unbond_vlan_table(dev); +err1: + (void)mlx4_unbond_mac_table(dev); + return err; +} + +static int mlx4_mf_unbond(struct mlx4_dev *dev) +{ + int ret, ret1; + + ret = mlx4_unbond_fs_rules(dev); + if (ret) + mlx4_warn(dev, "multifunction unbond for flow rules failedi (%d)\n", ret); + ret1 = mlx4_unbond_mac_table(dev); + if (ret1) { + mlx4_warn(dev, "multifunction unbond for MAC table failed (%d)\n", ret1); + ret = ret1; + } + ret1 = mlx4_unbond_vlan_table(dev); + if (ret1) { + mlx4_warn(dev, "multifunction unbond for VLAN table failed (%d)\n", ret1); + ret = ret1; + } + return ret; +} + +int mlx4_bond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (!mlx4_is_bonded(dev)) { + ret = mlx4_do_bond(dev, true); + if (ret) + mlx4_err(dev, "Failed to bond device: %d\n", ret); + if (!ret && mlx4_is_master(dev)) { + ret = mlx4_mf_bond(dev); + if (ret) { + mlx4_err(dev, "bond for multifunction failed\n"); + mlx4_do_bond(dev, false); + } + } + } + + mutex_unlock(&priv->bond_mutex); + if (!ret) + mlx4_dbg(dev, "Device is bonded\n"); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_bond); + +int mlx4_unbond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (mlx4_is_bonded(dev)) { + int ret2 = 0; + + ret = mlx4_do_bond(dev, false); + if (ret) + mlx4_err(dev, "Failed to unbond device: %d\n", ret); + if (mlx4_is_master(dev)) + ret2 = mlx4_mf_unbond(dev); + if (ret2) { + mlx4_warn(dev, "Failed to unbond device for multifunction (%d)\n", ret2); + ret = ret2; + } + } + + mutex_unlock(&priv->bond_mutex); + if (!ret) + mlx4_dbg(dev, "Device is unbonded\n"); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_unbond); + + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p) +{ + u8 port1 = v2p->port1; + u8 port2 = v2p->port2; + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -ENOTSUPP; + + mutex_lock(&priv->bond_mutex); + + /* zero means keep current mapping for this port */ + if (port1 == 0) + port1 = priv->v2p.port1; + if (port2 == 0) + port2 = priv->v2p.port2; + + if ((port1 < 1) || (port1 > MLX4_MAX_PORTS) || + (port2 < 1) || (port2 > MLX4_MAX_PORTS) || + (port1 == 2 && port2 == 1)) { + /* besides boundary checks cross mapping makes + * no sense and therefore not allowed */ + err = -EINVAL; + } else if ((port1 == priv->v2p.port1) && + (port2 == priv->v2p.port2)) { + err = 0; + } else { + err = mlx4_virt2phy_port_map(dev, port1, port2); + if (!err) { + mlx4_dbg(dev, "port map changed: [%d][%d]\n", + port1, port2); + priv->v2p.port1 = port1; + priv->v2p.port2 = port2; + } else { + mlx4_err(dev, "Failed to change port mape: %d\n", err); + } + } + + 
mutex_unlock(&priv->bond_mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_port_map_set); + static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int err, unmap_flag = 0; + int err; priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.fw_icm) { - mlx4_err(dev, "Couldn't allocate FW area, aborting.\n"); + mlx4_err(dev, "Couldn't allocate FW area, aborting\n"); return -ENOMEM; } err = mlx4_MAP_FA(dev, priv->fw.fw_icm); if (err) { - mlx4_err(dev, "MAP_FA command failed, aborting.\n"); + mlx4_err(dev, "MAP_FA command failed, aborting\n"); goto err_free; } err = mlx4_RUN_FW(dev); if (err) { - mlx4_err(dev, "RUN_FW command failed, aborting.\n"); + mlx4_err(dev, "RUN_FW command failed, aborting\n"); goto err_unmap_fa; } return 0; err_unmap_fa: - unmap_flag = mlx4_UNMAP_FA(dev); - if (unmap_flag) - pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); + mlx4_UNMAP_FA(dev); err_free: - if (!unmap_flag) - mlx4_free_icm(dev, priv->fw.fw_icm, 0); + mlx4_free_icm(dev, priv->fw.fw_icm, 0); return err; } @@ -1530,34 +1565,34 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; int num_eqs; - int err, unmap_flag = 0; + int err; err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); if (err) { - mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n"); + mlx4_err(dev, "SET_ICM_SIZE command failed, aborting\n"); return err; } - mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n", + mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory\n", (unsigned long long) icm_size >> 10, (unsigned long long) aux_pages << 2); priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.aux_icm) { - mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n"); + mlx4_err(dev, "Couldn't allocate aux memory, aborting\n"); return -ENOMEM; } err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm); if (err) { - mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n"); + mlx4_err(dev, "MAP_ICM_AUX command failed, aborting\n"); goto err_free_aux; } err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz); if (err) { - mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n"); + mlx4_err(dev, "Failed to map cMPT context memory, aborting\n"); goto err_unmap_aux; } @@ -1567,7 +1602,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, init_hca->eqc_base, dev_cap->eqc_entry_sz, num_eqs, num_eqs, 0, 0); if (err) { - mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); + mlx4_err(dev, "Failed to map EQ context memory, aborting\n"); goto err_unmap_cmpt; } @@ -1588,7 +1623,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.num_mtts, dev->caps.reserved_mtts, 1, 0); if (err) { - mlx4_err(dev, "Failed to map MTT context memory, aborting.\n"); + mlx4_err(dev, "Failed to map MTT context memory, aborting\n"); goto err_unmap_eq; } @@ -1598,7 +1633,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.num_mpts, dev->caps.reserved_mrws, 1, 1); if (err) { - mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n"); + mlx4_err(dev, "Failed to map dMPT context memory, aborting\n"); goto err_unmap_mtt; } @@ -1609,7 +1644,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { - mlx4_err(dev, "Failed to 
map QP context memory, aborting.\n"); + mlx4_err(dev, "Failed to map QP context memory, aborting\n"); goto err_unmap_dmpt; } @@ -1620,7 +1655,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { - mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n"); + mlx4_err(dev, "Failed to map AUXC context memory, aborting\n"); goto err_unmap_qp; } @@ -1631,7 +1666,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { - mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n"); + mlx4_err(dev, "Failed to map ALTC context memory, aborting\n"); goto err_unmap_auxc; } @@ -1652,7 +1687,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.num_cqs, dev->caps.reserved_cqs, 0, 0); if (err) { - mlx4_err(dev, "Failed to map CQ context memory, aborting.\n"); + mlx4_err(dev, "Failed to map CQ context memory, aborting\n"); goto err_unmap_rdmarc; } @@ -1662,7 +1697,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.num_srqs, dev->caps.reserved_srqs, 0, 0); if (err) { - mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n"); + mlx4_err(dev, "Failed to map SRQ context memory, aborting\n"); goto err_unmap_cq; } @@ -1680,7 +1715,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, dev->caps.num_mgms + dev->caps.num_amgms, 0, 0); if (err) { - mlx4_err(dev, "Failed to map MCG context memory, aborting.\n"); + mlx4_err(dev, "Failed to map MCG context memory, aborting\n"); goto err_unmap_srq; } @@ -1720,13 +1755,10 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); err_unmap_aux: - unmap_flag = mlx4_UNMAP_ICM_AUX(dev); - if (unmap_flag) - pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n"); + mlx4_UNMAP_ICM_AUX(dev); err_free_aux: - if (!unmap_flag) - mlx4_free_icm(dev, priv->fw.aux_icm, 0); + mlx4_free_icm(dev, priv->fw.aux_icm, 0); return err; } @@ -1750,10 +1782,8 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); - if (!mlx4_UNMAP_ICM_AUX(dev)) - mlx4_free_icm(dev, priv->fw.aux_icm, 0); - else - pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n"); + mlx4_UNMAP_ICM_AUX(dev); + mlx4_free_icm(dev, priv->fw.aux_icm, 0); } static void mlx4_slave_exit(struct mlx4_dev *dev) @@ -1761,8 +1791,9 @@ static void mlx4_slave_exit(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); mutex_lock(&priv->cmd.slave_cmd_mutex); - if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) - mlx4_warn(dev, "Failed to close slave function.\n"); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP, + MLX4_COMM_TIME)) + mlx4_warn(dev, "Failed to close slave function\n"); mutex_unlock(&priv->cmd.slave_cmd_mutex); } @@ -1776,9 +1807,9 @@ static int map_bf_area(struct mlx4_dev *dev) if (!dev->caps.bf_reg_size) return -ENXIO; - bf_start = pci_resource_start(dev->pdev, 2) + + bf_start = pci_resource_start(dev->persist->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); - bf_len = pci_resource_len(dev->pdev, 2) - + bf_len = pci_resource_len(dev->persist->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) @@ -1822,9 +1853,10 @@ static int 
map_internal_clock(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - priv->clock_mapping = ioremap(pci_resource_start(dev->pdev, - priv->fw.clock_bar) + - priv->fw.clock_offset, MLX4_CLOCK_SIZE); + priv->clock_mapping = + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clock_bar) + + priv->fw.clock_offset, MLX4_CLOCK_SIZE); if (!priv->clock_mapping) return -ENOMEM; @@ -1832,7 +1864,6 @@ static int map_internal_clock(struct mlx4_dev *dev) return 0; } - int mlx4_get_internal_clock_params(struct mlx4_dev *dev, struct mlx4_clock_params *params) { @@ -1840,6 +1871,7 @@ int mlx4_get_internal_clock_params(struct mlx4_dev *dev, if (mlx4_is_slave(dev)) return -ENOTSUPP; + if (!params) return -EINVAL; @@ -1863,49 +1895,97 @@ static void mlx4_close_hca(struct mlx4_dev *dev) { unmap_internal_clock(dev); unmap_bf_area(dev); - if (mlx4_is_slave(dev)) { + if (mlx4_is_slave(dev)) mlx4_slave_exit(dev); - } else { + else { mlx4_CLOSE_HCA(dev, 0); mlx4_free_icms(dev); - - if (!mlx4_UNMAP_FA(dev)) - mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); - else - pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); } } +static void mlx4_close_fw(struct mlx4_dev *dev) +{ + if (!mlx4_is_slave(dev)) { + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + } +} + +static int mlx4_comm_check_offline(struct mlx4_dev *dev) +{ +#define COMM_CHAN_OFFLINE_OFFSET 0x09 + + u32 comm_flags; + u32 offline_bit; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + offline_bit = (comm_flags & + (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); + if (!offline_bit) + return 0; + /* There are cases as part of AER/Reset flow that PF needs + * around 100 msec to load. We therefore sleep for 100 msec + * to allow other tasks to make use of that CPU during this + * time interval. + */ + msleep(100); + } + mlx4_err(dev, "Communication channel is offline.\n"); + return -EIO; +} + +static void mlx4_reset_vf_support(struct mlx4_dev *dev) +{ +#define COMM_CHAN_RST_OFFSET 0x1e + + struct mlx4_priv *priv = mlx4_priv(dev); + u32 comm_rst; + u32 comm_caps; + + comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_CAPS)); + comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET)); + + if (comm_rst) + dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET; +} + static int mlx4_init_slave(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); u64 dma = (u64) priv->mfunc.vhcr_dma; - int num_of_reset_retries = NUM_OF_RESET_RETRIES; int ret_from_reset = 0; u32 slave_read; u32 cmd_channel_ver; + if (atomic_read(&pf_loading)) { + mlx4_warn(dev, "PF is not ready - Deferring probe\n"); + return -EAGAIN; + } + mutex_lock(&priv->cmd.slave_cmd_mutex); priv->cmd.max_cmds = 1; + if (mlx4_comm_check_offline(dev)) { + mlx4_err(dev, "PF is not responsive, skipping initialization\n"); + goto err_offline; + } + + mlx4_reset_vf_support(dev); mlx4_warn(dev, "Sending reset\n"); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, - MLX4_COMM_TIME); + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME); /* if we are in the middle of flr the slave will try * NUM_OF_RESET_RETRIES times before leaving.*/ if (ret_from_reset) { if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { - msleep(SLEEP_TIME_IN_RESET); - while (ret_from_reset && num_of_reset_retries) { - mlx4_warn(dev, "slave is currently in the" - "middle of FLR. retrying..." 
- "(try num:%d)\n", - (NUM_OF_RESET_RETRIES - - num_of_reset_retries + 1)); - ret_from_reset = - mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, - 0, MLX4_COMM_TIME); - num_of_reset_retries = num_of_reset_retries - 1; - } + mlx4_warn(dev, "slave is currently in the middle of FLR - Deferring probe\n"); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return -EAGAIN; } else goto err; } @@ -1917,29 +1997,30 @@ static int mlx4_init_slave(struct mlx4_dev *dev) if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != MLX4_COMM_GET_IF_REV(slave_read)) { - mlx4_err(dev, "slave driver version is not supported" - " by the master\n"); + mlx4_err(dev, "slave driver version is not supported by the master\n"); goto err; } mlx4_warn(dev, "Sending vhcr0\n"); if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, - MLX4_COMM_TIME)) + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, - MLX4_COMM_TIME)) + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, - MLX4_COMM_TIME)) + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) goto err; - if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) goto err; mutex_unlock(&priv->cmd.slave_cmd_mutex); return 0; err: - mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP, 0); +err_offline: mutex_unlock(&priv->cmd.slave_cmd_mutex); return -EIO; } @@ -1951,7 +2032,7 @@ static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) for (i = 1; i <= dev->caps.num_ports; i++) { if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) dev->caps.gid_table_len[i] = - mlx4_get_slave_num_gids(dev, 0); + mlx4_get_slave_num_gids(dev, 0, i); else dev->caps.gid_table_len[i] = 1; dev->caps.pkey_table_len[i] = @@ -1972,29 +2053,62 @@ static int choose_log_fs_mgm_entry_size(int qp_per_entry) return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? 
i : -1; } +static const char *dmfs_high_rate_steering_mode_str(int dmfs_high_steer_mode) +{ + switch (dmfs_high_steer_mode) { + case MLX4_STEERING_DMFS_A0_DEFAULT: + return "default performance"; + + case MLX4_STEERING_DMFS_A0_DYNAMIC: + return "dynamic hybrid mode"; + + case MLX4_STEERING_DMFS_A0_STATIC: + return "performance optimized for limited rule configuration (static)"; + + case MLX4_STEERING_DMFS_A0_DISABLE: + return "disabled performance optimized steering"; + + case MLX4_STEERING_DMFS_A0_NOT_SUPPORTED: + return "performance optimized steering not supported"; + + default: + return "Unrecognized mode"; + } +} + +#define MLX4_DMFS_A0_STEERING (1UL << 2) + static void choose_steering_mode(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { - int nvfs; - - mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(dev->pdev), 0, &nvfs); - if (high_rate_steer && !mlx4_is_mfunc(dev)) { - dev->caps.flags &= ~(MLX4_DEV_CAP_FLAG_VEP_MC_STEER | - MLX4_DEV_CAP_FLAG_VEP_UC_STEER); - dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_FS_EN; + if (mlx4_log_num_mgm_entry_size <= 0) { + if ((-mlx4_log_num_mgm_entry_size) & MLX4_DMFS_A0_STEERING) { + if (dev->caps.dmfs_high_steer_mode == + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + mlx4_err(dev, "DMFS high rate mode not supported\n"); + else + dev->caps.dmfs_high_steer_mode = + MLX4_STEERING_DMFS_A0_STATIC; + } } - if (mlx4_log_num_mgm_entry_size == -1 && + if (mlx4_log_num_mgm_entry_size <= 0 && dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && (!mlx4_is_mfunc(dev) || - (dev_cap->fs_max_num_qp_per_entry >= (nvfs + 1))) && + (dev_cap->fs_max_num_qp_per_entry >= + (dev->persist->num_vfs + 1))) && choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= MLX4_MIN_MGM_LOG_ENTRY_SIZE) { dev->oper_log_mgm_entry_size = choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry); dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + dev->caps.fs_log_max_ucast_qp_range_size = + dev_cap->fs_log_max_ucast_qp_range_size; } else { + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DISABLE; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER && dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) dev->caps.steering_mode = MLX4_STEERING_MODE_B0; @@ -2003,8 +2117,7 @@ static void choose_steering_mode(struct mlx4_dev *dev, if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER || dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) - mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags " - "set to use B0 steering. Falling back to A0 steering mode.\n"); + mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags set to use B0 steering - falling back to A0 steering mode\n"); } dev->oper_log_mgm_entry_size = mlx4_log_num_mgm_entry_size > 0 ? 
@@ -2012,36 +2125,72 @@ static void choose_steering_mode(struct mlx4_dev *dev, MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); } - mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, " - "log_num_mgm_entry_size = %d\n", + mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, modparam log_num_mgm_entry_size = %d\n", mlx4_steering_mode_str(dev->caps.steering_mode), - dev->oper_log_mgm_entry_size, mlx4_log_num_mgm_entry_size); + dev->oper_log_mgm_entry_size, + mlx4_log_num_mgm_entry_size); } -static int mlx4_init_hca(struct mlx4_dev *dev) +static void choose_tunnel_offload_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && + dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) + dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN; + else + dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE; + + mlx4_dbg(dev, "Tunneling offload mode is: %s\n", (dev->caps.tunnel_offload_mode + == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none"); +} + +static int mlx4_validate_optimized_steering(struct mlx4_dev *dev) +{ + int i; + struct mlx4_port_cap port_cap; + + if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + return -EINVAL; + + for (i = 1; i <= dev->caps.num_ports; i++) { + if (mlx4_dev_port(dev, i, &port_cap)) { + mlx4_err(dev, + "QUERY_DEV_CAP command failed, can't veify DMFS high rate steering.\n"); + } else if ((dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_DEFAULT) && + (port_cap.dmfs_optimized_state == + !!(dev->caps.dmfs_high_steer_mode == + MLX4_STEERING_DMFS_A0_DISABLE))) { + mlx4_err(dev, + "DMFS high rate steer mode differ, driver requested %s but %s in FW.\n", + dmfs_high_rate_steering_mode_str( + dev->caps.dmfs_high_steer_mode), + (port_cap.dmfs_optimized_state ? 
+ "enabled" : "disabled")); + } + } + + return 0; +} + +static int mlx4_init_fw(struct mlx4_dev *dev) { - struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_dev_cap *dev_cap = NULL; - struct mlx4_adapter adapter; struct mlx4_mod_stat_cfg mlx4_cfg; - struct mlx4_profile profile; - struct mlx4_init_hca_param init_hca; - u64 icm_size; - int err; + int err = 0; if (!mlx4_is_slave(dev)) { err = mlx4_QUERY_FW(dev); if (err) { if (err == -EACCES) - mlx4_info(dev, "non-primary physical function, skipping.\n"); + mlx4_info(dev, "non-primary physical function, skipping\n"); else - mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); + mlx4_err(dev, "QUERY_FW command failed, aborting\n"); return err; } err = mlx4_load_fw(dev); if (err) { - mlx4_err(dev, "Failed to start FW, aborting.\n"); + mlx4_err(dev, "Failed to start FW, aborting\n"); return err; } @@ -2050,74 +2199,107 @@ static int mlx4_init_hca(struct mlx4_dev *dev) err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); if (err) mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); + } - dev_cap = kzalloc(sizeof *dev_cap, GFP_KERNEL); - if (!dev_cap) { - mlx4_err(dev, "Failed to allocate memory for dev_cap\n"); - err = -ENOMEM; - goto err_stop_fw; - } + return err; +} - err = mlx4_dev_cap(dev, dev_cap); +static int mlx4_init_hca(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_adapter adapter; + struct mlx4_dev_cap dev_cap; + struct mlx4_profile profile; + struct mlx4_init_hca_param init_hca; + u64 icm_size; + struct mlx4_config_dev_params params; + int err; + + if (!mlx4_is_slave(dev)) { + err = mlx4_dev_cap(dev, &dev_cap); if (err) { - mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); - goto err_stop_fw; + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + return err; } - choose_steering_mode(dev, dev_cap); + choose_steering_mode(dev, &dev_cap); + choose_tunnel_offload_mode(dev, &dev_cap); + + if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC && + mlx4_is_master(dev)) + dev->caps.function_caps |= MLX4_FUNC_CAP_DMFS_A0_STATIC; + + err = mlx4_get_phys_port_id(dev); + if (err) + mlx4_err(dev, "Fail to get physical port id\n"); if (mlx4_is_master(dev)) mlx4_parav_master_pf_caps(dev); - process_mod_param_profile(&profile); + if (mlx4_low_memory_profile()) { + mlx4_info(dev, "Running from within kdump kernel. 
Using low memory profile\n"); + profile = low_mem_profile; + } else { + profile = default_profile; + } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) profile.num_mcg = MLX4_FS_NUM_MCG; - icm_size = mlx4_make_profile(dev, &profile, dev_cap, + icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca); if ((long long) icm_size < 0) { err = icm_size; - goto err_stop_fw; + return err; } dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; - init_hca.log_uar_sz = ilog2(dev->caps.num_uars); - init_hca.uar_page_sz = PAGE_SHIFT - 12; + if (enable_4k_uar) { + init_hca.log_uar_sz = ilog2(dev->caps.num_uars) + + PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT; + init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12; + } else { + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + init_hca.uar_page_sz = PAGE_SHIFT - 12; + } - err = mlx4_init_icm(dev, dev_cap, &init_hca, icm_size); + init_hca.mw_enabled = 0; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) + init_hca.mw_enabled = INIT_HCA_TPT_MW_ENABLE; + + err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); if (err) - goto err_stop_fw; - - init_hca.mw_enable = 1; + return err; err = mlx4_INIT_HCA(dev, &init_hca); if (err) { - mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); + mlx4_err(dev, "INIT_HCA command failed, aborting\n"); goto err_free_icm; } - if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) { - err = mlx4_query_func(dev, dev_cap); + if (dev_cap.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) { + err = mlx4_query_func(dev, &dev_cap); if (err < 0) { mlx4_err(dev, "QUERY_FUNC command failed, aborting.\n"); - goto err_stop_fw; + goto err_close; } else if (err & MLX4_QUERY_FUNC_NUM_SYS_EQS) { - dev->caps.num_eqs = dev_cap->max_eqs; - dev->caps.reserved_eqs = dev_cap->reserved_eqs; - dev->caps.reserved_uars = dev_cap->reserved_uars; + dev->caps.num_eqs = dev_cap.max_eqs; + dev->caps.reserved_eqs = dev_cap.reserved_eqs; + dev->caps.reserved_uars = dev_cap.reserved_uars; } } /* - * Read HCA frequency by QUERY_HCA command + * If TS is supported by FW + * read HCA frequency by QUERY_HCA command */ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { memset(&init_hca, 0, sizeof(init_hca)); err = mlx4_QUERY_HCA(dev, &init_hca); if (err) { - mlx4_err(dev, "QUERY_HCA command failed, disable timestamp.\n"); + mlx4_err(dev, "QUERY_HCA command failed, disable timestamp\n"); dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; } else { dev->caps.hca_core_clock = @@ -2129,19 +2311,40 @@ static int mlx4_init_hca(struct mlx4_dev *dev) */ if (!dev->caps.hca_core_clock) { dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; - mlx4_err(dev, "HCA frequency is 0. Timestamping is not supported."); + mlx4_err(dev, + "HCA frequency is 0 - timestamping is not supported\n"); } else if (map_internal_clock(dev)) { - /* Map internal clock, + /* + * Map internal clock, * in case of failure disable timestamping */ dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; - mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported.\n"); + mlx4_err(dev, "Failed to map internal clock. 
Timestamping is not supported\n"); } } + + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) { + if (mlx4_validate_optimized_steering(dev)) + mlx4_warn(dev, "Optimized steering validation failed\n"); + + if (dev->caps.dmfs_high_steer_mode == + MLX4_STEERING_DMFS_A0_DISABLE) { + dev->caps.dmfs_high_rate_qpn_base = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + dev->caps.dmfs_high_rate_qpn_range = + MLX4_A0_STEERING_TABLE_SIZE; + } + + mlx4_dbg(dev, "DMFS high rate steer mode is: %s\n", + dmfs_high_rate_steering_mode_str( + dev->caps.dmfs_high_steer_mode)); + } } else { err = mlx4_init_slave(dev); if (err) { - mlx4_err(dev, "Failed to initialize slave\n"); + if (err != -EAGAIN) + mlx4_err(dev, "Failed to initialize slave\n"); return err; } @@ -2155,32 +2358,35 @@ static int mlx4_init_hca(struct mlx4_dev *dev) if (map_bf_area(dev)) mlx4_dbg(dev, "Failed to map blue flame area\n"); - /* Only the master set the ports, all the rest got it from it.*/ + /*Only the master set the ports, all the rest got it from it.*/ if (!mlx4_is_slave(dev)) mlx4_set_port_mask(dev); err = mlx4_QUERY_ADAPTER(dev, &adapter); if (err) { - mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n"); + mlx4_err(dev, "QUERY_ADAPTER command failed, aborting\n"); goto unmap_bf; } + /* Query CONFIG_DEV parameters */ + err = mlx4_config_dev_retrieval(dev, ¶ms); + if (err && err != -ENOTSUPP) { + mlx4_err(dev, "Failed to query CONFIG_DEV parameters\n"); + } else if (!err) { + dev->caps.rx_checksum_flags_port[1] = params.rx_csum_flags_port_1; + dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2; + } priv->eq_table.inta_pin = adapter.inta_pin; memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); - memcpy(dev->vsd, adapter.vsd, sizeof(dev->vsd)); - dev->vsd_vendor_id = adapter.vsd_vendor_id; - - if (!mlx4_is_slave(dev)) - kfree(dev_cap); return 0; unmap_bf: - if (!mlx4_is_slave(dev)) - unmap_internal_clock(dev); + unmap_internal_clock(dev); unmap_bf_area(dev); if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); @@ -2197,447 +2403,128 @@ static int mlx4_init_hca(struct mlx4_dev *dev) if (!mlx4_is_slave(dev)) mlx4_free_icms(dev); -err_stop_fw: - if (!mlx4_is_slave(dev)) { - if (!mlx4_UNMAP_FA(dev)) - mlx4_free_icm(dev, priv->fw.fw_icm, 0); - else - pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); - kfree(dev_cap); - } return err; } static int mlx4_init_counters_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int nent_pow2, port_indx, vf_index, num_counters; - int res, index = 0; - struct counter_index *new_counter_index; - + int nent_pow2; if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; - if (!mlx4_is_slave(dev) && - dev->caps.max_counters == dev->caps.max_extended_counters) { - res = mlx4_cmd(dev, MLX4_IF_STATE_EXTENDED, 0, 0, - MLX4_CMD_SET_IF_STAT, - MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); - if (res) { - mlx4_err(dev, "Failed to set extended counters (err=%d)\n", res); - return res; - } - } - - mutex_init(&priv->counters_table.mutex); - - if (mlx4_is_slave(dev)) { - for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { - INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]); - if (dev->caps.def_counter_index[port_indx] != 0xFF) { - new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); - if (!new_counter_index) - return -ENOMEM; - new_counter_index->index = dev->caps.def_counter_index[port_indx]; - 
list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port_indx]); - } - } - mlx4_dbg(dev, "%s: slave allocated %d counters for %d ports\n", - __func__, dev->caps.num_ports, dev->caps.num_ports); - return 0; - } + if (!dev->caps.max_counters) + return -ENOSPC; nent_pow2 = roundup_pow_of_two(dev->caps.max_counters); - - for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { - INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]); - /* allocating 2 counters per port for PFs */ - /* For the PF, the ETH default counters are 0,2; */ - /* and the RoCE default counters are 1,3 */ - for (num_counters = 0; num_counters < 2; num_counters++, index++) { - new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); - if (!new_counter_index) - return -ENOMEM; - new_counter_index->index = index; - list_add_tail(&new_counter_index->list, - &priv->counters_table.global_port_list[port_indx]); - } - } - - if (mlx4_is_master(dev)) { - for (vf_index = 0; vf_index < dev->num_vfs; vf_index++) { - for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { - INIT_LIST_HEAD(&priv->counters_table.vf_list[vf_index][port_indx]); - new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); - if (!new_counter_index) - return -ENOMEM; - if (index < nent_pow2 - 2) { - new_counter_index->index = index; - index++; - } else { - new_counter_index->index = MLX4_SINK_COUNTER_INDEX; - } - - list_add_tail(&new_counter_index->list, - &priv->counters_table.vf_list[vf_index][port_indx]); - } - } - - res = mlx4_bitmap_init(&priv->counters_table.bitmap, - nent_pow2, nent_pow2 - 1, - index, 1); - mlx4_dbg(dev, "%s: master allocated %d counters for %d VFs\n", - __func__, index, dev->num_vfs); - } else { - res = mlx4_bitmap_init(&priv->counters_table.bitmap, - nent_pow2, nent_pow2 - 1, - index, 1); - mlx4_dbg(dev, "%s: native allocated %d counters for %d ports\n", - __func__, index, dev->caps.num_ports); - } - - return 0; - + /* reserve last counter index for sink counter */ + return mlx4_bitmap_init(&priv->counters_bitmap, nent_pow2, + nent_pow2 - 1, 0, + nent_pow2 - dev->caps.max_counters + 1); } static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) { - struct mlx4_priv *priv = mlx4_priv(dev); - int i, j; - struct counter_index *port, *tmp_port; - struct counter_index *vf, *tmp_vf; + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return; - mutex_lock(&priv->counters_table.mutex); + if (!dev->caps.max_counters) + return; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) { - for (i = 0; i < dev->caps.num_ports; i++) { - list_for_each_entry_safe(port, tmp_port, - &priv->counters_table.global_port_list[i], - list) { - list_del(&port->list); - kfree(port); - } - } - if (!mlx4_is_slave(dev)) { - for (i = 0; i < dev->num_vfs; i++) { - for (j = 0; j < dev->caps.num_ports; j++) { - list_for_each_entry_safe(vf, tmp_vf, - &priv->counters_table.vf_list[i][j], - list) { - /* clear the counter statistic */ - if (__mlx4_clear_if_stat(dev, vf->index)) - mlx4_dbg(dev, "%s: reset counter %d failed\n", - __func__, vf->index); - list_del(&vf->list); - kfree(vf); - } - } - } - mlx4_bitmap_cleanup(&priv->counters_table.bitmap); - } - } - mutex_unlock(&priv->counters_table.mutex); + mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap); } -int __mlx4_slave_counters_free(struct mlx4_dev *dev, int slave) +static void mlx4_cleanup_default_counters(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int i, first; - struct counter_index *vf, *tmp_vf; 
+ int port; - /* clean VF's counters for the next useg */ - if (slave > 0 && slave <= dev->num_vfs) { - mlx4_dbg(dev, "%s: free counters of slave(%d)\n" - , __func__, slave); - - mutex_lock(&priv->counters_table.mutex); - for (i = 0; i < dev->caps.num_ports; i++) { - first = 0; - list_for_each_entry_safe(vf, tmp_vf, - &priv->counters_table.vf_list[slave - 1][i], - list) { - /* clear the counter statistic */ - if (__mlx4_clear_if_stat(dev, vf->index)) - mlx4_dbg(dev, "%s: reset counter %d failed\n", - __func__, vf->index); - if (first++ && vf->index != MLX4_SINK_COUNTER_INDEX) { - mlx4_dbg(dev, "%s: delete counter index %d for slave %d and port %d\n" - , __func__, vf->index, slave, i + 1); - mlx4_bitmap_free(&priv->counters_table.bitmap, vf->index, MLX4_USE_RR); - list_del(&vf->list); - kfree(vf); - } else { - mlx4_dbg(dev, "%s: can't delete default counter index %d for slave %d and port %d\n" - , __func__, vf->index, slave, i + 1); - } - } - } - mutex_unlock(&priv->counters_table.mutex); - } - - return 0; + for (port = 0; port < dev->caps.num_ports; port++) + if (priv->def_counter[port] != -1) + mlx4_counter_free(dev, priv->def_counter[port]); } -int __mlx4_counter_alloc(struct mlx4_dev *dev, int slave, int port, u32 *idx) +static int mlx4_allocate_default_counters(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port, err = 0; + u32 idx; + + for (port = 0; port < dev->caps.num_ports; port++) + priv->def_counter[port] = -1; + + for (port = 0; port < dev->caps.num_ports; port++) { + err = mlx4_counter_alloc(dev, &idx); + + if (!err || err == -ENOSPC) { + priv->def_counter[port] = idx; + } else if (err == -ENOENT) { + err = 0; + continue; + } else if (mlx4_is_slave(dev) && err == -EINVAL) { + priv->def_counter[port] = MLX4_SINK_COUNTER_INDEX(dev); + mlx4_warn(dev, "can't allocate counter from old PF driver, using index %d\n", + MLX4_SINK_COUNTER_INDEX(dev)); + err = 0; + } else { + mlx4_err(dev, "%s: failed to allocate default counter port %d err %d\n", + __func__, port + 1, err); + mlx4_cleanup_default_counters(dev); + return err; + } + + mlx4_dbg(dev, "%s: default counter index %d for port %d\n", + __func__, priv->def_counter[port], port + 1); + } + + return err; +} + +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) { struct mlx4_priv *priv = mlx4_priv(dev); - struct counter_index *new_counter_index; if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; - if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) || - (port < 0) || (port > MLX4_MAX_PORTS)) { - mlx4_dbg(dev, "%s: invalid slave(%d) or port(%d) index\n", - __func__, slave, port); - return -EINVAL; - } - - /* handle old guest request does not support request by port index */ - if (port == 0) { - *idx = MLX4_SINK_COUNTER_INDEX; - mlx4_dbg(dev, "%s: allocated default counter index %d for slave %d port %d\n" - , __func__, *idx, slave, port); - return 0; - } - - mutex_lock(&priv->counters_table.mutex); - - *idx = mlx4_bitmap_alloc(&priv->counters_table.bitmap); - /* if no resources return the default counter of the slave and port */ + *idx = mlx4_bitmap_alloc(&priv->counters_bitmap); if (*idx == -1) { - if (slave == 0) { /* its the ethernet counter ?????? 
*/ - new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next, - struct counter_index, - list); - } else { - new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next, - struct counter_index, - list); - } - - *idx = new_counter_index->index; - mlx4_dbg(dev, "%s: allocated defualt counter index %d for slave %d port %d\n" - , __func__, *idx, slave, port); - goto out; + *idx = MLX4_SINK_COUNTER_INDEX(dev); + return -ENOSPC; } - if (slave == 0) { /* native or master */ - new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); - if (!new_counter_index) - goto no_mem; - new_counter_index->index = *idx; - list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]); - } else { - new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); - if (!new_counter_index) - goto no_mem; - new_counter_index->index = *idx; - list_add_tail(&new_counter_index->list, &priv->counters_table.vf_list[slave - 1][port - 1]); - } - - mlx4_dbg(dev, "%s: allocated counter index %d for slave %d port %d\n" - , __func__, *idx, slave, port); -out: - mutex_unlock(&priv->counters_table.mutex); return 0; - -no_mem: - mlx4_bitmap_free(&priv->counters_table.bitmap, *idx, MLX4_USE_RR); - mutex_unlock(&priv->counters_table.mutex); - *idx = MLX4_SINK_COUNTER_INDEX; - mlx4_dbg(dev, "%s: failed err (%d)\n" - , __func__, -ENOMEM); - return -ENOMEM; } -int mlx4_counter_alloc(struct mlx4_dev *dev, u8 port, u32 *idx) +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) { u64 out_param; int err; - struct mlx4_priv *priv = mlx4_priv(dev); - struct counter_index *new_counter_index, *c_index; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, - ((u32) port) << 8 | (u32) RES_COUNTER, + err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); - if (!err) { + if (!err) *idx = get_param_l(&out_param); - if (*idx == MLX4_SINK_COUNTER_INDEX) - return -ENOSPC; - mutex_lock(&priv->counters_table.mutex); - c_index = list_entry(priv->counters_table.global_port_list[port - 1].next, - struct counter_index, - list); - mutex_unlock(&priv->counters_table.mutex); - if (c_index->index == *idx) - return -EEXIST; - - if (mlx4_is_slave(dev)) { - new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); - if (!new_counter_index) { - mlx4_counter_free(dev, port, *idx); - return -ENOMEM; - } - new_counter_index->index = *idx; - mutex_lock(&priv->counters_table.mutex); - list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]); - mutex_unlock(&priv->counters_table.mutex); - mlx4_dbg(dev, "%s: allocated counter index %d for port %d\n" - , __func__, *idx, port); - } - } return err; } - return __mlx4_counter_alloc(dev, 0, port, idx); + return __mlx4_counter_alloc(dev, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_alloc); -void __mlx4_counter_free(struct mlx4_dev *dev, int slave, int port, u32 idx) +static int __mlx4_clear_if_stat(struct mlx4_dev *dev, + u8 counter_index) { - /* check if native or slave and deletes accordingly */ - struct mlx4_priv *priv = mlx4_priv(dev); - struct counter_index *pf, *tmp_pf; - struct counter_index *vf, *tmp_vf; - int first; - - - if (idx == MLX4_SINK_COUNTER_INDEX) { - mlx4_dbg(dev, "%s: try to delete default counter index %d for port %d\n" - , __func__, idx, port); - return; - } - - if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) || - (port < 0) || (port > MLX4_MAX_PORTS)) { - mlx4_warn(dev, "%s: 
deletion failed due to invalid slave(%d) or port(%d) index\n" - , __func__, slave, idx); - return; - } - - mutex_lock(&priv->counters_table.mutex); - if (slave == 0) { - first = 0; - list_for_each_entry_safe(pf, tmp_pf, - &priv->counters_table.global_port_list[port - 1], - list) { - /* the first 2 counters are reserved */ - if (pf->index == idx) { - /* clear the counter statistic */ - if (__mlx4_clear_if_stat(dev, pf->index)) - mlx4_dbg(dev, "%s: reset counter %d failed\n", - __func__, pf->index); - if (1 < first && idx != MLX4_SINK_COUNTER_INDEX) { - list_del(&pf->list); - kfree(pf); - mlx4_dbg(dev, "%s: delete counter index %d for native device (%d) port %d\n" - , __func__, idx, slave, port); - mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR); - goto out; - } else { - mlx4_dbg(dev, "%s: can't delete default counter index %d for native device (%d) port %d\n" - , __func__, idx, slave, port); - goto out; - } - } - first++; - } - mlx4_dbg(dev, "%s: can't delete counter index %d for native device (%d) port %d\n" - , __func__, idx, slave, port); - } else { - first = 0; - list_for_each_entry_safe(vf, tmp_vf, - &priv->counters_table.vf_list[slave - 1][port - 1], - list) { - /* the first element is reserved */ - if (vf->index == idx) { - /* clear the counter statistic */ - if (__mlx4_clear_if_stat(dev, vf->index)) - mlx4_dbg(dev, "%s: reset counter %d failed\n", - __func__, vf->index); - if (first) { - list_del(&vf->list); - kfree(vf); - mlx4_dbg(dev, "%s: delete counter index %d for slave %d port %d\n", - __func__, idx, slave, port); - mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR); - goto out; - } else { - mlx4_dbg(dev, "%s: can't delete default slave (%d) counter index %d for port %d\n" - , __func__, slave, idx, port); - goto out; - } - } - first++; - } - mlx4_dbg(dev, "%s: can't delete slave (%d) counter index %d for port %d\n" - , __func__, slave, idx, port); - } - -out: - mutex_unlock(&priv->counters_table.mutex); -} - -void mlx4_counter_free(struct mlx4_dev *dev, u8 port, u32 idx) -{ - u64 in_param = 0; - struct mlx4_priv *priv = mlx4_priv(dev); - struct counter_index *counter, *tmp_counter; - int first = 0; - - if (mlx4_is_mfunc(dev)) { - set_param_l(&in_param, idx); - mlx4_cmd(dev, in_param, - ((u32) port) << 8 | (u32) RES_COUNTER, - RES_OP_RESERVE, - MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, - MLX4_CMD_WRAPPED); - - if (mlx4_is_slave(dev) && idx != MLX4_SINK_COUNTER_INDEX) { - mutex_lock(&priv->counters_table.mutex); - list_for_each_entry_safe(counter, tmp_counter, - &priv->counters_table.global_port_list[port - 1], - list) { - if (counter->index == idx && first++) { - list_del(&counter->list); - kfree(counter); - mlx4_dbg(dev, "%s: delete counter index %d for port %d\n" - , __func__, idx, port); - mutex_unlock(&priv->counters_table.mutex); - return; - } - } - mutex_unlock(&priv->counters_table.mutex); - } - - return; - } - __mlx4_counter_free(dev, 0, port, idx); -} -EXPORT_SYMBOL_GPL(mlx4_counter_free); - -int __mlx4_clear_if_stat(struct mlx4_dev *dev, - u8 counter_index) -{ - struct mlx4_cmd_mailbox *if_stat_mailbox = NULL; - int err = 0; - u32 if_stat_in_mod = (counter_index & 0xff) | (1 << 31); - - if (counter_index == MLX4_SINK_COUNTER_INDEX) - return -EINVAL; - - if (mlx4_is_slave(dev)) - return 0; + struct mlx4_cmd_mailbox *if_stat_mailbox; + int err; + u32 if_stat_in_mod = (counter_index & 0xff) | MLX4_QUERY_IF_STAT_RESET; if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(if_stat_mailbox)) { - err = PTR_ERR(if_stat_mailbox); - return 
err; - } + if (IS_ERR(if_stat_mailbox)) + return PTR_ERR(if_stat_mailbox); err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, @@ -2647,102 +2534,73 @@ int __mlx4_clear_if_stat(struct mlx4_dev *dev, return err; } -u8 mlx4_get_default_counter_index(struct mlx4_dev *dev, int slave, int port) +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx) { - struct mlx4_priv *priv = mlx4_priv(dev); - struct counter_index *new_counter_index; + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return; - if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) { - mlx4_dbg(dev, "%s: return counter index %d for slave %d port (MLX4_PORT_TYPE_IB) %d\n", - __func__, MLX4_SINK_COUNTER_INDEX, slave, port); - return (u8)MLX4_SINK_COUNTER_INDEX; - } + if (idx == MLX4_SINK_COUNTER_INDEX(dev)) + return; - mutex_lock(&priv->counters_table.mutex); - if (slave == 0) { - new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next, - struct counter_index, - list); - } else { - new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next, - struct counter_index, - list); - } - mutex_unlock(&priv->counters_table.mutex); + __mlx4_clear_if_stat(dev, idx); - mlx4_dbg(dev, "%s: return counter index %d for slave %d port %d\n", - __func__, new_counter_index->index, slave, port); - - - return (u8)new_counter_index->index; + mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx, MLX4_USE_RR); + return; } -int mlx4_get_vport_ethtool_stats(struct mlx4_dev *dev, int port, - struct mlx4_en_vport_stats *vport_stats, - int reset) +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, idx); + mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + return; + } + __mlx4_counter_free(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_free); + +int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port) { struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_cmd_mailbox *if_stat_mailbox = NULL; - union mlx4_counter *counter; - int err = 0; - u32 if_stat_in_mod; - struct counter_index *vport, *tmp_vport; - if (!vport_stats) - return -EINVAL; - - if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(if_stat_mailbox)) { - err = PTR_ERR(if_stat_mailbox); - return err; - } - - mutex_lock(&priv->counters_table.mutex); - list_for_each_entry_safe(vport, tmp_vport, - &priv->counters_table.global_port_list[port - 1], - list) { - if (vport->index == MLX4_SINK_COUNTER_INDEX) - continue; - - memset(if_stat_mailbox->buf, 0, sizeof(union mlx4_counter)); - if_stat_in_mod = (vport->index & 0xff) | ((reset & 1) << 31); - err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, - if_stat_in_mod, 0, - MLX4_CMD_QUERY_IF_STAT, - MLX4_CMD_TIME_CLASS_C, - MLX4_CMD_NATIVE); - if (err) { - mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n", - __func__, vport->index); - goto if_stat_out; - } - counter = (union mlx4_counter *)if_stat_mailbox->buf; - if ((counter->control.cnt_mode & 0xf) == 1) { - vport_stats->rx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastFrames); - vport_stats->rx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxUnicastFrames); - vport_stats->rx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxMulticastFrames); - vport_stats->tx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastFrames); - 
vport_stats->tx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxUnicastFrames); - vport_stats->tx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxMulticastFrames); - vport_stats->rx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastOctets); - vport_stats->rx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxUnicastOctets); - vport_stats->rx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxMulticastOctets); - vport_stats->tx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastOctets); - vport_stats->tx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxUnicastOctets); - vport_stats->tx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxMulticastOctets); - vport_stats->rx_errors += be64_to_cpu(counter->ext.counters[0].IfRxErrorFrames); - vport_stats->rx_dropped += be64_to_cpu(counter->ext.counters[0].IfRxNoBufferFrames); - vport_stats->tx_errors += be64_to_cpu(counter->ext.counters[0].IfTxDroppedFrames); - } - } - -if_stat_out: - mutex_unlock(&priv->counters_table.mutex); - mlx4_free_cmd_mailbox(dev, if_stat_mailbox); - - return err; + return priv->def_counter[port - 1]; +} +EXPORT_SYMBOL_GPL(mlx4_get_default_counter_index); + +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} +EXPORT_SYMBOL_GPL(mlx4_set_admin_guid); + +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->mfunc.master.vf_admin[entry].vport[port].guid; +} +EXPORT_SYMBOL_GPL(mlx4_get_admin_guid); + +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 guid; + + /* hw GUID */ + if (entry == 0) + return; + + get_random_bytes((char *)&guid, sizeof(guid)); + guid &= ~(cpu_to_be64(1ULL << 56)); + guid |= cpu_to_be64(1ULL << 57); + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; } -EXPORT_SYMBOL_GPL(mlx4_get_vport_ethtool_stats); static int mlx4_setup_hca(struct mlx4_dev *dev) { @@ -2753,84 +2611,75 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_init_uar_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "user access region table (err=%d), aborting.\n", - err); + mlx4_err(dev, "Failed to initialize user access region table, aborting\n"); return err; } err = mlx4_uar_alloc(dev, &priv->driver_uar); if (err) { - mlx4_err(dev, "Failed to allocate driver access region " - "(err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to allocate driver access region, aborting\n"); goto err_uar_table_free; } priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!priv->kar) { - mlx4_err(dev, "Couldn't map kernel access region, " - "aborting.\n"); + mlx4_err(dev, "Couldn't map kernel access region, aborting\n"); err = -ENOMEM; goto err_uar_free; } err = mlx4_init_pd_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "protection domain table (err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to initialize protection domain table, aborting\n"); goto err_kar_unmap; } err = mlx4_init_xrcd_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "reliable connection domain table (err=%d), " - "aborting.\n", err); + mlx4_err(dev, "Failed to initialize reliable connection domain table, aborting\n"); goto err_pd_table_free; } err = mlx4_init_mr_table(dev); if (err) { - 
mlx4_err(dev, "Failed to initialize " - "memory region table (err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to initialize memory region table, aborting\n"); goto err_xrcd_table_free; } if (!mlx4_is_slave(dev)) { err = mlx4_init_mcg_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "multicast group table (err=%d), aborting.\n", - err); + mlx4_err(dev, "Failed to initialize multicast group table, aborting\n"); goto err_mr_table_free; } + err = mlx4_config_mad_demux(dev); + if (err) { + mlx4_err(dev, "Failed in config_mad_demux, aborting\n"); + goto err_mcg_table_free; + } } err = mlx4_init_eq_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "event queue table (err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to initialize event queue table, aborting\n"); goto err_mcg_table_free; } err = mlx4_cmd_use_events(dev); if (err) { - mlx4_err(dev, "Failed to switch to event-driven " - "firmware commands (err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to switch to event-driven firmware commands, aborting\n"); goto err_eq_table_free; } err = mlx4_NOP(dev); if (err) { if (dev->flags & MLX4_FLAG_MSI_X) { - mlx4_warn(dev, "NOP command failed to generate MSI-X " - "interrupt IRQ %d).\n", - priv->eq_table.eq[dev->caps.num_comp_vectors].irq); - mlx4_warn(dev, "Trying again without MSI-X.\n"); + mlx4_warn(dev, "NOP command failed to generate MSI-X interrupt IRQ %d)\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + mlx4_warn(dev, "Trying again without MSI-X\n"); } else { - mlx4_err(dev, "NOP command failed to generate interrupt " - "(IRQ %d), aborting.\n", - priv->eq_table.eq[dev->caps.num_comp_vectors].irq); + mlx4_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].irq); mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); } @@ -2841,31 +2690,34 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_init_cq_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "completion queue table (err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to initialize completion queue table, aborting\n"); goto err_cmd_poll; } err = mlx4_init_srq_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "shared receive queue table (err=%d), aborting.\n", - err); + mlx4_err(dev, "Failed to initialize shared receive queue table, aborting\n"); goto err_cq_table_free; } err = mlx4_init_qp_table(dev); if (err) { - mlx4_err(dev, "Failed to initialize " - "queue pair table (err=%d), aborting.\n", err); + mlx4_err(dev, "Failed to initialize queue pair table, aborting\n"); goto err_srq_table_free; } - err = mlx4_init_counters_table(dev); - if (err && err != -ENOENT) { - mlx4_err(dev, "Failed to initialize counters table (err=%d), " - "aborting.\n", err); - goto err_qp_table_free; + if (!mlx4_is_slave(dev)) { + err = mlx4_init_counters_table(dev); + if (err && err != -ENOENT) { + mlx4_err(dev, "Failed to initialize counters table, aborting\n"); + goto err_qp_table_free; + } + } + + err = mlx4_allocate_default_counters(dev); + if (err) { + mlx4_err(dev, "Failed to allocate default counters, aborting\n"); + goto err_counters_table_free; } if (!mlx4_is_slave(dev)) { @@ -2874,9 +2726,8 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); if (err) - mlx4_warn(dev, "failed to get port %d default " - "ib capabilities (%d). Continuing " - "with caps = 0\n", port, err); + mlx4_warn(dev, "failed to get port %d default ib capabilities (%d). 
Continuing with caps = 0\n", + port, err); dev->caps.ib_port_def_cap[port] = ib_port_default_caps; /* initialize per-slave default ib port capabilities */ @@ -2886,26 +2737,33 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) if (i == mlx4_master_func_num(dev)) continue; priv->mfunc.master.slave_state[i].ib_cap_mask[port] = - ib_port_default_caps; + ib_port_default_caps; } } - dev->caps.port_ib_mtu[port] = IB_MTU_4096; + if (mlx4_is_mfunc(dev)) + dev->caps.port_ib_mtu[port] = IB_MTU_2048; + else + dev->caps.port_ib_mtu[port] = IB_MTU_4096; err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ? dev->caps.pkey_table_len[port] : -1); if (err) { - mlx4_err(dev, "Failed to set port %d (err=%d), " - "aborting\n", port, err); - goto err_counters_table_free; + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto err_default_countes_free; } } } return 0; +err_default_countes_free: + mlx4_cleanup_default_counters(dev); + err_counters_table_free: - mlx4_cleanup_counters_table(dev); + if (!mlx4_is_slave(dev)) + mlx4_cleanup_counters_table(dev); err_qp_table_free: mlx4_cleanup_qp_table(dev); @@ -2946,21 +2804,47 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) return err; } +static int mlx4_init_affinity_hint(struct mlx4_dev *dev, int port, int eqn) +{ + int requested_cpu = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_eq *eq; + int off = 0; + int i; + + if (eqn > dev->caps.num_comp_vectors) + return -EINVAL; + + for (i = 1; i < port; i++) + off += mlx4_get_eqs_per_port(dev, i); + + requested_cpu = eqn - off - !!(eqn > MLX4_EQ_ASYNC); + + /* Meaning EQs are shared, and this call comes from the second port */ + if (requested_cpu < 0) + return 0; + + eq = &priv->eq_table.eq[eqn]; + + eq->affinity_cpu_id = requested_cpu % num_online_cpus(); + + return 0; +} + static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry *entries; - int err; int i; + int port = 0; if (msi_x) { - int nreq = dev->caps.num_ports * num_online_cpus() + MSIX_LEGACY_SZ; + int nreq = dev->caps.num_ports * num_online_cpus() + 1; nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, nreq); - - if (msi_x > 1 && !mlx4_is_mfunc(dev)) - nreq = min_t(int, nreq, msi_x); + if (nreq > MAX_MSIX) + nreq = MAX_MSIX; entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) @@ -2969,38 +2853,58 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) for (i = 0; i < nreq; ++i) entries[i].entry = i; - retry: - err = pci_enable_msix(dev->pdev, entries, nreq); - if (err) { - /* Try again if at least 2 vectors are available */ - if (err > 1) { - mlx4_info(dev, "Requested %d vectors, " - "but only %d MSI-X vectors available, " - "trying again\n", nreq, err); - nreq = err; - goto retry; - } + nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2, + nreq); + + if (nreq < 0 || nreq < MLX4_EQ_ASYNC) { kfree(entries); - /* if error, or can't alloc even 1 IRQ */ - if (err < 0) { - mlx4_err(dev, "No IRQs left, device can't " - "be started.\n"); - goto no_irq; - } goto no_msi; } + /* 1 is reserved for events (asynchronous EQ) */ + dev->caps.num_comp_vectors = nreq - 1; - if (nreq < - MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) { - /*Working in legacy mode , all EQ's shared*/ - dev->caps.comp_pool = 0; - dev->caps.num_comp_vectors = nreq - 1; - } else { - dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; - dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; + priv->eq_table.eq[MLX4_EQ_ASYNC].irq = entries[0].vector; + 
bitmap_zero(priv->eq_table.eq[MLX4_EQ_ASYNC].actv_ports.ports, + dev->caps.num_ports); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; i++) { + if (i == MLX4_EQ_ASYNC) + continue; + + priv->eq_table.eq[i].irq = + entries[i + 1 - !!(i > MLX4_EQ_ASYNC)].vector; + + if (MLX4_IS_LEGACY_EQ_MODE(dev->caps)) { + bitmap_fill(priv->eq_table.eq[i].actv_ports.ports, + dev->caps.num_ports); + /* We don't set affinity hint when there + * aren't enough EQs + */ + } else { + set_bit(port, + priv->eq_table.eq[i].actv_ports.ports); + if (mlx4_init_affinity_hint(dev, port + 1, i)) + mlx4_warn(dev, "Couldn't init hint cpumask for EQ %d\n", + i); + } + /* We divide the Eqs evenly between the two ports. + * (dev->caps.num_comp_vectors / dev->caps.num_ports) + * refers to the number of Eqs per port + * (i.e eqs_per_port). Theoretically, we would like to + * write something like (i + 1) % eqs_per_port == 0. + * However, since there's an asynchronous Eq, we have + * to skip over it by comparing this condition to + * !!((i + 1) > MLX4_EQ_ASYNC). + */ + if ((dev->caps.num_comp_vectors > dev->caps.num_ports) && + ((i + 1) % + (dev->caps.num_comp_vectors / dev->caps.num_ports)) == + !!((i + 1) > MLX4_EQ_ASYNC)) + /* If dev->caps.num_comp_vectors < dev->caps.num_ports, + * everything is shared anyway. + */ + port++; } - for (i = 0; i < nreq; ++i) - priv->eq_table.eq[i].irq = entries[i].vector; dev->flags |= MLX4_FLAG_MSI_X; @@ -3010,38 +2914,15 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) no_msi: dev->caps.num_comp_vectors = 1; - dev->caps.comp_pool = 0; - for (i = 0; i < 2; ++i) - priv->eq_table.eq[i].irq = dev->pdev->irq; - return; -no_irq: - dev->caps.num_comp_vectors = 0; - dev->caps.comp_pool = 0; - return; -} - -static void -mlx4_init_hca_info(struct mlx4_dev *dev) -{ - struct mlx4_hca_info *info = &mlx4_priv(dev)->hca_info; - - info->dev = dev; - - info->firmware_attr = (struct device_attribute)__ATTR(fw_ver, S_IRUGO, - show_firmware_version, NULL); - if (device_create_file(&dev->pdev->dev, &info->firmware_attr)) - mlx4_err(dev, "Failed to add file firmware version"); - - info->hca_attr = (struct device_attribute)__ATTR(hca, S_IRUGO, show_hca, - NULL); - if (device_create_file(&dev->pdev->dev, &info->hca_attr)) - mlx4_err(dev, "Failed to add file hca type"); - - info->board_attr = (struct device_attribute)__ATTR(board_id, S_IRUGO, - show_board, NULL); - if (device_create_file(&dev->pdev->dev, &info->board_attr)) - mlx4_err(dev, "Failed to add file board id type"); + BUG_ON(MLX4_EQ_ASYNC >= 2); + for (i = 0; i < 2; ++i) { + priv->eq_table.eq[i].irq = dev->persist->pdev->irq; + if (i != MLX4_EQ_ASYNC) { + bitmap_fill(priv->eq_table.eq[i].actv_ports.ports, + dev->caps.num_ports); + } + } } static int mlx4_init_port_info(struct mlx4_dev *dev, int port) @@ -3054,6 +2935,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) if (!mlx4_is_slave(dev)) { mlx4_init_mac_table(dev, &info->mac_table); mlx4_init_vlan_table(dev, &info->vlan_table); + mlx4_init_roce_gid_table(dev, &info->gid_table); info->base_qpn = mlx4_get_base_qpn(dev, port); } @@ -3068,7 +2950,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->port_attr.show = show_port_type; sysfs_attr_init(&info->port_attr.attr); - err = device_create_file(&dev->pdev->dev, &info->port_attr); + err = device_create_file(&dev->persist->pdev->dev, &info->port_attr); if (err) { mlx4_err(dev, "Failed to create file for port %d\n", port); info->port = -1; @@ -3085,31 +2967,30 @@ static int mlx4_init_port_info(struct mlx4_dev 
*dev, int port) info->port_mtu_attr.show = show_port_ib_mtu; sysfs_attr_init(&info->port_mtu_attr.attr); - err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); + err = device_create_file(&dev->persist->pdev->dev, + &info->port_mtu_attr); if (err) { mlx4_err(dev, "Failed to create mtu file for port %d\n", port); - device_remove_file(&info->dev->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_attr); info->port = -1; } return err; } -static void -mlx4_cleanup_hca_info(struct mlx4_hca_info *info) -{ - device_remove_file(&info->dev->pdev->dev, &info->firmware_attr); - device_remove_file(&info->dev->pdev->dev, &info->board_attr); - device_remove_file(&info->dev->pdev->dev, &info->hca_attr); -} - static void mlx4_cleanup_port_info(struct mlx4_port_info *info) { if (info->port < 0) return; - device_remove_file(&info->dev->pdev->dev, &info->port_attr); - device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); + device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_mtu_attr); +#ifdef CONFIG_RFS_ACCEL + free_irq_cpu_rmap(info->rmap); + info->rmap = NULL; +#endif } static int mlx4_init_steering(struct mlx4_dev *dev) @@ -3176,10 +3057,11 @@ static int mlx4_get_ownership(struct mlx4_dev *dev) void __iomem *owner; u32 ret; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; - owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); @@ -3195,10 +3077,11 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) { void __iomem *owner; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return; - owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); @@ -3209,118 +3092,142 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) iounmap(owner); } -static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) +#define SRIOV_VALID_STATE(flags) (!!((flags) & MLX4_FLAG_SRIOV) ==\ + !!((flags) & MLX4_FLAG_MASTER)) + +static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, + u8 total_vfs, int existing_vfs, int reset_flow) +{ + u64 dev_flags = dev->flags; + int err = 0; + + if (reset_flow) { + dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs), + GFP_KERNEL); + if (!dev->dev_vfs) + goto free_mem; + return dev_flags; + } + + atomic_inc(&pf_loading); + if (dev->flags & MLX4_FLAG_SRIOV) { + if (existing_vfs != total_vfs) { + mlx4_err(dev, "SR-IOV was already enabled, but with num_vfs (%d) different than requested (%d)\n", + existing_vfs, total_vfs); + total_vfs = existing_vfs; + } + } + + dev->dev_vfs = kzalloc(total_vfs * sizeof(*dev->dev_vfs), GFP_KERNEL); + if (NULL == dev->dev_vfs) { + mlx4_err(dev, "Failed to allocate memory for VFs\n"); + goto disable_sriov; + } + + if (!(dev->flags & MLX4_FLAG_SRIOV)) { + mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", total_vfs); + err = pci_enable_sriov(pdev, total_vfs); + } + if (err) { + mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", + err); + goto disable_sriov; + } else { + mlx4_warn(dev, "Running in master mode\n"); + dev_flags |= 
MLX4_FLAG_SRIOV | + MLX4_FLAG_MASTER; + dev_flags &= ~MLX4_FLAG_SLAVE; + dev->persist->num_vfs = total_vfs; + } + return dev_flags; + +disable_sriov: + atomic_dec(&pf_loading); +free_mem: + dev->persist->num_vfs = 0; + kfree(dev->dev_vfs); + dev->dev_vfs = NULL; + return dev_flags & ~MLX4_FLAG_MASTER; +} + +enum { + MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64 = -1, +}; + +static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, + int *nvfs) +{ + int requested_vfs = nvfs[0] + nvfs[1] + nvfs[2]; + /* Checking for 64 VFs as a limitation of CX2 */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_80_VFS) && + requested_vfs >= 64) { + mlx4_err(dev, "Requested %d VFs, but FW does not support more than 64\n", + requested_vfs); + return MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64; + } + return 0; +} + +static int mlx4_pci_enable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + int err = 0; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_DISABLED) { + err = pci_enable_device(pdev); + if (!err) + dev->persist->pci_status = MLX4_PCI_STATUS_ENABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); + + return err; +} + +static void mlx4_pci_disable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_ENABLED) { + pci_disable_device(pdev); + dev->persist->pci_status = MLX4_PCI_STATUS_DISABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); +} + +static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, + int total_vfs, int *nvfs, struct mlx4_priv *priv, + int reset_flow) { - struct mlx4_priv *priv; struct mlx4_dev *dev; + unsigned sum = 0; int err; int port; - int nvfs, prb_vf; + int i; + struct mlx4_dev_cap *dev_cap = NULL; + int existing_vfs = 0; - pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + dev = &priv->dev; - err = pci_enable_device(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device, " - "aborting.\n"); - return err; - } - - mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(pdev), 0, &nvfs); - mlx4_get_val(probe_vf.dbdf2val.tbl, pci_physfn(pdev), 0, &prb_vf); - if (nvfs > MLX4_MAX_NUM_VF) { - dev_err(&pdev->dev, "There are more VF's (%d) than allowed(%d)\n", - nvfs, MLX4_MAX_NUM_VF); - return -EINVAL; - } - - if (nvfs < 0) { - dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); - return -EINVAL; - } - /* - * Check for BARs. - */ - if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && - !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing DCS, aborting." 
- "(driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%x)\n", - pci_dev_data, pci_resource_flags(pdev, 0)); - err = -ENODEV; - goto err_disable_pdev; - } - if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing UAR, aborting.\n"); - err = -ENODEV; - goto err_disable_pdev; - } - - err = pci_request_regions(pdev, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); - goto err_disable_pdev; - } - - pci_set_master(pdev); - - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); - goto err_release_regions; - } - } - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " - "consistent PCI DMA mask.\n"); - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " - "aborting.\n"); - goto err_release_regions; - } - } - - /* Allow large DMA segments, up to the firmware limit of 1 GB */ - dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - - priv = kzalloc(sizeof *priv, GFP_KERNEL); - if (!priv) { - dev_err(&pdev->dev, "Device struct alloc failed, " - "aborting.\n"); - err = -ENOMEM; - goto err_release_regions; - } - - dev = &priv->dev; - dev->pdev = pdev; - INIT_LIST_HEAD(&priv->dev_list); INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); mutex_init(&priv->port_mutex); + mutex_init(&priv->bond_mutex); INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); + spin_lock_init(&priv->cmd.context_lock); INIT_LIST_HEAD(&priv->bf_list); mutex_init(&priv->bf_mutex); dev->rev_id = pdev->revision; dev->numa_node = dev_to_node(&pdev->dev); + /* Detect if this device is a virtual function */ if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { - /* When acting as pf, we normally skip vfs unless explicitly - * requested to probe them. */ - if (nvfs && extended_func_num(pdev) > prb_vf) { - mlx4_warn(dev, "Skipping virtual function:%d\n", - extended_func_num(pdev)); - err = -ENODEV; - goto err_free_dev; - } mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); dev->flags |= MLX4_FLAG_SLAVE; } else { @@ -3330,27 +3237,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_get_ownership(dev); if (err) { if (err < 0) - goto err_free_dev; + return err; else { - mlx4_warn(dev, "Multiple PFs not yet supported." 
- " Skipping PF.\n"); - err = -EINVAL; - goto err_free_dev; - } - } - - if (nvfs) { - mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", nvfs); - err = pci_enable_sriov(pdev, nvfs); - if (err) { - mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d).\n", - err); - err = 0; - } else { - mlx4_warn(dev, "Running in master mode\n"); - dev->flags |= MLX4_FLAG_SRIOV | - MLX4_FLAG_MASTER; - dev->num_vfs = nvfs; + mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n"); + return -EINVAL; } } @@ -3364,15 +3254,28 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) */ err = mlx4_reset(dev); if (err) { - mlx4_err(dev, "Failed to reset HCA, aborting.\n"); + mlx4_err(dev, "Failed to reset HCA, aborting\n"); goto err_sriov; } + + if (total_vfs) { + dev->flags = MLX4_FLAG_MASTER; + existing_vfs = pci_num_vf(pdev); + if (existing_vfs) + dev->flags |= MLX4_FLAG_SRIOV; + dev->persist->num_vfs = total_vfs; + } } + /* on load remove any previous indication of internal error, + * device is up. + */ + dev->persist->state = MLX4_DEVICE_STATE_UP; + slave_start: err = mlx4_cmd_init(dev); if (err) { - mlx4_err(dev, "Failed to init command interface, aborting.\n"); + mlx4_err(dev, "Failed to init command interface, aborting\n"); goto err_sriov; } @@ -3380,39 +3283,166 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) * before posting commands. Also, init num_slaves before calling * mlx4_init_hca */ if (mlx4_is_mfunc(dev)) { - if (mlx4_is_master(dev)) + if (mlx4_is_master(dev)) { dev->num_slaves = MLX4_MAX_NUM_SLAVES; - else { + + } else { dev->num_slaves = 0; err = mlx4_multi_func_init(dev); if (err) { - mlx4_err(dev, "Failed to init slave mfunc" - " interface, aborting.\n"); + mlx4_err(dev, "Failed to init slave mfunc interface, aborting\n"); goto err_cmd; } } } + err = mlx4_init_fw(dev); + if (err) { + mlx4_err(dev, "Failed to init fw, aborting.\n"); + goto err_mfunc; + } + + if (mlx4_is_master(dev)) { + /* when we hit the goto slave_start below, dev_cap already initialized */ + if (!dev_cap) { + dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL); + + if (!dev_cap) { + err = -ENOMEM; + goto err_fw; + } + + err = mlx4_QUERY_DEV_CAP(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + goto err_fw; + } + + if (mlx4_check_dev_cap(dev, dev_cap, nvfs)) + goto err_fw; + + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { + u64 dev_flags = mlx4_enable_sriov(dev, pdev, + total_vfs, + existing_vfs, + reset_flow); + + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + dev->flags = dev_flags; + if (!SRIOV_VALID_STATE(dev->flags)) { + mlx4_err(dev, "Invalid SRIOV state\n"); + goto err_sriov; + } + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting.\n"); + goto err_sriov; + } + goto slave_start; + } + } else { + /* Legacy mode FW requires SRIOV to be enabled before + * doing QUERY_DEV_CAP, since max_eq's value is different if + * SRIOV is enabled. 
+ */ + memset(dev_cap, 0, sizeof(*dev_cap)); + err = mlx4_QUERY_DEV_CAP(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + goto err_fw; + } + + if (mlx4_check_dev_cap(dev, dev_cap, nvfs)) + goto err_fw; + } + } + err = mlx4_init_hca(dev); if (err) { if (err == -EACCES) { /* Not primary Physical function * Running in slave mode */ - mlx4_cmd_cleanup(dev); + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + /* We're not a PF */ + if (dev->flags & MLX4_FLAG_SRIOV) { + if (!existing_vfs) + pci_disable_sriov(pdev); + if (mlx4_is_master(dev) && !reset_flow) + atomic_dec(&pf_loading); + dev->flags &= ~MLX4_FLAG_SRIOV; + } + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); dev->flags |= MLX4_FLAG_SLAVE; dev->flags &= ~MLX4_FLAG_MASTER; goto slave_start; } else - goto err_mfunc; + goto err_fw; } + if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { + u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, + existing_vfs, reset_flow); + + if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) { + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR); + dev->flags = dev_flags; + err = mlx4_cmd_init(dev); + if (err) { + /* Only VHCR is cleaned up, so could still + * send FW commands + */ + mlx4_err(dev, "Failed to init VHCR command interface, aborting\n"); + goto err_close; + } + } else { + dev->flags = dev_flags; + } + + if (!SRIOV_VALID_STATE(dev->flags)) { + mlx4_err(dev, "Invalid SRIOV state\n"); + goto err_close; + } + } + + /* check if the device is functioning at its maximum possible speed. + * No return code for this call, just warn the user in case of PCI + * express device capabilities are under-satisfied by the bus. + */ + if (!mlx4_is_slave(dev)) + mlx4_check_pcie_caps(dev); + /* In master functions, the communication channel must be initialized * after obtaining its address from fw */ if (mlx4_is_master(dev)) { + if (dev->caps.num_ports < 2 && + num_vfs_argc > 1) { + err = -EINVAL; + mlx4_err(dev, + "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n", + dev->caps.num_ports); + goto err_close; + } + memcpy(dev->persist->nvfs, nvfs, sizeof(dev->persist->nvfs)); + + for (i = 0; + i < sizeof(dev->persist->nvfs)/ + sizeof(dev->persist->nvfs[0]); i++) { + unsigned j; + + for (j = 0; j < dev->persist->nvfs[i]; ++sum, ++j) { + dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; + dev->dev_vfs[sum].n_ports = i < 2 ? 1 : + dev->caps.num_ports; + } + } + + /* In master functions, the communication channel + * must be initialized after obtaining its address from fw + */ err = mlx4_multi_func_init(dev); if (err) { - mlx4_err(dev, "Failed to init master mfunc" - "interface, aborting.\n"); + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); goto err_close; } } @@ -3421,29 +3451,21 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) if (err) goto err_master_mfunc; - priv->msix_ctl.pool_bm = 0; + bitmap_zero(priv->msix_ctl.pool_bm, MAX_MSIX); mutex_init(&priv->msix_ctl.pool_lock); mlx4_enable_msi_x(dev); - - /* no MSIX and no shared IRQ */ - if (!dev->caps.num_comp_vectors && !dev->caps.comp_pool) { - err = -ENOSPC; - goto err_free_eq; - } - if ((mlx4_is_mfunc(dev)) && !(dev->flags & MLX4_FLAG_MSI_X)) { err = -ENOSYS; - mlx4_err(dev, "INTx is not supported in multi-function mode." 
- " aborting.\n"); + mlx4_err(dev, "INTx is not supported in multi-function mode, aborting\n"); goto err_free_eq; } if (!mlx4_is_slave(dev)) { err = mlx4_init_steering(dev); if (err) - goto err_free_eq; + goto err_disable_msix; } mlx4_init_quotas(dev); @@ -3453,7 +3475,6 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) !mlx4_is_mfunc(dev)) { dev->flags &= ~MLX4_FLAG_MSI_X; dev->caps.num_comp_vectors = 1; - dev->caps.comp_pool = 0; pci_disable_msix(pdev); err = mlx4_setup_hca(dev); } @@ -3461,7 +3482,17 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) if (err) goto err_steer; - mlx4_init_hca_info(dev); + /* When PF resources are ready arm its comm channel to enable + * getting commands + */ + if (mlx4_is_master(dev)) { + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " Failed to arm comm channel eq: %x\n", + err); + goto err_steer; + } + } for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); @@ -3469,6 +3500,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) goto err_port; } + priv->v2p.port1 = 1; + priv->v2p.port2 = 2; + err = mlx4_register_device(dev); if (err) goto err_port; @@ -3478,9 +3512,12 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) mlx4_sense_init(dev); mlx4_start_sense(dev); - priv->pci_dev_data = pci_dev_data; - pci_set_drvdata(pdev, dev); + priv->removed = 0; + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) + atomic_dec(&pf_loading); + + kfree(dev_cap); return 0; err_port: @@ -3503,6 +3540,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); +err_disable_msix: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + err_free_eq: mlx4_free_eq_table(dev); @@ -3513,6 +3554,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) } if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); @@ -3520,118 +3562,354 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) } err_close: - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - mlx4_close_hca(dev); +err_fw: + mlx4_close_fw(dev); + err_mfunc: if (mlx4_is_slave(dev)) mlx4_multi_func_cleanup(dev); err_cmd: - mlx4_cmd_cleanup(dev); + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); err_sriov: - if (dev->flags & MLX4_FLAG_SRIOV) + if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) { pci_disable_sriov(pdev); + dev->flags &= ~MLX4_FLAG_SRIOV; + } + + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) + atomic_dec(&pf_loading); + + kfree(priv->dev.dev_vfs); if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); -err_free_dev: - kfree(priv); + kfree(dev_cap); + return err; +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, + struct mlx4_priv *priv) +{ + int err; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { + {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; + unsigned total_vfs = 0; + unsigned int i; + + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + + err = mlx4_pci_enable_device(&priv->dev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + return err; + } + + /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS + * per port, we must limit the number of VFs to 63 (since their are + * 128 MACs) + */ + 
for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; + total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { + nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; + if (nvfs[i] < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; + i++) { + prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; + if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { + dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + if (total_vfs > MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, + "Requested more VF's (%d) than allowed by hw (%d)\n", + total_vfs, MLX4_MAX_NUM_VF); + err = -EINVAL; + goto err_disable_pdev; + } + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + if (nvfs[i] + nvfs[2] > MLX4_MAX_NUM_VF_P_PORT) { + dev_err(&pdev->dev, + "Requested more VF's (%d) for port (%d) than allowed by driver (%d)\n", + nvfs[i] + nvfs[2], i + 1, + MLX4_MAX_NUM_VF_P_PORT); + err = -EINVAL; + goto err_disable_pdev; + } + } + + /* Check for BARs. */ + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", + pci_dev_data, (long)pci_resource_flags(pdev, 0)); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. 
+ */ + if (total_vfs) { + unsigned vfs_offset = 0; + + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && + vfs_offset + nvfs[i] < extended_func_num(pdev); + vfs_offset += nvfs[i], i++) + ; + if (i == sizeof(nvfs)/sizeof(nvfs[0])) { + err = -ENODEV; + goto err_release_regions; + } + if ((extended_func_num(pdev) - vfs_offset) + > prb_vf[i]) { + dev_warn(&pdev->dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_release_regions; + } + } + } + + err = mlx4_catas_init(&priv->dev); + if (err) + goto err_release_regions; + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0); + if (err) + goto err_catas; + + return 0; + +err_catas: + mlx4_catas_end(&priv->dev); err_release_regions: pci_release_regions(pdev); err_disable_pdev: - pci_disable_device(pdev); + mlx4_pci_disable_device(&priv->dev); pci_set_drvdata(pdev, NULL); return err; } -static int __devinit mlx4_init_one(struct pci_dev *pdev, - const struct pci_device_id *id) +static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - device_set_desc(pdev->dev.bsddev, mlx4_version); - return __mlx4_init_one(pdev, id->driver_data); + struct mlx4_priv *priv; + struct mlx4_dev *dev; + int ret; + + printk_once(KERN_INFO "%s", mlx4_version); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + dev = &priv->dev; + dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL); + if (!dev->persist) { + kfree(priv); + return -ENOMEM; + } + dev->persist->pdev = pdev; + dev->persist->dev = dev; + pci_set_drvdata(pdev, dev->persist); + priv->pci_dev_data = id->driver_data; + mutex_init(&dev->persist->device_state_mutex); + mutex_init(&dev->persist->interface_state_mutex); + mutex_init(&dev->persist->pci_status_mutex); + + ret = __mlx4_init_one(pdev, id->driver_data, priv); + if (ret) { + kfree(dev->persist); + kfree(priv); + } else { + pci_save_state(pdev->dev.bsddev); + } + + return ret; +} + +static void mlx4_clean_dev(struct mlx4_dev *dev) +{ + struct mlx4_dev_persistent *persist = dev->persist; + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long flags = (dev->flags & RESET_PERSIST_MASK_FLAGS); + + memset(priv, 0, sizeof(*priv)); + priv->dev.persist = persist; + priv->dev.flags = flags; +} + +static void mlx4_unload_one(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + int pci_dev_data; + int p, i; + + if (priv->removed) + return; + + /* saving current ports type for further use */ + for (i = 0; i < dev->caps.num_ports; i++) { + dev->persist->curr_port_type[i] = dev->caps.port_type[i + 1]; + dev->persist->curr_port_poss_type[i] = dev->caps. 
+ possible_type[i + 1]; + } + + pci_dev_data = priv->pci_dev_data; + + mlx4_stop_sense(dev); + mlx4_unregister_device(dev); + + for (p = 1; p <= dev->caps.num_ports; p++) { + mlx4_cleanup_port_info(&priv->port[p]); + mlx4_CLOSE_PORT(dev, p); + } + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_SLAVES_ONLY); + + mlx4_cleanup_default_counters(dev); + if (!mlx4_is_slave(dev)) + mlx4_cleanup_counters_table(dev); + mlx4_cleanup_qp_table(dev); + mlx4_cleanup_srq_table(dev); + mlx4_cleanup_cq_table(dev); + mlx4_cmd_use_polling(dev); + mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); + mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); + mlx4_cleanup_pd_table(dev); + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_STRUCTS_ONLY); + + iounmap(priv->kar); + mlx4_uar_free(dev, &priv->driver_uar); + mlx4_cleanup_uar_table(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + mlx4_free_eq_table(dev); + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_close_hca(dev); + mlx4_close_fw(dev); + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + kfree(dev->caps.qp0_qkey); + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + kfree(dev->dev_vfs); + + mlx4_clean_dev(dev); + priv->pci_dev_data = pci_dev_data; + priv->removed = 1; } static void mlx4_remove_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); - int p; + int active_vfs = 0; - if (dev) { - /* in SRIOV it is not allowed to unload the pf's - * driver while there are alive vf's */ - if (mlx4_is_master(dev)) { - if (mlx4_how_many_lives_vf(dev)) - mlx4_err(dev, "Removing PF when there are assigned VF's !!!\n"); + mutex_lock(&persist->interface_state_mutex); + persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; + mutex_unlock(&persist->interface_state_mutex); + + /* Disabling SR-IOV is not allowed while there are active vf's */ + if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) { + active_vfs = mlx4_how_many_lives_vf(dev); + if (active_vfs) { + pr_warn("Removing PF when there are active VF's !!\n"); + pr_warn("Will not disable SR-IOV.\n"); } - mlx4_stop_sense(dev); - mlx4_unregister_device(dev); - - mlx4_cleanup_hca_info(&priv->hca_info); - for (p = 1; p <= dev->caps.num_ports; p++) { - mlx4_cleanup_port_info(&priv->port[p]); - mlx4_CLOSE_PORT(dev, p); - } - - if (mlx4_is_master(dev)) - mlx4_free_resource_tracker(dev, - RES_TR_FREE_SLAVES_ONLY); - - mlx4_cleanup_counters_table(dev); - mlx4_cleanup_qp_table(dev); - mlx4_cleanup_srq_table(dev); - mlx4_cleanup_cq_table(dev); - mlx4_cmd_use_polling(dev); - mlx4_cleanup_eq_table(dev); - mlx4_cleanup_mcg_table(dev); - mlx4_cleanup_mr_table(dev); - mlx4_cleanup_xrcd_table(dev); - mlx4_cleanup_pd_table(dev); - - if (mlx4_is_master(dev)) - mlx4_free_resource_tracker(dev, - RES_TR_FREE_STRUCTS_ONLY); - - iounmap(priv->kar); - mlx4_uar_free(dev, &priv->driver_uar); - mlx4_cleanup_uar_table(dev); - if (!mlx4_is_slave(dev)) - mlx4_clear_steering(dev); - mlx4_free_eq_table(dev); - if (mlx4_is_master(dev)) - mlx4_multi_func_cleanup(dev); - mlx4_close_hca(dev); - if (mlx4_is_slave(dev)) - 
mlx4_multi_func_cleanup(dev); - mlx4_cmd_cleanup(dev); - - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - if (dev->flags & MLX4_FLAG_SRIOV) { - mlx4_warn(dev, "Disabling SR-IOV\n"); - pci_disable_sriov(pdev); - } - - if (!mlx4_is_slave(dev)) - mlx4_free_ownership(dev); - - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - kfree(dev->caps.qp1_proxy); - - kfree(priv); - pci_release_regions(pdev); - pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); } + + /* device marked to be under deletion running now without the lock + * letting other tasks to be terminated + */ + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + else + mlx4_info(dev, "%s: interface is down\n", __func__); + mlx4_catas_end(dev); + if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + } + + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(dev->persist); + kfree(priv); + pci_set_drvdata(pdev, NULL); } static int restore_current_port_types(struct mlx4_dev *dev, @@ -3642,40 +3920,45 @@ static int restore_current_port_types(struct mlx4_dev *dev, int err, i; mlx4_stop_sense(dev); + mutex_lock(&priv->port_mutex); for (i = 0; i < dev->caps.num_ports; i++) dev->caps.possible_type[i + 1] = poss_types[i]; err = mlx4_change_port_types(dev, types); - mlx4_start_sense(dev); mutex_unlock(&priv->port_mutex); + + mlx4_start_sense(dev); + return err; } int mlx4_restart_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); - enum mlx4_port_type curr_type[MLX4_MAX_PORTS]; - enum mlx4_port_type poss_type[MLX4_MAX_PORTS]; - int pci_dev_data, err, i; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int pci_dev_data, err, total_vfs; pci_dev_data = priv->pci_dev_data; - for (i = 0; i < dev->caps.num_ports; i++) { - curr_type[i] = dev->caps.port_type[i + 1]; - poss_type[i] = dev->caps.possible_type[i + 1]; + total_vfs = dev->persist->num_vfs; + memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); + + mlx4_unload_one(pdev); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1); + if (err) { + mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", + __func__, pci_name(pdev), err); + return err; } - mlx4_remove_one(pdev); - err = __mlx4_init_one(pdev, pci_dev_data); + err = restore_current_port_types(dev, dev->persist->curr_port_type, + dev->persist->curr_port_poss_type); if (err) - return err; - - dev = pci_get_drvdata(pdev); - err = restore_current_port_types(dev, curr_type, poss_type); - if (err) - mlx4_err(dev, "mlx4_restart_one: could not restore original port types (%d)\n", + mlx4_err(dev, "could not restore original port types (%d)\n", err); - return 0; + + return err; } static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { @@ -3743,143 +4026,137 @@ MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - mlx4_remove_one(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); - return state == pci_channel_io_perm_failure ? 
- PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; + mlx4_err(persist->dev, "mlx4_pci_err_detected was called\n"); + mlx4_enter_error_state(persist); + + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + + mutex_unlock(&persist->interface_state_mutex); + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + mlx4_pci_disable_device(persist->dev); + return PCI_ERS_RESULT_NEED_RESET; } static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) { - int ret = __mlx4_init_one(pdev, 0); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + int err; - return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; + mlx4_err(dev, "mlx4_pci_slot_reset was called\n"); + err = mlx4_pci_enable_device(dev); + if (err) { + mlx4_err(dev, "Can not re-enable device, err=%d\n", err); + return PCI_ERS_RESULT_DISCONNECT; + } + + pci_set_master(pdev); + return PCI_ERS_RESULT_RECOVERED; +} + +static void mlx4_pci_resume(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int total_vfs; + int err; + + mlx4_err(dev, "%s was called\n", __func__); + total_vfs = dev->persist->num_vfs; + memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); + + mutex_lock(&persist->interface_state_mutex); + if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) { + err = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs, + priv, 1); + if (err) { + mlx4_err(dev, "%s: mlx4_load_one failed, err=%d\n", + __func__, err); + goto end; + } + + err = restore_current_port_types(dev, dev->persist-> + curr_port_type, dev->persist-> + curr_port_poss_type); + if (err) + mlx4_err(dev, "could not restore original port types (%d)\n", err); + } +end: + mutex_unlock(&persist->interface_state_mutex); + +} + +static void mlx4_shutdown(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + + mlx4_info(persist->dev, "mlx4_shutdown was called\n"); + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + mutex_unlock(&persist->interface_state_mutex); } static const struct pci_error_handlers mlx4_err_handler = { .error_detected = mlx4_pci_err_detected, .slot_reset = mlx4_pci_slot_reset, + .resume = mlx4_pci_resume, }; -static int suspend(struct pci_dev *pdev, pm_message_t state) -{ - mlx4_remove_one(pdev); - - return 0; -} - -static int resume(struct pci_dev *pdev) -{ - return __mlx4_init_one(pdev, 0); -} - static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, - .remove = __devexit_p(mlx4_remove_one), - .suspend = suspend, - .resume = resume, + .shutdown = mlx4_shutdown, + .remove = mlx4_remove_one, .err_handler = &mlx4_err_handler, }; static int __init mlx4_verify_params(void) { - int status; - - status = update_defaults(&port_type_array); - if (status == INVALID_STR) { - if (mlx4_fill_dbdf2val_tbl(&port_type_array.dbdf2val)) - return -1; - } else if (status == INVALID_DATA) { - return -1; - } - - status = update_defaults(&num_vfs); - if (status == INVALID_STR) { - if (mlx4_fill_dbdf2val_tbl(&num_vfs.dbdf2val)) - return -1; - } else if (status == INVALID_DATA) { - return -1; - } - - status = update_defaults(&probe_vf); - if (status == 
INVALID_STR) { - if (mlx4_fill_dbdf2val_tbl(&probe_vf.dbdf2val)) - return -1; - } else if (status == INVALID_DATA) { - return -1; - } - - if (msi_x < 0) { - pr_warn("mlx4_core: bad msi_x: %d\n", msi_x); - return -1; - } - if ((log_num_mac < 0) || (log_num_mac > 7)) { - pr_warning("mlx4_core: bad num_mac: %d\n", log_num_mac); + pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac); return -1; } if (log_num_vlan != 0) - pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n", - MLX4_LOG_NUM_VLANS); + pr_warn("mlx4_core: log_num_vlan - obsolete module param, using %d\n", + MLX4_LOG_NUM_VLANS); - if (mlx4_set_4k_mtu != -1) - pr_warning("mlx4_core: set_4k_mtu - obsolete module param\n"); + if (use_prio != 0) + pr_warn("mlx4_core: use_prio - obsolete module param, ignored\n"); - if ((log_mtts_per_seg < 0) || (log_mtts_per_seg > 7)) { - pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { + pr_warn("mlx4_core: bad log_mtts_per_seg: %d\n", + log_mtts_per_seg); return -1; } - if (mlx4_log_num_mgm_entry_size != -1 && - (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || - mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) { - pr_warning("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not " - "in legal range (-1 or %d..%d)\n", - mlx4_log_num_mgm_entry_size, - MLX4_MIN_MGM_LOG_ENTRY_SIZE, - MLX4_MAX_MGM_LOG_ENTRY_SIZE); + /* Check if module param for ports type has legal combination */ + if (port_type_array[0] == false && port_type_array[1] == true) { + pr_warn("Module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n"); + port_type_array[0] = true; + } + + if (mlx4_log_num_mgm_entry_size < -7 || + (mlx4_log_num_mgm_entry_size > 0 && + (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || + mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE))) { + pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-7..0 or %d..%d)\n", + mlx4_log_num_mgm_entry_size, + MLX4_MIN_MGM_LOG_ENTRY_SIZE, + MLX4_MAX_MGM_LOG_ENTRY_SIZE); return -1; } - if (mod_param_profile.num_qp < 18 || mod_param_profile.num_qp > 23) { - pr_warning("mlx4_core: bad log_num_qp: %d\n", - mod_param_profile.num_qp); - return -1; - } - - if (mod_param_profile.num_srq < 10) { - pr_warning("mlx4_core: too low log_num_srq: %d\n", - mod_param_profile.num_srq); - return -1; - } - - if (mod_param_profile.num_cq < 10) { - pr_warning("mlx4_core: too low log_num_cq: %d\n", - mod_param_profile.num_cq); - return -1; - } - - if (mod_param_profile.num_mpt < 10) { - pr_warning("mlx4_core: too low log_num_mpt: %d\n", - mod_param_profile.num_mpt); - return -1; - } - - if (mod_param_profile.num_mtt_segs && - mod_param_profile.num_mtt_segs < 15) { - pr_warning("mlx4_core: too low log_num_mtt: %d\n", - mod_param_profile.num_mtt_segs); - return -1; - } - - if (mod_param_profile.num_mtt_segs > MLX4_MAX_LOG_NUM_MTT) { - pr_warning("mlx4_core: too high log_num_mtt: %d\n", - mod_param_profile.num_mtt_segs); - return -1; - } return 0; } @@ -3890,40 +4167,24 @@ static int __init mlx4_init(void) if (mlx4_verify_params()) return -EINVAL; - mlx4_catas_init(); mlx4_wq = create_singlethread_workqueue("mlx4"); if (!mlx4_wq) return -ENOMEM; - if (enable_sys_tune) - sys_tune_init(); - ret = pci_register_driver(&mlx4_driver); if (ret < 0) - goto err; - - return 0; - -err: - if (enable_sys_tune) - sys_tune_fini(); - - destroy_workqueue(mlx4_wq); - - return ret; + destroy_workqueue(mlx4_wq); + return 
ret < 0 ? ret : 0; } static void __exit mlx4_cleanup(void) { - if (enable_sys_tune) - sys_tune_fini(); - pci_unregister_driver(&mlx4_driver); destroy_workqueue(mlx4_wq); } -module_init_order(mlx4_init, SI_ORDER_MIDDLE); +module_init(mlx4_init); module_exit(mlx4_cleanup); static int diff --git a/sys/dev/mlx4/mlx4_core/mlx4_mcg.c b/sys/dev/mlx4/mlx4_core/mlx4_mcg.c index 846daa99c634..d202b77f69c3 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_mcg.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_mcg.c @@ -323,15 +323,14 @@ static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port, return true; } -/* - * returns true if all the QPs != tqpn contained in this entry - * are Promisc QPs. return false otherwise. +/* Returns true if all the QPs != tqpn contained in this entry + * are Promisc QPs. Returns false otherwise. */ static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port, - enum mlx4_steer_type steer, - unsigned int index, u32 tqpn, u32 *members_count) + enum mlx4_steer_type steer, + unsigned int index, u32 tqpn, + u32 *members_count) { - struct mlx4_steer *s_steer; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; u32 m_count; @@ -341,8 +340,6 @@ static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port, if (port < 1 || port > dev->caps.num_ports) return false; - s_steer = &mlx4_priv(dev)->steer[port - 1]; - mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return false; @@ -382,7 +379,8 @@ static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port, s_steer = &mlx4_priv(dev)->steer[port - 1]; - if (!promisc_steering_entry(dev, port, steer, index, tqpn, &members_count)) + if (!promisc_steering_entry(dev, port, steer, index, + tqpn, &members_count)) goto out; /* All the qps currently registered for this entry are promiscuous, @@ -390,10 +388,10 @@ static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port, ret = true; list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) { if (entry->index == index) { - if (list_empty(&entry->duplicates) || members_count == 1) { + if (list_empty(&entry->duplicates) || + members_count == 1) { struct mlx4_promisc_qp *pqp, *tmp_pqp; - /* - * If there is only 1 entry in duplicates than + /* If there is only 1 entry in duplicates then * this is the QP we want to delete, going over * the list and deleting the entry. */ @@ -460,40 +458,53 @@ static int add_promisc_qp(struct mlx4_dev *dev, u8 port, mgm = mailbox->buf; if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) { - /* the promisc qp needs to be added for each one of the steering - * entries, if it already exists, needs to be added as a duplicate - * for this entry */ - list_for_each_entry(entry, &s_steer->steer_entries[steer], list) { + /* The promisc QP needs to be added for each one of the steering + * entries. If it already exists, needs to be added as + * a duplicate for this entry. + */ + list_for_each_entry(entry, + &s_steer->steer_entries[steer], + list) { err = mlx4_READ_ENTRY(dev, entry->index, mailbox); if (err) goto out_mailbox; - members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + members_count = be32_to_cpu(mgm->members_count) & + 0xffffff; prot = be32_to_cpu(mgm->members_count) >> 30; found = false; for (i = 0; i < members_count; i++) { - if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) { - /* Entry already exists, add to duplicates */ - dqp = kmalloc(sizeof *dqp, GFP_KERNEL); + if ((be32_to_cpu(mgm->qp[i]) & + MGM_QPN_MASK) == qpn) { + /* Entry already exists. + * Add to duplicates. 
+ */ + dqp = kmalloc(sizeof(*dqp), GFP_KERNEL); if (!dqp) { err = -ENOMEM; goto out_mailbox; } dqp->qpn = qpn; - list_add_tail(&dqp->list, &entry->duplicates); + list_add_tail(&dqp->list, + &entry->duplicates); found = true; } } if (!found) { /* Need to add the qpn to mgm */ - if (members_count == dev->caps.num_qp_per_mgm) { + if (members_count == + dev->caps.num_qp_per_mgm) { /* entry is full */ err = -ENOMEM; goto out_mailbox; } - mgm->qp[members_count++] = cpu_to_be32(qpn & MGM_QPN_MASK); - mgm->members_count = cpu_to_be32(members_count | (prot << 30)); - err = mlx4_WRITE_ENTRY(dev, entry->index, mailbox); + mgm->qp[members_count++] = + cpu_to_be32(qpn & MGM_QPN_MASK); + mgm->members_count = + cpu_to_be32(members_count | + (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, entry->index, + mailbox); if (err) goto out_mailbox; } @@ -547,7 +558,7 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, u32 members_count; bool found; bool back_to_list = false; - int i, loc = -1; + int i; int err; if (port < 1 || port > dev->caps.num_ports) @@ -575,7 +586,6 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, goto out_list; } mgm = mailbox->buf; - memset(mgm, 0, sizeof *mgm); members_count = 0; list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); @@ -586,8 +596,10 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, goto out_mailbox; if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) { - /* remove the qp from all the steering entries*/ - list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) { + /* Remove the QP from all the steering entries */ + list_for_each_entry_safe(entry, tmp_entry, + &s_steer->steer_entries[steer], + list) { found = false; list_for_each_entry(dqp, &entry->duplicates, list) { if (dqp->qpn == qpn) { @@ -596,25 +608,33 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, } } if (found) { - /* a duplicate, no need to change the mgm, - * only update the duplicates list */ + /* A duplicate, no need to change the MGM, + * only update the duplicates list + */ list_del(&dqp->list); kfree(dqp); } else { - err = mlx4_READ_ENTRY(dev, entry->index, mailbox); - if (err) - goto out_mailbox; - members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + int loc = -1; + + err = mlx4_READ_ENTRY(dev, + entry->index, + mailbox); + if (err) + goto out_mailbox; + members_count = + be32_to_cpu(mgm->members_count) & + 0xffffff; if (!members_count) { - mlx4_warn(dev, "QP %06x wasn't found in entry %x mcount=0." - " deleting entry...\n", qpn, entry->index); + mlx4_warn(dev, "QP %06x wasn't found in entry %x mcount=0. 
deleting entry...\n", + qpn, entry->index); list_del(&entry->list); kfree(entry); continue; } for (i = 0; i < members_count; ++i) - if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) { + if ((be32_to_cpu(mgm->qp[i]) & + MGM_QPN_MASK) == qpn) { loc = i; break; } @@ -626,15 +646,20 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, goto out_mailbox; } - /* copy the last QP in this MGM over removed QP */ + /* Copy the last QP in this MGM + * over removed QP + */ mgm->qp[loc] = mgm->qp[members_count - 1]; mgm->qp[members_count - 1] = 0; - mgm->members_count = cpu_to_be32(--members_count | - (MLX4_PROT_ETH << 30)); + mgm->members_count = + cpu_to_be32(--members_count | + (MLX4_PROT_ETH << 30)); - err = mlx4_WRITE_ENTRY(dev, entry->index, mailbox); - if (err) - goto out_mailbox; + err = mlx4_WRITE_ENTRY(dev, + entry->index, + mailbox); + if (err) + goto out_mailbox; } } } @@ -706,7 +731,7 @@ static int find_entry(struct mlx4_dev *dev, u8 port, if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { if (*index != hash) { - mlx4_err(dev, "Found zero MGID in AMGM.\n"); + mlx4_err(dev, "Found zero MGID in AMGM\n"); err = -EINVAL; } return err; @@ -728,20 +753,22 @@ static const u8 __promisc_mode[] = { [MLX4_FS_REGULAR] = 0x0, [MLX4_FS_ALL_DEFAULT] = 0x1, [MLX4_FS_MC_DEFAULT] = 0x3, - [MLX4_FS_UC_SNIFFER] = 0x4, - [MLX4_FS_MC_SNIFFER] = 0x5, + [MLX4_FS_MIRROR_RX_PORT] = 0x4, + [MLX4_FS_MIRROR_SX_PORT] = 0x5, + [MLX4_FS_UC_SNIFFER] = 0x6, + [MLX4_FS_MC_SNIFFER] = 0x7, }; -int map_sw_to_hw_steering_mode(struct mlx4_dev *dev, - enum mlx4_net_trans_promisc_mode flow_type) +int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, + enum mlx4_net_trans_promisc_mode flow_type) { - if (flow_type >= MLX4_FS_MODE_NUM || flow_type < 0) { + if (flow_type >= MLX4_FS_MODE_NUM) { mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type); return -EINVAL; } return __promisc_mode[flow_type]; } -EXPORT_SYMBOL_GPL(map_sw_to_hw_steering_mode); +EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_mode); static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl, struct mlx4_net_trans_rule_hw_ctrl *hw) @@ -765,19 +792,20 @@ const u16 __sw_id_hw[] = { [MLX4_NET_TRANS_RULE_ID_IPV6] = 0xE003, [MLX4_NET_TRANS_RULE_ID_IPV4] = 0xE002, [MLX4_NET_TRANS_RULE_ID_TCP] = 0xE004, - [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006 + [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006, + [MLX4_NET_TRANS_RULE_ID_VXLAN] = 0xE008 }; -int map_sw_to_hw_steering_id(struct mlx4_dev *dev, - enum mlx4_net_trans_rule_id id) +int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id) { - if (id >= MLX4_NET_TRANS_RULE_NUM || id < 0) { + if (id >= MLX4_NET_TRANS_RULE_NUM) { mlx4_err(dev, "Invalid network rule id. id = %d\n", id); return -EINVAL; } return __sw_id_hw[id]; } -EXPORT_SYMBOL_GPL(map_sw_to_hw_steering_id); +EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_id); static const int __rule_hw_sz[] = { [MLX4_NET_TRANS_RULE_ID_ETH] = @@ -790,29 +818,31 @@ static const int __rule_hw_sz[] = { [MLX4_NET_TRANS_RULE_ID_TCP] = sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), [MLX4_NET_TRANS_RULE_ID_UDP] = - sizeof(struct mlx4_net_trans_rule_hw_tcp_udp) + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), + [MLX4_NET_TRANS_RULE_ID_VXLAN] = + sizeof(struct mlx4_net_trans_rule_hw_vxlan) }; -int hw_rule_sz(struct mlx4_dev *dev, +int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id) { - if (id >= MLX4_NET_TRANS_RULE_NUM || id < 0) { + if (id >= MLX4_NET_TRANS_RULE_NUM) { mlx4_err(dev, "Invalid network rule id. 
id = %d\n", id); return -EINVAL; } return __rule_hw_sz[id]; } -EXPORT_SYMBOL_GPL(hw_rule_sz); +EXPORT_SYMBOL_GPL(mlx4_hw_rule_sz); static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec, struct _rule_hw *rule_hw) { - if (hw_rule_sz(dev, spec->id) < 0) + if (mlx4_hw_rule_sz(dev, spec->id) < 0) return -EINVAL; - memset(rule_hw, 0, hw_rule_sz(dev, spec->id)); + memset(rule_hw, 0, mlx4_hw_rule_sz(dev, spec->id)); rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]); - rule_hw->size = hw_rule_sz(dev, spec->id) >> 2; + rule_hw->size = mlx4_hw_rule_sz(dev, spec->id) >> 2; switch (spec->id) { case MLX4_NET_TRANS_RULE_ID_ETH: @@ -855,6 +885,13 @@ static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec, rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk; break; + case MLX4_NET_TRANS_RULE_ID_VXLAN: + rule_hw->vxlan.vni = + cpu_to_be32(be32_to_cpu(spec->vxlan.vni) << 8); + rule_hw->vxlan.vni_mask = + cpu_to_be32(be32_to_cpu(spec->vxlan.vni_mask) << 8); + break; + default: return -EINVAL; } @@ -879,7 +916,10 @@ static void mlx4_err_rule(struct mlx4_dev *dev, char *str, switch (cur->id) { case MLX4_NET_TRANS_RULE_ID_ETH: len += snprintf(buf + len, BUF_SIZE - len, - "dmac = %pM ", &cur->eth.dst_mac); + "dmac = 0x%02x%02x%02x%02x%02x%02x ", + cur->eth.dst_mac[0], cur->eth.dst_mac[1], + cur->eth.dst_mac[2], cur->eth.dst_mac[3], + cur->eth.dst_mac[4], cur->eth.dst_mac[5]); if (cur->eth.ether_type) len += snprintf(buf + len, BUF_SIZE - len, "ethertype = 0x%x ", @@ -922,6 +962,10 @@ static void mlx4_err_rule(struct mlx4_dev *dev, char *str, GID_PRINT_ARGS(cur->ib.dst_gid_msk)); break; + case MLX4_NET_TRANS_RULE_ID_VXLAN: + len += snprintf(buf + len, BUF_SIZE - len, + "VNID = %d ", be32_to_cpu(cur->vxlan.vni)); + break; case MLX4_NET_TRANS_RULE_ID_IPV6: break; @@ -933,7 +977,7 @@ static void mlx4_err_rule(struct mlx4_dev *dev, char *str, mlx4_err(dev, "%s", buf); if (len >= BUF_SIZE) - mlx4_err(dev, "Network rule error message was truncated, print buffer is too small.\n"); + mlx4_err(dev, "Network rule error message was truncated, print buffer is too small\n"); } int mlx4_flow_attach(struct mlx4_dev *dev, @@ -948,7 +992,6 @@ int mlx4_flow_attach(struct mlx4_dev *dev, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, sizeof(struct mlx4_net_trans_rule_hw_ctrl)); trans_rule_ctrl_to_hw(rule, mailbox->buf); size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); @@ -957,18 +1000,33 @@ int mlx4_flow_attach(struct mlx4_dev *dev, ret = parse_trans_rule(dev, cur, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(dev, mailbox); - return -EINVAL; + return ret; } size += ret; } ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id); - if (ret == -ENOMEM) + if (ret == -ENOMEM) { mlx4_err_rule(dev, - "mcg table is full. Fail to register network rule.\n", + "mcg table is full. 
Fail to register network rule\n", rule); - else if (ret) - mlx4_err_rule(dev, "Fail to register network rule.\n", rule); + } else if (ret) { + if (ret == -ENXIO) { + if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_err_rule(dev, + "DMFS is not enabled, " + "failed to register network rule.\n", + rule); + else + mlx4_err_rule(dev, + "Rule exceeds the dmfs_high_rate_mode limitations, " + "failed to register network rule.\n", + rule); + + } else { + mlx4_err_rule(dev, "Fail to register network rule.\n", rule); + } + } mlx4_free_cmd_mailbox(dev, mailbox); @@ -988,7 +1046,46 @@ int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id) } EXPORT_SYMBOL_GPL(mlx4_flow_detach); -int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn) +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, + int port, int qpn, u16 prio, u64 *reg_id) +{ + int err; + struct mlx4_spec_list spec_eth_outer = { {NULL} }; + struct mlx4_spec_list spec_vxlan = { {NULL} }; + struct mlx4_spec_list spec_eth_inner = { {NULL} }; + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + }; + + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + rule.port = port; + rule.qpn = qpn; + rule.priority = prio; + INIT_LIST_HEAD(&rule.list); + + spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN); + memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + + spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN; /* any vxlan header */ + spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH; /* any inner eth header */ + + list_add_tail(&spec_eth_outer.list, &rule.list); + list_add_tail(&spec_vxlan.list, &rule.list); + list_add_tail(&spec_eth_inner.list, &rule.list); + + err = mlx4_flow_attach(dev, &rule, reg_id); + return err; +} +EXPORT_SYMBOL(mlx4_tunnel_steer_add); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, + u32 max_range_qpn) { int err; u64 in_param; @@ -1012,7 +1109,7 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; u32 members_count; - int index, prev; + int index = -1, prev; int link = 0; int i; int err; @@ -1053,7 +1150,7 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], members_count = be32_to_cpu(mgm->members_count) & 0xffffff; if (members_count == dev->caps.num_qp_per_mgm) { - mlx4_err(dev, "MGM at index %x is full.\n", index); + mlx4_err(dev, "MGM at index %x is full\n", index); err = -ENOMEM; goto out; } @@ -1065,8 +1162,11 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], goto out; } - mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | - (!!mlx4_blck_lb << MGM_BLCK_LB_BIT)); + if (block_mcast_loopback) + mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | + (1U << MGM_BLCK_LB_BIT)); + else + mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30); @@ -1074,9 +1174,8 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], if (err) goto out; - /* if !link, still add the new entry. 
*/ if (!link) - goto skip_link; + goto out; err = mlx4_READ_ENTRY(dev, prev, mailbox); if (err) @@ -1088,20 +1187,19 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], if (err) goto out; -skip_link: - if (prot == MLX4_PROT_ETH) { +out: + if (prot == MLX4_PROT_ETH && index != -1) { /* manage the steering entry for promisc mode */ if (new_entry) - new_steering_entry(dev, port, steer, index, qp->qpn); + err = new_steering_entry(dev, port, steer, + index, qp->qpn); else - existing_steering_entry(dev, port, steer, - index, qp->qpn); + err = existing_steering_entry(dev, port, steer, + index, qp->qpn); } - -out: if (err && link && index != -1) { if (index < dev->caps.num_mgms) - mlx4_warn(dev, "Got AMGM index %d < %d", + mlx4_warn(dev, "Got AMGM index %d < %d\n", index, dev->caps.num_mgms); else mlx4_bitmap_free(&priv->mcg_table.bitmap, @@ -1145,10 +1243,9 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], goto out; } - /* - if this QP is also a promisc QP, it shouldn't be removed only if - at least one none promisc QP is also attached to this MCG - */ + /* If this QP is also a promisc QP, it shouldn't be removed only if + * at least one none promisc QP is also attached to this MCG + */ if (prot == MLX4_PROT_ETH && check_duplicate_entry(dev, port, steer, index, qp->qpn) && !promisc_steering_entry(dev, port, steer, index, qp->qpn, NULL)) @@ -1199,7 +1296,7 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], if (amgm_index) { if (amgm_index < dev->caps.num_mgms) - mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d", + mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d\n", index, amgm_index, dev->caps.num_mgms); else mlx4_bitmap_free(&priv->mcg_table.bitmap, @@ -1219,7 +1316,7 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], goto out; if (index < dev->caps.num_mgms) - mlx4_warn(dev, "entry %d had next AMGM index %d < %d", + mlx4_warn(dev, "entry %d had next AMGM index %d < %d\n", prev, index, dev->caps.num_mgms); else mlx4_bitmap_free(&priv->mcg_table.bitmap, @@ -1230,6 +1327,9 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mutex_unlock(&priv->mcg_table.mutex); mlx4_free_cmd_mailbox(dev, mailbox); + if (err && dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + /* In case device is under an error, return success as a closing command */ + err = 0; return err; } @@ -1259,6 +1359,9 @@ static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev, mailbox); + if (err && !attach && + dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + err = 0; return err; } @@ -1306,9 +1409,6 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], u8 port, int block_mcast_loopback, enum mlx4_protocol prot, u64 *reg_id) { - enum mlx4_steer_type steer; - steer = (is_valid_ether_addr(&gid[10])) ? 
MLX4_UC_STEER : MLX4_MC_STEER; - switch (dev->caps.steering_mode) { case MLX4_STEERING_MODE_A0: if (prot == MLX4_PROT_ETH) @@ -1316,7 +1416,7 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], case MLX4_STEERING_MODE_B0: if (prot == MLX4_PROT_ETH) - gid[7] |= (steer << 1); + gid[7] |= (MLX4_MC_STEER << 1); if (mlx4_is_mfunc(dev)) return mlx4_QP_ATTACH(dev, qp, gid, 1, @@ -1338,9 +1438,6 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_attach); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], enum mlx4_protocol prot, u64 reg_id) { - enum mlx4_steer_type steer; - steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER; - switch (dev->caps.steering_mode) { case MLX4_STEERING_MODE_A0: if (prot == MLX4_PROT_ETH) @@ -1348,7 +1445,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], case MLX4_STEERING_MODE_B0: if (prot == MLX4_PROT_ETH) - gid[7] |= (steer << 1); + gid[7] |= (MLX4_MC_STEER << 1); if (mlx4_is_mfunc(dev)) return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); @@ -1368,7 +1465,12 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_detach); int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, enum mlx4_net_trans_promisc_mode mode) { - struct mlx4_net_trans_rule rule; + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + }; + u64 *regid_p; switch (mode) { @@ -1459,11 +1561,14 @@ int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { u32 qpn = (u32) vhcr->in_param & 0xffffffff; - u8 port = vhcr->in_param >> 62; + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_param >> 62); enum mlx4_steer_type steer = vhcr->in_modifier; - /* Promiscuous unicast is not allowed in mfunc for VFs */ - if ((slave != dev->caps.function) && (steer == MLX4_UC_STEER)) + if (port < 0) + return -EINVAL; + + /* Promiscuous unicast is not allowed in mfunc */ + if (mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER) return 0; if (vhcr->op_modifier) diff --git a/sys/dev/mlx4/mlx4_core/mlx4_mr.c b/sys/dev/mlx4/mlx4_core/mlx4_mr.c index c194105249d8..e58b5cfbb8d3 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_mr.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_mr.c @@ -119,8 +119,11 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) for (i = 0; i <= buddy->max_order; ++i) { s = BITS_TO_LONGS(1 << (buddy->max_order - i)); buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN); - if (!buddy->bits[i]) - goto err_out_free; + if (!buddy->bits[i]) { + buddy->bits[i] = vzalloc(s * sizeof(long)); + if (!buddy->bits[i]) + goto err_out_free; + } } set_bit(0, buddy->bits[buddy->max_order]); @@ -130,7 +133,7 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) err_out_free: for (i = 0; i <= buddy->max_order; ++i) - kfree(buddy->bits[i]); + kvfree(buddy->bits[i]); err_out: kfree(buddy->bits); @@ -144,7 +147,7 @@ static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy) int i; for (i = 0; i <= buddy->max_order; ++i) - kfree(buddy->bits[i]); + kvfree(buddy->bits[i]); kfree(buddy->bits); kfree(buddy->num_free); @@ -210,11 +213,8 @@ int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, ++mtt->order; mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order); - if (mtt->offset == -1) { - mlx4_err(dev, "Failed to allocate mtts for %d pages(order %d)\n", - npages, mtt->order); + if (mtt->offset == -1) return -ENOMEM; - } return 0; } @@ -247,11 +247,11 @@ static void mlx4_free_mtt_range(struct mlx4_dev *dev, 
u32 offset, int order) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) - mlx4_warn(dev, "Failed to free mtt range at:" - "%d order:%d\n", offset, order); + mlx4_warn(dev, "Failed to free mtt range at:%d order:%d\n", + offset, order); return; } - __mlx4_free_mtt_range(dev, offset, order); + __mlx4_free_mtt_range(dev, offset, order); } void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt) @@ -295,6 +295,130 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); } +/* Must protect against concurrent access */ +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry) +{ + int err; + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + struct mlx4_cmd_mailbox *mailbox = NULL; + + if (mmr->enabled != MLX4_MPT_EN_HW) + return -EINVAL; + + err = mlx4_HW2SW_MPT(dev, NULL, key); + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); + mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); + return err; + } + + mmr->enabled = MLX4_MPT_EN_SW; + + if (!mlx4_is_mfunc(dev)) { + **mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + key, NULL); + } else { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, key, + 0, MLX4_CMD_QUERY_MPT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto free_mailbox; + + *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf; + } + + if (!(*mpt_entry) || !(**mpt_entry)) { + err = -ENOMEM; + goto free_mailbox; + } + + return 0; + +free_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt); + +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry) +{ + int err; + + if (!mlx4_is_mfunc(dev)) { + /* Make sure any changes to this entry are flushed */ + wmb(); + + *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW; + + /* Make sure the new status is written */ + wmb(); + + err = mlx4_SYNC_TPT(dev); + } else { + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + + err = mlx4_SW2HW_MPT(dev, mailbox, key); + } + + if (!err) { + mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; + mmr->enabled = MLX4_MPT_EN_HW; + } + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt); + +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry) +{ + if (mlx4_is_mfunc(dev)) { + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + mlx4_free_cmd_mailbox(dev, mailbox); + } +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt); + +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn) +{ + u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK; + /* The wrapper function will put the slave's id here */ + if (mlx4_is_mfunc(dev)) + pd_flags &= ~MLX4_MPT_PD_VF_MASK; + + mpt_entry->pd_flags = cpu_to_be32(pd_flags | + (pdn & MLX4_MPT_PD_MASK) + | MLX4_MPT_PD_FLAG_EN_INV); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd); + +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access) +{ + u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) | + (access & MLX4_PERM_MASK); + + mpt_entry->flags = cpu_to_be32(flags); + return 0; +} 
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access); + static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr) @@ -361,14 +485,14 @@ static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index) __mlx4_mpt_release(dev, index); } -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; - return mlx4_table_get(dev, &mr_table->dmpt_table, index); + return mlx4_table_get(dev, &mr_table->dmpt_table, index, gfp); } -static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) { u64 param = 0; @@ -379,7 +503,7 @@ static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_mpt_alloc_icm(dev, index); + return __mlx4_mpt_alloc_icm(dev, index, gfp); } void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) @@ -433,8 +557,8 @@ static int mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr) key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1)); if (err) { - mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); - mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); + mlx4_warn(dev, "HW2SW_MPT failed (%d), MR has MWs bound to it\n", + err); return err; } @@ -460,13 +584,58 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) } EXPORT_SYMBOL_GPL(mlx4_mr_free); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + mlx4_mtt_cleanup(dev, &mr->mtt); + mr->mtt.order = -1; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup); + +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry) +{ + int err; + + err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); + if (err) + return err; + + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(size); + mpt_entry->entity_size = cpu_to_be32(page_shift); + mpt_entry->flags &= ~(cpu_to_be32(MLX4_MPT_FLAG_FREE | + MLX4_MPT_FLAG_SW_OWNS)); + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + if (mr->mtt.page_shift == 0) + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } + mr->enabled = MLX4_MPT_EN_SW; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write); + int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) { struct mlx4_cmd_mailbox *mailbox; struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key)); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key), GFP_KERNEL); if (err) return err; @@ -476,9 +645,6 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) goto err_table; } mpt_entry = mailbox->buf; - - memset(mpt_entry, 0, sizeof *mpt_entry); - mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO | MLX4_MPT_FLAG_REGION | mr->access); @@ -542,13 +708,13 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, if 
(!mtts) return -ENOMEM; - dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + dma_sync_single_for_cpu(&dev->persist->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + dma_sync_single_for_device(&dev->persist->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); return 0; @@ -627,13 +793,14 @@ int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, EXPORT_SYMBOL_GPL(mlx4_write_mtt); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf) + struct mlx4_buf *buf, gfp_t gfp) { u64 *page_list; int err; int i; - page_list = kmalloc(buf->npages * sizeof *page_list, GFP_KERNEL); + page_list = kmalloc(buf->npages * sizeof *page_list, + gfp); if (!page_list) return -ENOMEM; @@ -655,6 +822,12 @@ int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type, { u32 index; + if ((type == MLX4_MW_TYPE_1 && + !(dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)) || + (type == MLX4_MW_TYPE_2 && + !(dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN))) + return -ENOTSUPP; + index = mlx4_mpt_reserve(dev); if (index == -1) return -ENOMEM; @@ -674,7 +847,7 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key)); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key), GFP_KERNEL); if (err) return err; @@ -685,11 +858,9 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) } mpt_entry = mailbox->buf; - memset(mpt_entry, 0, sizeof(*mpt_entry)); - /* Note that the MLX4_MPT_FLAG_REGION bit in mpt_entry->flags is turned - * off, thus creating a memory window and not a memory region. - */ + * off, thus creating a memory window and not a memory region. 
+ */ mpt_entry->key = cpu_to_be32(key_to_hw_index(mw->key)); mpt_entry->pd_flags = cpu_to_be32(mw->pd); if (mw->type == MLX4_MW_TYPE_2) { @@ -759,8 +930,8 @@ int mlx4_init_mr_table(struct mlx4_dev *dev) return err; err = mlx4_buddy_init(&mr_table->mtt_buddy, - ilog2(div_u64(dev->caps.num_mtts, - (1 << log_mtts_per_seg)))); + ilog2((u32)dev->caps.num_mtts / + (1 << log_mtts_per_seg))); if (err) goto err_buddy; @@ -769,7 +940,7 @@ int mlx4_init_mr_table(struct mlx4_dev *dev) mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)); if (priv->reserved_mtts < 0) { - mlx4_warn(dev, "MTT table of order %u is too small.\n", + mlx4_warn(dev, "MTT table of order %u is too small\n", mr_table->mtt_buddy.max_order); err = -ENOMEM; goto err_reserve_mtts; @@ -849,13 +1020,13 @@ int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list /* Make sure MPT status is visible before writing MTT entries */ wmb(); - dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle, + dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle, npages * sizeof(u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle, + dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle, npages * sizeof(u64), DMA_TO_DEVICE); fmr->mpt->key = cpu_to_be32(key); @@ -879,7 +1050,7 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, int max_maps, u8 page_shift, struct mlx4_fmr *fmr) { struct mlx4_priv *priv = mlx4_priv(dev); - int err = -ENOMEM, ret; + int err = -ENOMEM; if (max_maps > dev->caps.max_fmr_maps) return -EINVAL; @@ -913,9 +1084,7 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, return 0; err_free: - ret = mlx4_mr_free(dev, &fmr->mr); - if (ret) - mlx4_err(dev, "Error deregistering MR. 
The system may have become unstable."); + (void) mlx4_mr_free(dev, &fmr->mr); return err; } EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); @@ -941,34 +1110,30 @@ EXPORT_SYMBOL_GPL(mlx4_fmr_enable); void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey) { - u32 key; + struct mlx4_cmd_mailbox *mailbox; + int err; if (!fmr->maps) return; - key = key_to_hw_index(fmr->mr.key) & (dev->caps.num_mpts - 1); - - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; - - /* Make sure MPT status is visible before changing MPT fields */ - wmb(); - - fmr->mr.key = hw_index_to_key(key); - - fmr->mpt->key = cpu_to_be32(key); - fmr->mpt->lkey = cpu_to_be32(key); - fmr->mpt->length = 0; - fmr->mpt->start = 0; - - /* Make sure MPT data is visible before changing MPT status */ - wmb(); - - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_HW; - - /* Make sure MPT satus is visible */ - wmb(); - fmr->maps = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + pr_warn("mlx4_ib: mlx4_alloc_cmd_mailbox failed (%d)\n", err); + return; + } + + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(fmr->mr.key) & + (dev->caps.num_mpts - 1)); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) { + pr_warn("mlx4_ib: mlx4_HW2SW_MPT failed (%d)\n", err); + return; + } + fmr->mr.enabled = MLX4_MPT_EN_SW; } EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); @@ -990,7 +1155,7 @@ EXPORT_SYMBOL_GPL(mlx4_fmr_free); int mlx4_SYNC_TPT(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000, - MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/sys/dev/mlx4/mlx4_core/mlx4_pd.c b/sys/dev/mlx4/mlx4_core/mlx4_pd.c index ecbe3eae7f4d..25fb9e754535 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_pd.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_pd.c @@ -151,11 +151,13 @@ int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) return -ENOMEM; if (mlx4_is_slave(dev)) - offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) / + offset = uar->index % ((int)pci_resource_len(dev->persist->pdev, + 2) / dev->caps.uar_page_size); else offset = uar->index; - uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset; + uar->pfn = (pci_resource_start(dev->persist->pdev, 2) >> PAGE_SHIFT) + + offset; uar->map = NULL; return 0; } @@ -167,7 +169,6 @@ void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar) } EXPORT_SYMBOL_GPL(mlx4_uar_free); -#ifndef CONFIG_PPC int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -186,9 +187,9 @@ int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) err = -ENOMEM; goto out; } - uar = kmalloc_node(sizeof *uar, GFP_KERNEL, node); + uar = kmalloc_node(sizeof(*uar), GFP_KERNEL, node); if (!uar) { - uar = kmalloc(sizeof *uar, GFP_KERNEL); + uar = kmalloc(sizeof(*uar), GFP_KERNEL); if (!uar) { err = -ENOMEM; goto out; @@ -204,7 +205,9 @@ int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) goto free_uar; } - uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT, PAGE_SIZE); + uar->bf_map = io_mapping_map_wc(priv->bf_mapping, + uar->index << PAGE_SHIFT, + PAGE_SIZE); if (!uar->bf_map) { err = -ENOMEM; goto unamp_uar; @@ -213,7 +216,6 @@ int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) list_add(&uar->bf_list, &priv->bf_list); } - bf->uar = uar; idx = ffz(uar->free_bf_bmap); uar->free_bf_bmap |= 1 << idx; bf->uar = uar; @@ -267,26 +269,16 
@@ void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf) } EXPORT_SYMBOL_GPL(mlx4_bf_free); -#else -int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) -{ - memset(bf, 0, sizeof *bf); - return -ENOSYS; -} -EXPORT_SYMBOL_GPL(mlx4_bf_alloc); - -void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf) -{ - return; -} -EXPORT_SYMBOL_GPL(mlx4_bf_free); -#endif - int mlx4_init_uar_table(struct mlx4_dev *dev) { - if (dev->caps.num_uars <= 128) { - mlx4_err(dev, "Only %d UAR pages (need more than 128)\n", - dev->caps.num_uars); + int num_reserved_uar = mlx4_get_num_reserved_uar(dev); + + mlx4_dbg(dev, "uar_page_shift = %d", dev->uar_page_shift); + mlx4_dbg(dev, "Effective reserved_uars=%d", dev->caps.reserved_uars); + + if (dev->caps.num_uars <= num_reserved_uar) { + mlx4_err(dev, "Only %d UAR pages (need more than %d)\n", + dev->caps.num_uars, num_reserved_uar); mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n"); return -ENODEV; } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_port.c b/sys/dev/mlx4/mlx4_core/mlx4_port.c index 00c88de91ee0..fbe215932306 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_port.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_port.c @@ -39,21 +39,26 @@ #include #include + +#include + #include "mlx4.h" #include - -int mlx4_set_4k_mtu = -1; -module_param_named(set_4k_mtu, mlx4_set_4k_mtu, int, 0444); -MODULE_PARM_DESC(set_4k_mtu, - "(Obsolete) attempt to set 4K MTU to all ConnectX ports"); - - #define MLX4_MAC_VALID (1ull << 63) #define MLX4_VLAN_VALID (1u << 31) #define MLX4_VLAN_MASK 0xfff +#define MLX4_STATS_TRAFFIC_COUNTERS_MASK 0xfULL +#define MLX4_STATS_TRAFFIC_DROPS_MASK 0xc0ULL +#define MLX4_STATS_ERROR_COUNTERS_MASK 0x1ffc30ULL +#define MLX4_STATS_PORT_COUNTERS_MASK 0x1fe00000ULL + +#define MLX4_FLAG_V_IGNORE_FCS_MASK 0x2 +#define MLX4_IGNORE_FCS_MASK 0x1 +#define MLX4_TC_MAX_NUMBER 8 + void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) { int i; @@ -62,6 +67,7 @@ void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { table->entries[i] = 0; table->refs[i] = 0; + table->is_dup[i] = false; } table->max = 1 << dev->caps.log_num_macs; table->total = 0; @@ -75,17 +81,28 @@ void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { table->entries[i] = 0; table->refs[i] = 0; + table->is_dup[i] = false; } table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; table->total = 0; } +void mlx4_init_roce_gid_table(struct mlx4_dev *dev, + struct mlx4_roce_gid_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) + memset(table->roce_gids[i].raw, 0, MLX4_ROCE_GID_ENTRY_SIZE); +} + static int validate_index(struct mlx4_dev *dev, struct mlx4_mac_table *table, int index) { int err = 0; - if (index < 0 || index >= table->max || !table->refs[index]) { + if (index < 0 || index >= table->max || !table->entries[index]) { mlx4_warn(dev, "No valid Mac entry for the given index\n"); err = -EINVAL; } @@ -98,7 +115,8 @@ static int find_index(struct mlx4_dev *dev, int i; for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { - if ((mac & MLX4_MAC_MASK) == + if (table->refs[i] && + (MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) return i; } @@ -121,39 +139,154 @@ static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, 
MLX4_CMD_NATIVE); + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } +int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int i; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!table->refs[i]) + continue; + + if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { + *idx = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(mlx4_find_cached_mac); + +static bool mlx4_need_mf_bond(struct mlx4_dev *dev) +{ + int i, num_eth_ports = 0; + + if (!mlx4_is_mfunc(dev)) + return false; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + ++num_eth_ports; + + return (num_eth_ports == 2) ? true : false; +} + int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; int i, err = 0; int free = -1; + int free_for_dup = -1; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; + bool need_mf_bond = mlx4_need_mf_bond(dev); + bool can_mf_bond = true; - mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n", - (unsigned long long) mac, port); + mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d %s duplicate\n", + (unsigned long long)mac, port, + dup ? "with" : "without"); + + if (need_mf_bond) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } + + if (need_mf_bond) { + int index_at_port = -1; + int index_at_dup_port = -1; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))) + index_at_port = i; + if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i])))) + index_at_dup_port = i; + } + + /* check that same mac is not in the tables at different indices */ + if ((index_at_port != index_at_dup_port) && + (index_at_port >= 0) && + (index_at_dup_port >= 0)) + can_mf_bond = false; + + /* If the mac is already in the primary table, the slot must be + * available in the duplicate table as well. + */ + if (index_at_port >= 0 && index_at_dup_port < 0 && + dup_table->refs[index_at_port]) { + can_mf_bond = false; + } + /* If the mac is already in the duplicate table, check that the + * corresponding index is not occupied in the primary table, or + * the primary table already contains the mac at the same index. + * Otherwise, you cannot bond (primary contains a different mac + * at that index). 
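
The bonding rule spelled out in the comments above can be reduced to a small predicate. Below is a minimal userspace sketch of it, assuming a fixed-size table and treating a zero entry as free; the demo_* names and sizes are illustrative only and are not driver API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_TABLE_SIZE 8

static int demo_find(const uint64_t *table, uint64_t mac)
{
	int i;

	for (i = 0; i < DEMO_TABLE_SIZE; i++)
		if (table[i] == mac)
			return i;
	return -1;
}

/*
 * A MAC may be mirrored across the two ports only if it is absent from
 * both tables, present at the same index in both, or present in one
 * table while the same index is still free in the other.
 */
static bool demo_can_mf_bond(const uint64_t *t_port, const uint64_t *t_dup,
    uint64_t mac)
{
	int at_port = demo_find(t_port, mac);
	int at_dup = demo_find(t_dup, mac);

	if (at_port >= 0 && at_dup >= 0 && at_port != at_dup)
		return false;		/* same MAC at two different indices */
	if (at_port >= 0 && at_dup < 0 && t_dup[at_port] != 0)
		return false;		/* mirror slot already taken */
	if (at_dup >= 0 && at_port < 0 && t_port[at_dup] != 0)
		return false;		/* mirror slot already taken */
	return true;
}

int main(void)
{
	uint64_t port1[DEMO_TABLE_SIZE] = { 0x0002c9000001ULL };
	uint64_t port2[DEMO_TABLE_SIZE] = { 0 };

	/* MAC is only in port 1's table and index 0 is free on port 2. */
	printf("can bond: %d\n",
	    demo_can_mf_bond(port1, port2, 0x0002c9000001ULL));
	return 0;
}
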
+ */ + if (index_at_dup_port >= 0) { + if (!table->refs[index_at_dup_port] || + ((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[index_at_dup_port])))) + free_for_dup = index_at_dup_port; + else + can_mf_bond = false; + } + } - mutex_lock(&table->mutex); for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { - if (free < 0 && !table->refs[i]) { - free = i; + if (!table->refs[i]) { + if (free < 0) + free = i; + if (free_for_dup < 0 && need_mf_bond && can_mf_bond) { + if (!dup_table->refs[i]) + free_for_dup = i; + } continue; } - if ((mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) && - table->refs[i]) { - /* MAC already registered, Must not have duplicates */ + if ((MLX4_MAC_MASK & mac) == + (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { + /* MAC already registered, increment ref count */ err = i; ++table->refs[i]; + if (dup) { + u64 dup_mac = MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i]); + + if (dup_mac != mac || !dup_table->is_dup[i]) { + mlx4_warn(dev, "register mac: expect duplicate mac 0x%llx on port %d index %d\n", + (long long)mac, dup_port, i); + } + } goto out; } } + if (need_mf_bond && (free_for_dup < 0)) { + if (dup) { + mlx4_warn(dev, "Fail to allocate duplicate MAC table entry\n"); + mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n"); + dup = false; + } + can_mf_bond = false; + } + + if (need_mf_bond && can_mf_bond) + free = free_for_dup; + mlx4_dbg(dev, "Free MAC index is %d\n", free); if (table->total == table->max) { @@ -173,11 +306,35 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) goto out; } table->refs[free] = 1; - - err = free; + table->is_dup[free] = false; ++table->total; + if (dup) { + dup_table->refs[free] = 0; + dup_table->is_dup[free] = true; + dup_table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding duplicate mac: 0x%llx\n", (long long)mac); + dup_table->is_dup[free] = false; + dup_table->entries[free] = 0; + goto out; + } + ++dup_table->total; + } + err = free; out: - mutex_unlock(&table->mutex); + if (need_mf_bond) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } return err; } EXPORT_SYMBOL_GPL(__mlx4_register_mac); @@ -224,6 +381,9 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) struct mlx4_port_info *info; struct mlx4_mac_table *table; int index; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; if (port < 1 || port > dev->caps.num_ports) { mlx4_warn(dev, "invalid port number (%d), aborting...\n", port); @@ -231,24 +391,59 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) } info = &mlx4_priv(dev)->port[port]; table = &info->mac_table; - mutex_lock(&table->mutex); + + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } index = find_index(dev, table, mac); if (validate_index(dev, table, index)) goto out; - if (--table->refs[index]) { - mlx4_dbg(dev, "Have more references for index %d," - "no need to modify mac table\n", index); + if (--table->refs[index] || table->is_dup[index]) { + mlx4_dbg(dev, "Have more references for index %d, no need to modify mac table\n", + index); + if (!table->refs[index]) + dup_table->is_dup[index] = false; goto out; } table->entries[index] = 0; - mlx4_set_port_mac_table(dev, port, table->entries); + if (mlx4_set_port_mac_table(dev, port, table->entries)) + mlx4_warn(dev, "Fail to set mac in port %d during unregister\n", port); --table->total; + + if (dup) { + dup_table->is_dup[index] = false; + if (dup_table->refs[index]) + goto out; + dup_table->entries[index] = 0; + if (mlx4_set_port_mac_table(dev, dup_port, dup_table->entries)) + mlx4_warn(dev, "Fail to set mac in duplicate port %d during unregister\n", dup_port); + + --table->total; + } out: - mutex_unlock(&table->mutex); + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } } EXPORT_SYMBOL_GPL(__mlx4_unregister_mac); @@ -282,9 +477,22 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) struct mlx4_mac_table *table = &info->mac_table; int index = qpn - info->base_qpn; int err = 0; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; /* CX1 doesn't support multi-functions */ - mutex_lock(&table->mutex); + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } err = validate_index(dev, table, index); if (err) @@ -297,9 +505,30 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) new_mac); table->entries[index] = 0; + } else { + if (dup) { + dup_table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding duplicate MAC: 0x%llx\n", + (unsigned long long)new_mac); + dup_table->entries[index] = 0; + } + } } out: - mutex_unlock(&table->mutex); + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } return err; } EXPORT_SYMBOL_GPL(__mlx4_replace_mac); @@ -317,8 +546,9 @@ static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE); in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); @@ -350,8 +580,28 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int i, err = 0; int free = -1; + int free_for_dup = -1; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table; + bool need_mf_bond = mlx4_need_mf_bond(dev); + bool can_mf_bond = true; - mutex_lock(&table->mutex); + mlx4_dbg(dev, "Registering VLAN: %d for port %d %s duplicate\n", + vlan, port, + dup ? "with" : "without"); + + if (need_mf_bond) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } if (table->total == table->max) { /* No free vlan entries */ @@ -359,22 +609,85 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, goto out; } + if (need_mf_bond) { + int index_at_port = -1; + int index_at_dup_port = -1; + + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { + if (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i]))) + index_at_port = i; + if (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i]))) + index_at_dup_port = i; + } + /* check that same vlan is not in the tables at different indices */ + if ((index_at_port != index_at_dup_port) && + (index_at_port >= 0) && + (index_at_dup_port >= 0)) + can_mf_bond = false; + + /* If the vlan is already in the primary table, the slot must be + * available in the duplicate table as well. 
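
Both the MAC and the VLAN paths above take the two per-port table mutexes in a fixed order (port 1's table first) so that concurrent register/unregister calls on different ports cannot deadlock. A rough userspace model of that convention follows, with pthread mutexes standing in for the kernel's mutex_lock()/mutex_lock_nested() pair; the demo_* names are made up.

#include <pthread.h>
#include <stdio.h>

struct demo_port_table {
	pthread_mutex_t mutex;
	/* table entries would live here */
};

static struct demo_port_table demo_tables[2] = {	/* [0] = port 1 */
	{ .mutex = PTHREAD_MUTEX_INITIALIZER },
	{ .mutex = PTHREAD_MUTEX_INITIALIZER },
};

/* Always acquire port 1's table first, whichever port the caller owns. */
static void demo_lock_both(int port)
{
	struct demo_port_table *own = &demo_tables[port - 1];
	struct demo_port_table *dup = &demo_tables[port == 1 ? 1 : 0];

	if (port == 1) {
		pthread_mutex_lock(&own->mutex);
		pthread_mutex_lock(&dup->mutex);
	} else {
		pthread_mutex_lock(&dup->mutex);
		pthread_mutex_lock(&own->mutex);
	}
}

/* Release in reverse order: port 2's table first, then port 1's. */
static void demo_unlock_both(int port)
{
	struct demo_port_table *own = &demo_tables[port - 1];
	struct demo_port_table *dup = &demo_tables[port == 1 ? 1 : 0];

	if (port == 1) {
		pthread_mutex_unlock(&dup->mutex);
		pthread_mutex_unlock(&own->mutex);
	} else {
		pthread_mutex_unlock(&own->mutex);
		pthread_mutex_unlock(&dup->mutex);
	}
}

int main(void)
{
	demo_lock_both(2);
	printf("both tables locked for a port 2 update\n");
	demo_unlock_both(2);
	return 0;
}
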
+ */ + if (index_at_port >= 0 && index_at_dup_port < 0 && + dup_table->refs[index_at_port]) { + can_mf_bond = false; + } + /* If the vlan is already in the duplicate table, check that the + * corresponding index is not occupied in the primary table, or + * the primary table already contains the vlan at the same index. + * Otherwise, you cannot bond (primary contains a different vlan + * at that index). + */ + if (index_at_dup_port >= 0) { + if (!table->refs[index_at_dup_port] || + (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[index_at_dup_port])))) + free_for_dup = index_at_dup_port; + else + can_mf_bond = false; + } + } + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { - if (free < 0 && (table->refs[i] == 0)) { - free = i; - continue; + if (!table->refs[i]) { + if (free < 0) + free = i; + if (free_for_dup < 0 && need_mf_bond && can_mf_bond) { + if (!dup_table->refs[i]) + free_for_dup = i; + } } - if (table->refs[i] && + if ((table->refs[i] || table->is_dup[i]) && (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { /* Vlan already registered, increase references count */ + mlx4_dbg(dev, "vlan %u is already registered.\n", vlan); *index = i; ++table->refs[i]; + if (dup) { + u16 dup_vlan = MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i]); + + if (dup_vlan != vlan || !dup_table->is_dup[i]) { + mlx4_warn(dev, "register vlan: expected duplicate vlan %u on port %d index %d\n", + vlan, dup_port, i); + } + } goto out; } } + if (need_mf_bond && (free_for_dup < 0)) { + if (dup) { + mlx4_warn(dev, "Fail to allocate duplicate VLAN table entry\n"); + mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n"); + dup = false; + } + can_mf_bond = false; + } + + if (need_mf_bond && can_mf_bond) + free = free_for_dup; + if (free < 0) { err = -ENOMEM; goto out; @@ -382,6 +695,7 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, /* Register new VLAN */ table->refs[free] = 1; + table->is_dup[free] = false; table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); err = mlx4_set_port_vlan_table(dev, port, table->entries); @@ -391,11 +705,35 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, table->entries[free] = 0; goto out; } + ++table->total; + if (dup) { + dup_table->refs[free] = 0; + dup_table->is_dup[free] = true; + dup_table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); + + err = mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding duplicate vlan: %u\n", vlan); + dup_table->is_dup[free] = false; + dup_table->entries[free] = 0; + goto out; + } + ++dup_table->total; + } *index = free; - ++table->total; out: - mutex_unlock(&table->mutex); + if (need_mf_bond) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } return err; } @@ -425,8 +763,22 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int index; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table; + + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } - mutex_lock(&table->mutex); if (mlx4_find_cached_vlan(dev, port, vlan, &index)) { mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan); goto out; @@ -437,17 +789,38 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) goto out; } - if (--table->refs[index]) { - mlx4_dbg(dev, "Have %d more references for index %d, " - "no need to modify vlan table\n", table->refs[index], - index); + if (--table->refs[index] || table->is_dup[index]) { + mlx4_dbg(dev, "Have %d more references for index %d, no need to modify vlan table\n", + table->refs[index], index); + if (!table->refs[index]) + dup_table->is_dup[index] = false; goto out; } table->entries[index] = 0; - mlx4_set_port_vlan_table(dev, port, table->entries); + if (mlx4_set_port_vlan_table(dev, port, table->entries)) + mlx4_warn(dev, "Fail to set vlan in port %d during unregister\n", port); --table->total; + if (dup) { + dup_table->is_dup[index] = false; + if (dup_table->refs[index]) + goto out; + dup_table->entries[index] = 0; + if (mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries)) + mlx4_warn(dev, "Fail to set vlan in duplicate port %d during unregister\n", dup_port); + --dup_table->total; + } out: - mutex_unlock(&table->mutex); + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } } void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) @@ -466,6 +839,220 @@ void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) } EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); +int mlx4_bond_mac_table(struct mlx4_dev *dev) +{ + struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table; + struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table; + int ret = 0; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if ((t1->entries[i] != t2->entries[i]) && + t1->entries[i] && t2->entries[i]) { + mlx4_warn(dev, "can't duplicate entry %d in mac table\n", i); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (t1->entries[i] && !t2->entries[i]) { + t2->entries[i] = t1->entries[i]; + t2->is_dup[i] = true; + update2 = true; + } else if (!t1->entries[i] && t2->entries[i]) { + t1->entries[i] = t2->entries[i]; + t1->is_dup[i] = true; + update1 = true; + } else if (t1->entries[i] && t2->entries[i]) { + t1->is_dup[i] = true; + t2->is_dup[i] = true; + } + } + + if (update1) { + ret = mlx4_set_port_mac_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to set MAC table for port 1 (%d)\n", ret); + } + if (!ret && update2) { + ret = mlx4_set_port_mac_table(dev, 2, t2->entries); + if (ret) + mlx4_warn(dev, "failed to set MAC table for port 2 (%d)\n", ret); + } + + if (ret) + mlx4_warn(dev, "failed to create mirror MAC tables\n"); +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_unbond_mac_table(struct mlx4_dev *dev) +{ + struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table; + 
struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table; + int ret = 0; + int ret1; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (t1->entries[i] != t2->entries[i]) { + mlx4_warn(dev, "mac table is in an unexpected state when trying to unbond\n"); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!t1->entries[i]) + continue; + t1->is_dup[i] = false; + if (!t1->refs[i]) { + t1->entries[i] = 0; + update1 = true; + } + t2->is_dup[i] = false; + if (!t2->refs[i]) { + t2->entries[i] = 0; + update2 = true; + } + } + + if (update1) { + ret = mlx4_set_port_mac_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to unmirror MAC tables for port 1(%d)\n", ret); + } + if (update2) { + ret1 = mlx4_set_port_mac_table(dev, 2, t2->entries); + if (ret1) { + mlx4_warn(dev, "failed to unmirror MAC tables for port 2(%d)\n", ret1); + ret = ret1; + } + } +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_bond_vlan_table(struct mlx4_dev *dev) +{ + struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table; + struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table; + int ret = 0; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if ((t1->entries[i] != t2->entries[i]) && + t1->entries[i] && t2->entries[i]) { + mlx4_warn(dev, "can't duplicate entry %d in vlan table\n", i); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (t1->entries[i] && !t2->entries[i]) { + t2->entries[i] = t1->entries[i]; + t2->is_dup[i] = true; + update2 = true; + } else if (!t1->entries[i] && t2->entries[i]) { + t1->entries[i] = t2->entries[i]; + t1->is_dup[i] = true; + update1 = true; + } else if (t1->entries[i] && t2->entries[i]) { + t1->is_dup[i] = true; + t2->is_dup[i] = true; + } + } + + if (update1) { + ret = mlx4_set_port_vlan_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to set VLAN table for port 1 (%d)\n", ret); + } + if (!ret && update2) { + ret = mlx4_set_port_vlan_table(dev, 2, t2->entries); + if (ret) + mlx4_warn(dev, "failed to set VLAN table for port 2 (%d)\n", ret); + } + + if (ret) + mlx4_warn(dev, "failed to create mirror VLAN tables\n"); +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_unbond_vlan_table(struct mlx4_dev *dev) +{ + struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table; + struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table; + int ret = 0; + int ret1; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (t1->entries[i] != t2->entries[i]) { + mlx4_warn(dev, "vlan table is in an unexpected state when trying to unbond\n"); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (!t1->entries[i]) + continue; + t1->is_dup[i] = false; + if (!t1->refs[i]) { + t1->entries[i] = 0; + update1 = true; + } + t2->is_dup[i] = false; + if (!t2->refs[i]) { + t2->entries[i] = 0; + update2 = true; + } + } + + if (update1) { + ret = mlx4_set_port_vlan_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to unmirror VLAN tables for port 1(%d)\n", ret); + } + if (update2) { + ret1 = mlx4_set_port_vlan_table(dev, 2, 
t2->entries); + if (ret1) { + mlx4_warn(dev, "failed to unmirror VLAN tables for port 2(%d)\n", ret1); + ret = ret1; + } + } +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) { struct mlx4_cmd_mailbox *inmailbox, *outmailbox; @@ -484,8 +1071,6 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) inbuf = inmailbox->buf; outbuf = outmailbox->buf; - memset(inbuf, 0, 256); - memset(outbuf, 0, 256); inbuf[0] = 1; inbuf[1] = 1; inbuf[2] = 1; @@ -504,29 +1089,159 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) } static struct mlx4_roce_gid_entry zgid_entry; -int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave) +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port) { + int vfs; + int slave_gid = slave; + unsigned i; + struct mlx4_slaves_pport slaves_pport; + struct mlx4_active_ports actv_ports; + unsigned max_port_p_one; + if (slave == 0) return MLX4_ROCE_PF_GIDS; - if (slave <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % dev->num_vfs)) - return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs) + 1; - return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs; + + /* Slave is a VF */ + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + actv_ports = mlx4_get_active_ports(dev, slave); + max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1; + + for (i = 1; i < max_port_p_one; i++) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_slaves_pport slaves_pport_actv; + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + if (i == port) + continue; + slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid -= bitmap_weight(slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; + if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs)) + return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1; + return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs; } -int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave) +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port) { int gids; + unsigned i; + int slave_gid = slave; int vfs; - gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; - vfs = dev->num_vfs; + struct mlx4_slaves_pport slaves_pport; + struct mlx4_active_ports actv_ports; + unsigned max_port_p_one; if (slave == 0) return 0; - if (slave <= gids % vfs) - return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); - return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + actv_ports = mlx4_get_active_ports(dev, slave); + max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1; + + for (i = 1; i < max_port_p_one; i++) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_slaves_pport slaves_pport_actv; + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + if (i == port) + continue; + slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid -= bitmap_weight(slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = bitmap_weight(slaves_pport.slaves, 
dev->persist->num_vfs + 1) - 1; + if (slave_gid <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + + ((gids / vfs) * (slave_gid - 1)); +} +EXPORT_SYMBOL_GPL(mlx4_get_base_gid_ix); + +static int mlx4_reset_roce_port_gids(struct mlx4_dev *dev, int slave, + int port, struct mlx4_cmd_mailbox *mailbox) +{ + struct mlx4_roce_gid_entry *gid_entry_mbox; + struct mlx4_priv *priv = mlx4_priv(dev); + int num_gids, base, offset; + int i, err; + + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + base = mlx4_get_base_gid_ix(dev, slave, port); + + memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE); + + mutex_lock(&(priv->port[port].gid_table.mutex)); + /* Zero-out gids belonging to that slave in the port GID table */ + for (i = 0, offset = base; i < num_gids; offset++, i++) + memcpy(priv->port[port].gid_table.roce_gids[offset].raw, + zgid_entry.raw, MLX4_ROCE_GID_ENTRY_SIZE); + + /* Now, copy roce port gids table to mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)mailbox->buf; + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, + priv->port[port].gid_table.roce_gids[i].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + + err = mlx4_cmd(dev, mailbox->dma, + ((u32)port) | (MLX4_SET_PORT_GID_TABLE << 8), + MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return err; +} + + +void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave) +{ + struct mlx4_active_ports actv_ports; + struct mlx4_cmd_mailbox *mailbox; + int num_eth_ports, err; + int i; + + if (slave < 0 || slave > dev->persist->num_vfs) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + + for (i = 0, num_eth_ports = 0; i < dev->caps.num_ports; i++) { + if (test_bit(i, actv_ports.ports)) { + if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH) + continue; + num_eth_ports++; + } + } + + if (!num_eth_ports) + return; + + /* have ETH ports. 
Alloc mailbox for SET_PORT command */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return; + + for (i = 0; i < dev->caps.num_ports; i++) { + if (test_bit(i, actv_ports.ports)) { + if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH) + continue; + err = mlx4_reset_roce_port_gids(dev, slave, i + 1, mailbox); + if (err) + mlx4_warn(dev, "Could not reset ETH port GID table for slave %d, port %d (%d)\n", + slave, i + 1, err); + } + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return; } static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, @@ -555,21 +1270,17 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, __be32 new_cap_mask; port = in_mod & 0xff; - in_modifier = (in_mod >> 8) & 0xff; + in_modifier = in_mod >> 8; is_eth = op_mod; port_info = &priv->port[port]; - if (op_mod > 1) - return -EINVAL; - /* Slaves cannot perform SET_PORT operations except changing MTU */ if (is_eth) { if (slave != dev->caps.function && in_modifier != MLX4_SET_PORT_GENERAL && in_modifier != MLX4_SET_PORT_GID_TABLE) { - mlx4_warn(dev, "denying SET_PORT for slave:%d," - "port %d, config_select 0x%x\n", - slave, port, in_modifier); + mlx4_warn(dev, "denying SET_PORT for slave:%d\n", + slave); return -EINVAL; } switch (in_modifier) { @@ -612,15 +1323,28 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, } gen_context->mtu = cpu_to_be16(master->max_mtu[port]); + /* Slave cannot change Global Pause configuration */ + if (slave != mlx4_master_func_num(dev) && + ((gen_context->pptx != master->pptx) || + (gen_context->pprx != master->pprx))) { + gen_context->pptx = master->pptx; + gen_context->pprx = master->pprx; + mlx4_warn(dev, + "denying Global Pause change for slave:%d\n", + slave); + } else { + master->pptx = gen_context->pptx; + master->pprx = gen_context->pprx; + } break; case MLX4_SET_PORT_GID_TABLE: /* change to MULTIPLE entries: number of guest's gids * need a FOR-loop here over number of gids the guest has. * 1. Check no duplicates in gids passed by slave */ - num_gids = mlx4_get_slave_num_gids(dev, slave); - base = mlx4_get_base_gid_ix(dev, slave); - gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + base = mlx4_get_base_gid_ix(dev, slave, port); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); for (i = 0; i < num_gids; gid_entry_mbox++, i++) { if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, sizeof(zgid_entry))) @@ -641,13 +1365,15 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, /* 2. 
Check that do not have duplicates in OTHER * entries in the port GID table */ + + mutex_lock(&(priv->port[port].gid_table.mutex)); for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { if (i >= base && i < base + num_gids) continue; /* don't compare to slave's current gids */ - gid_entry_tbl = &priv->roce_gids[port - 1][i]; + gid_entry_tbl = &priv->port[port].gid_table.roce_gids[i]; if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry))) continue; - gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); for (j = 0; j < num_gids; gid_entry_mbox++, j++) { if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, sizeof(zgid_entry))) @@ -655,31 +1381,45 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw, sizeof(gid_entry_tbl->raw))) { /* found duplicate */ - mlx4_warn(dev, "requested gid entry for slave:%d " - "is a duplicate of gid at index %d\n", + mlx4_warn(dev, "requested gid entry for slave:%d is a duplicate of gid at index %d\n", slave, i); + mutex_unlock(&(priv->port[port].gid_table.mutex)); return -EINVAL; } } } /* insert slave GIDs with memcpy, starting at slave's base index */ - gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++) - memcpy(priv->roce_gids[port - 1][offset].raw, gid_entry_mbox->raw, 16); + memcpy(priv->port[port].gid_table.roce_gids[offset].raw, + gid_entry_mbox->raw, MLX4_ROCE_GID_ENTRY_SIZE); /* Now, copy roce port gids table to current mailbox for passing to FW */ - gid_entry_mbox = (struct mlx4_roce_gid_entry *) (inbox->buf); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) - memcpy(gid_entry_mbox->raw, priv->roce_gids[port - 1][i].raw, 16); + memcpy(gid_entry_mbox->raw, + priv->port[port].gid_table.roce_gids[i].raw, + MLX4_ROCE_GID_ENTRY_SIZE); - break; + err = mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return err; } + return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } + /* Slaves are not allowed to SET_PORT beacon (LED) blink */ + if (op_mod == MLX4_SET_PORT_BEACON_OPCODE) { + mlx4_warn(dev, "denying SET_PORT Beacon slave:%d\n", slave); + return -EPERM; + } + /* For IB, we only consider: * - The capability mask, which is set to the aggregate of all * slave function capabilities @@ -739,6 +1479,15 @@ int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { + int port = mlx4_slave_convert_port( + dev, slave, vhcr->in_modifier & 0xFF); + + if (port < 0) + return -EINVAL; + + vhcr->in_modifier = (vhcr->in_modifier & ~0xFF) | + (port & 0xFF); + return mlx4_common_set_port(dev, slave, vhcr->in_modifier, vhcr->op_modifier, inbox); } @@ -755,51 +1504,43 @@ enum { int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz) { struct mlx4_cmd_mailbox *mailbox; - int err = -EINVAL, vl_cap, pkey_tbl_flag = 0; - u32 in_mod; + int err, vl_cap, pkey_tbl_flag = 0; - if (dev->caps.port_type[port] == MLX4_PORT_TYPE_NONE) + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) return 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return 
PTR_ERR(mailbox); - memset(mailbox->buf, 0, 256); + ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; - if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { - in_mod = MLX4_SET_PORT_GENERAL << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, - MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, - MLX4_CMD_WRAPPED); - } else { - ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; + if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) { + pkey_tbl_flag = 1; + ((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz); + } - if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) { - pkey_tbl_flag = 1; - ((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz); - } - - /* IB VL CAP enum isn't used by the firmware, just numerical values */ - for (vl_cap = dev->caps.vl_cap[port]; - vl_cap >= 1; vl_cap >>= 1) { - ((__be32 *) mailbox->buf)[0] = cpu_to_be32( - (1 << MLX4_CHANGE_PORT_MTU_CAP) | - (1 << MLX4_CHANGE_PORT_VL_CAP) | - (pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) | - (dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) | - (vl_cap << MLX4_SET_PORT_VL_CAP)); - err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); - if (err != -ENOMEM) - break; - } + /* IB VL CAP enum isn't used by the firmware, just numerical values */ + for (vl_cap = 8; vl_cap >= 1; vl_cap >>= 1) { + ((__be32 *) mailbox->buf)[0] = cpu_to_be32( + (1 << MLX4_CHANGE_PORT_MTU_CAP) | + (1 << MLX4_CHANGE_PORT_VL_CAP) | + (pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) | + (dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) | + (vl_cap << MLX4_SET_PORT_VL_CAP)); + err = mlx4_cmd(dev, mailbox->dma, port, + MLX4_SET_PORT_IB_OPCODE, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + if (err != -ENOMEM) + break; } mlx4_free_cmd_mailbox(dev, mailbox); return err; } +#define SET_PORT_ROCE_2_FLAGS 0x10 +#define MLX4_SET_PORT_ROCE_V1_V2 0x2 int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) { @@ -812,8 +1553,6 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); context = mailbox->buf; - memset(context, 0, sizeof *context); - context->flags = SET_PORT_GEN_ALL_VALID; context->mtu = cpu_to_be16(mtu); context->pptx = (pptx * (!pfctx)) << 7; @@ -821,9 +1560,15 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, context->pprx = (pprx * (!pfcrx)) << 7; context->pfcrx = pfcrx; + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + context->flags |= SET_PORT_ROCE_2_FLAGS; + context->roce_mode |= + MLX4_SET_PORT_ROCE_V1_V2 << 4; + } in_mod = MLX4_SET_PORT_GENERAL << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev, mailbox); return err; @@ -847,8 +1592,6 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); context = mailbox->buf; - memset(context, 0, sizeof *context); - context->base_qpn = cpu_to_be32(base_qpn); context->n_mac = dev->caps.log_num_macs; context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | @@ -861,86 +1604,103 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, context->vlan_miss = MLX4_VLAN_MISS_IDX; in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, 
in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev, mailbox); return err; } EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc); -int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc) +int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value) { struct mlx4_cmd_mailbox *mailbox; - struct mlx4_set_port_prio2tc_context *context; - int err; + struct mlx4_set_port_general_context *context; u32 in_mod; - int i; + int err; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); context = mailbox->buf; - memset(context, 0, sizeof *context); + context->v_ignore_fcs |= MLX4_FLAG_V_IGNORE_FCS_MASK; + if (ignore_fcs_value) + context->ignore_fcs |= MLX4_IGNORE_FCS_MASK; + else + context->ignore_fcs &= ~MLX4_IGNORE_FCS_MASK; - for (i = 0; i < MLX4_NUM_UP; i += 2) - context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1]; - - in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port; + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } -EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC); +EXPORT_SYMBOL(mlx4_SET_PORT_fcs_check); -int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, - u8 *pg, u16 *ratelimit) +enum { + VXLAN_ENABLE_MODIFY = 1 << 7, + VXLAN_STEERING_MODIFY = 1 << 6, + + VXLAN_ENABLE = 1 << 7, +}; + +struct mlx4_set_port_vxlan_context { + u32 reserved1; + u8 modify_flags; + u8 reserved2; + u8 enable_flags; + u8 steering; +}; + +int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable) { - struct mlx4_cmd_mailbox *mailbox; - struct mlx4_set_port_scheduler_context *context; int err; u32 in_mod; - int i; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_vxlan_context *context; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); context = mailbox->buf; - memset(context, 0, sizeof *context); + memset(context, 0, sizeof(*context)); - for (i = 0; i < MLX4_NUM_TC; i++) { - struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i]; - u16 r; - if (ratelimit && ratelimit[i]) { - if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) { - r = ratelimit[i]; - tc->max_bw_units = - htons(MLX4_RATELIMIT_100M_UNITS); - } else { - r = ratelimit[i]/10; - tc->max_bw_units = - htons(MLX4_RATELIMIT_1G_UNITS); - } - tc->max_bw_value = htons(r); - } else { - tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT); - tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS); - } + context->modify_flags = VXLAN_ENABLE_MODIFY | VXLAN_STEERING_MODIFY; + if (enable) + context->enable_flags = VXLAN_ENABLE; + context->steering = steering; - tc->pg = htons(pg[i]); - tc->bw_precentage = htons(tc_tx_bw[i]); - } - - in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + in_mod = MLX4_SET_PORT_VXLAN << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, mailbox); return err; } -EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER); +EXPORT_SYMBOL(mlx4_SET_PORT_VXLAN); + +int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = 
mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + *((__be32 *)mailbox->buf) = cpu_to_be32(time); + + err = mlx4_cmd(dev, mailbox->dma, port, MLX4_SET_PORT_BEACON_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_BEACON); int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, @@ -982,48 +1742,115 @@ int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, return 0; } -int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id) +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id) { struct mlx4_priv *priv = mlx4_priv(dev); int i, found_ix = -1; int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + struct mlx4_slaves_pport slaves_pport; + unsigned num_vfs; + int slave_gid; if (!mlx4_is_mfunc(dev)) return -EINVAL; + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + num_vfs = bitmap_weight(slaves_pport.slaves, + dev->persist->num_vfs + 1) - 1; + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { - if (!memcmp(priv->roce_gids[port - 1][i].raw, gid, 16)) { + if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid, + MLX4_ROCE_GID_ENTRY_SIZE)) { found_ix = i; break; } } if (found_ix >= 0) { + /* Calculate a slave_gid which is the slave number in the gid + * table and not a globally unique slave number. + */ if (found_ix < MLX4_ROCE_PF_GIDS) - *slave_id = 0; - else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % dev->num_vfs) * - (vf_gids / dev->num_vfs + 1)) - *slave_id = ((found_ix - MLX4_ROCE_PF_GIDS) / - (vf_gids / dev->num_vfs + 1)) + 1; + slave_gid = 0; + else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % num_vfs) * + (vf_gids / num_vfs + 1)) + slave_gid = ((found_ix - MLX4_ROCE_PF_GIDS) / + (vf_gids / num_vfs + 1)) + 1; else - *slave_id = + slave_gid = ((found_ix - MLX4_ROCE_PF_GIDS - - ((vf_gids % dev->num_vfs) * ((vf_gids / dev->num_vfs + 1)))) / - (vf_gids / dev->num_vfs)) + vf_gids % dev->num_vfs + 1; + ((vf_gids % num_vfs) * ((vf_gids / num_vfs + 1)))) / + (vf_gids / num_vfs)) + vf_gids % num_vfs + 1; + + /* Calculate the globally unique slave id */ + if (slave_gid) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_active_ports actv_ports; + struct mlx4_slaves_pport slaves_pport_actv; + unsigned max_port_p_one; + int num_vfs_before = 0; + int candidate_slave_gid; + + /* Calculate how many VFs are on the previous port, if exists */ + for (i = 1; i < port; i++) { + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + slaves_pport_actv = + mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + num_vfs_before += bitmap_weight( + slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + + /* candidate_slave_gid isn't necessarily the correct slave, but + * it has the same number of ports and is assigned to the same + * ports as the real slave we're looking for. On dual port VF, + * slave_gid = [single port VFs on port ] + + * [offset of the current slave from the first dual port VF] + + * 1 (for the PF). 
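
The GID arithmetic above, and its inverse in mlx4_get_slave_num_gids()/mlx4_get_base_gid_ix() earlier in this file, splits the RoCE GID table so that the first entries belong to the PF and the remainder is divided as evenly as possible among the VFs on the port. A standalone sketch of that split, hard-coding the 128-entry table and 16 PF entries for illustration:

#include <stdio.h>

#define DEMO_ROCE_MAX_GIDS	128
#define DEMO_ROCE_PF_GIDS	16

/* Number of GID table entries owned by a given slave (0 == PF). */
static int demo_num_gids(int slave_gid, int vfs)
{
	int vf_gids = DEMO_ROCE_MAX_GIDS - DEMO_ROCE_PF_GIDS;

	if (slave_gid == 0)
		return DEMO_ROCE_PF_GIDS;
	if (slave_gid <= vf_gids % vfs)
		return vf_gids / vfs + 1;
	return vf_gids / vfs;
}

/* First GID table index owned by a given slave (0 == PF). */
static int demo_base_gid_ix(int slave_gid, int vfs)
{
	int vf_gids = DEMO_ROCE_MAX_GIDS - DEMO_ROCE_PF_GIDS;

	if (slave_gid == 0)
		return 0;
	if (slave_gid <= vf_gids % vfs)
		return DEMO_ROCE_PF_GIDS + (vf_gids / vfs + 1) * (slave_gid - 1);
	return DEMO_ROCE_PF_GIDS + vf_gids % vfs +
	    (vf_gids / vfs) * (slave_gid - 1);
}

int main(void)
{
	int vfs = 5;	/* e.g. five single-port VFs on this port */
	int s;

	/* With 112 VF GIDs and 5 VFs: VF1-2 get 23 entries, VF3-5 get 22. */
	for (s = 0; s <= vfs; s++)
		printf("slave_gid %d: base %3d, count %2d\n",
		    s, demo_base_gid_ix(s, vfs), demo_num_gids(s, vfs));
	return 0;
}
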
+ */ + candidate_slave_gid = slave_gid + num_vfs_before; + + actv_ports = mlx4_get_active_ports(dev, candidate_slave_gid); + max_port_p_one = find_first_bit( + actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, + dev->caps.num_ports) + 1; + + /* Calculate the real slave number */ + for (i = 1; i < max_port_p_one; i++) { + if (i == port) + continue; + bitmap_zero(exclusive_ports.ports, + dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + slaves_pport_actv = + mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid += bitmap_weight( + slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + } + *slave_id = slave_gid; } return (found_ix >= 0) ? 0 : -EINVAL; } EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid); -int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid) +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid) { struct mlx4_priv *priv = mlx4_priv(dev); if (!mlx4_is_master(dev)) return -EINVAL; - memcpy(gid, priv->roce_gids[port - 1][slave_id].raw, 16); + memcpy(gid, priv->port[port].gid_table.roce_gids[slave_id].raw, + MLX4_ROCE_GID_ENTRY_SIZE); return 0; } EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave); @@ -1047,15 +1874,15 @@ struct mlx4_cable_info { }; enum cable_info_err { - CABLE_INF_INV_PORT = 0x1, - CABLE_INF_OP_NOSUP = 0x2, - CABLE_INF_NOT_CONN = 0x3, - CABLE_INF_NO_EEPRM = 0x4, - CABLE_INF_PAGE_ERR = 0x5, - CABLE_INF_INV_ADDR = 0x6, - CABLE_INF_I2C_ADDR = 0x7, - CABLE_INF_QSFP_VIO = 0x8, - CABLE_INF_I2C_BUSY = 0x9, + CABLE_INF_INV_PORT = 0x1, + CABLE_INF_OP_NOSUP = 0x2, + CABLE_INF_NOT_CONN = 0x3, + CABLE_INF_NO_EEPRM = 0x4, + CABLE_INF_PAGE_ERR = 0x5, + CABLE_INF_INV_ADDR = 0x6, + CABLE_INF_I2C_ADDR = 0x7, + CABLE_INF_QSFP_VIO = 0x8, + CABLE_INF_I2C_BUSY = 0x9, }; #define MAD_STATUS_2_CABLE_ERR(mad_status) ((mad_status >> 8) & 0xFF) @@ -1102,8 +1929,8 @@ static inline const char *cable_info_mad_err_str(u16 mad_status) * Returns num of read bytes on success or a negative error * code. 
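
When the MAD comes back with a bad status, the cable-info path above extracts the cable-specific error from the high byte of the 16-bit status word via MAD_STATUS_2_CABLE_ERR(). A trivial userspace illustration of that decoding, with an invented status value:

#include <stdint.h>
#include <stdio.h>

/* High byte of the 16-bit MAD status carries the cable_info_err code. */
#define DEMO_MAD_STATUS_2_CABLE_ERR(status)	(((status) >> 8) & 0xFF)

int main(void)
{
	uint16_t mad_status = 0x0300;	/* 0x3 == cable not connected */

	printf("cable error code: 0x%x\n",
	    DEMO_MAD_STATUS_2_CABLE_ERR(mad_status));
	return 0;
}
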
*/ -int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, u16 offset, - u16 size, u8 *data) +int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, + u16 offset, u16 size, u8 *data) { struct mlx4_cmd_mailbox *inbox, *outbox; struct mlx4_mad_ifc *inmad, *outmad; @@ -1115,17 +1942,12 @@ int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, u16 offset, size = MODULE_INFO_MAX_READ; inbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(inbox)) { - mlx4_err(dev, - "mlx4_alloc_cmd_mailbox returned with error(%lx)", PTR_ERR(inbox)); + if (IS_ERR(inbox)) return PTR_ERR(inbox); - } outbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(outbox)) { mlx4_free_cmd_mailbox(dev, inbox); - mlx4_err(dev, - "mlx4_alloc_cmd_mailbox returned with error(%lx)", PTR_ERR(outbox)); return PTR_ERR(outbox); } @@ -1158,7 +1980,8 @@ int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, u16 offset, cable_info->size = cpu_to_be16(size); ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3, - MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); if (ret) goto out; @@ -1166,10 +1989,10 @@ int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, u16 offset, /* Mad returned with bad status */ ret = be16_to_cpu(outmad->status); #ifdef DEBUG - mlx4_warn(dev, "MLX4_CMD_MAD_IFC Get Module info attr(%x) " - "port(%d) i2c_addr(%x) offset(%d) size(%d): Response " - "Mad Status(%x) - %s\n", 0xFF60, port, i2c_addr, offset, - size, ret, cable_info_mad_err_str(ret)); + mlx4_warn(dev, + "MLX4_CMD_MAD_IFC Get Module info attr(%x) port(%d) i2c_addr(%x) offset(%d) size(%d): Response Mad Status(%x) - %s\n", + 0xFF60, port, i2c_addr, offset, size, + ret, cable_info_mad_err_str(ret)); #endif if (i2c_addr == I2C_ADDR_HIGH && MAD_STATUS_2_CABLE_ERR(ret) == CABLE_INF_I2C_ADDR) diff --git a/sys/dev/mlx4/mlx4_core/mlx4_profile.c b/sys/dev/mlx4/mlx4_core/mlx4_profile.c index 46239313d4b0..aff76dd9bdb0 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_profile.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_profile.c @@ -76,13 +76,12 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, u64 size; u64 start; int type; - u64 num; + u32 num; int log_num; }; u64 total_size = 0; struct mlx4_resource *profile; - struct mlx4_resource tmp; int i, j; profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL); @@ -107,13 +106,11 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? - dev->phys_caps.num_phys_eqs : + profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? 
dev->phys_caps.num_phys_eqs : min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; - profile[MLX4_RES_MTT].num = ((u64)request->num_mtt_segs) * - (1 << log_mtts_per_seg); + profile[MLX4_RES_MTT].num = request->num_mtt * (1 << log_mtts_per_seg); profile[MLX4_RES_MCG].num = request->num_mcg; for (i = 0; i < MLX4_RES_NUM; ++i) { @@ -132,11 +129,8 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, */ for (i = MLX4_RES_NUM; i > 0; --i) for (j = 1; j < i; ++j) { - if (profile[j].size > profile[j - 1].size) { - tmp = profile[j]; - profile[j] = profile[j - 1]; - profile[j - 1] = tmp; - } + if (profile[j].size > profile[j - 1].size) + swap(profile[j], profile[j - 1]); } for (i = 0; i < MLX4_RES_NUM; ++i) { @@ -146,18 +140,17 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, } if (total_size > dev_cap->max_icm_sz) { - mlx4_err(dev, "Profile requires 0x%llx bytes; " - "won't fit in 0x%llx bytes of context memory.\n", - (unsigned long long) total_size, - (unsigned long long) dev_cap->max_icm_sz); + mlx4_err(dev, "Profile requires 0x%llx bytes; won't fit in 0x%llx bytes of context memory\n", + (unsigned long long) total_size, + (unsigned long long) dev_cap->max_icm_sz); kfree(profile); return -ENOMEM; } if (profile[i].size) - mlx4_dbg(dev, " profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, " - "size 0x%10llx\n", - i, res_name[profile[i].type], profile[i].log_num, + mlx4_dbg(dev, " profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, size 0x%10llx\n", + i, res_name[profile[i].type], + profile[i].log_num, (unsigned long long) profile[i].start, (unsigned long long) profile[i].size); } @@ -200,15 +193,16 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, break; case MLX4_RES_EQ: if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) { - init_hca->log_num_eqs = 0x1f; - init_hca->eqc_base = profile[i].start; - init_hca->num_sys_eqs = dev_cap->num_sys_eqs; + init_hca->log_num_eqs = 0x1f; + init_hca->eqc_base = profile[i].start; + init_hca->num_sys_eqs = dev_cap->num_sys_eqs; } else { - dev->caps.num_eqs = roundup_pow_of_two( - min_t(unsigned, - dev_cap->max_eqs, MAX_MSIX)); - init_hca->eqc_base = profile[i].start; - init_hca->log_num_eqs = ilog2(dev->caps.num_eqs); + dev->caps.num_eqs = roundup_pow_of_two( + min_t(unsigned, + dev_cap->max_eqs, + MAX_MSIX)); + init_hca->eqc_base = profile[i].start; + init_hca->log_num_eqs = ilog2(dev->caps.num_eqs); } break; case MLX4_RES_DMPT: diff --git a/sys/dev/mlx4/mlx4_core/mlx4_qp.c b/sys/dev/mlx4/mlx4_core/mlx4_qp.c index 3b40b4563d16..be435776591a 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_qp.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_qp.c @@ -43,9 +43,7 @@ #include "mlx4.h" #include "icm.h" -/* - * QP to support BF should have bits 6,7 cleared - */ +/* QP to support BF should have bits 6,7 cleared */ #define MLX4_BF_QP_SKIP_MASK 0xc0 #define MLX4_MAX_BF_QP_RANGE 0x40 @@ -170,6 +168,12 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; } + if ((cur_state == MLX4_QP_STATE_RTR) && + (new_state == MLX4_QP_STATE_RTS) && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + context->roce_entropy = + cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn)); + *(__be32 *) mailbox->buf = cpu_to_be32(optpar); memcpy(mailbox->buf + 8, context, sizeof *context); @@ -216,19 +220,25 @@ EXPORT_SYMBOL_GPL(mlx4_qp_modify); int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base, u8 flags) { - int bf_qp = !!(flags & (u8) 
MLX4_RESERVE_BF_QP); + u32 uid; + int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP); struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; - /* Only IPoIB uses a large cnt. In this case, just allocate - * as usual, ignoring bf skipping, since IPoIB does not run over RoCE - */ if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp) - bf_qp = 0; + return -ENOMEM; - *base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align, - bf_qp ? MLX4_BF_QP_SKIP_MASK : 0); + uid = MLX4_QP_TABLE_ZONE_GENERAL; + if (flags & (u8)MLX4_RESERVE_A0_QP) { + if (bf_qp) + uid = MLX4_QP_TABLE_ZONE_RAW_ETH; + else + uid = MLX4_QP_TABLE_ZONE_RSS; + } + + *base = mlx4_zone_alloc_entries(qp_table->zones, uid, cnt, align, + bf_qp ? MLX4_BF_QP_SKIP_MASK : 0, NULL); if (*base == -1) return -ENOMEM; @@ -246,7 +256,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, flags &= dev->caps.alloc_res_qp_mask; if (mlx4_is_mfunc(dev)) { - set_param_l(&in_param, (((u32) flags) << 24) | (u32) cnt); + set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt); set_param_h(&in_param, align); err = mlx4_cmd_imm(dev, in_param, &out_param, RES_QP, RES_OP_RESERVE, @@ -269,7 +279,7 @@ void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) if (mlx4_is_qp_reserved(dev, (u32) base_qpn)) return; - mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, MLX4_USE_RR); + mlx4_zone_free_entries_unique(qp_table->zones, base_qpn, cnt); } void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) @@ -284,37 +294,37 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) { - mlx4_warn(dev, "Failed to release qp range" - " base:%d cnt:%d\n", base_qpn, cnt); + mlx4_warn(dev, "Failed to release qp range base:%d cnt:%d\n", + base_qpn, cnt); } } else __mlx4_qp_release_range(dev, base_qpn, cnt); } EXPORT_SYMBOL_GPL(mlx4_qp_release_range); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; int err; - err = mlx4_table_get(dev, &qp_table->qp_table, qpn); + err = mlx4_table_get(dev, &qp_table->qp_table, qpn, gfp); if (err) goto err_out; - err = mlx4_table_get(dev, &qp_table->auxc_table, qpn); + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn, gfp); if (err) goto err_put_qp; - err = mlx4_table_get(dev, &qp_table->altc_table, qpn); + err = mlx4_table_get(dev, &qp_table->altc_table, qpn, gfp); if (err) goto err_put_auxc; - err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn); + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, gfp); if (err) goto err_put_altc; - err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn); + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, gfp); if (err) goto err_put_rdmarc; @@ -336,7 +346,7 @@ int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) return err; } -static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) { u64 param = 0; @@ -346,7 +356,7 @@ static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_qp_alloc_icm(dev, qpn); + return __mlx4_qp_alloc_icm(dev, qpn, gfp); } void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) @@ -375,7 +385,7 @@ static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) __mlx4_qp_free_icm(dev, 
qpn); } -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; @@ -386,7 +396,7 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) qp->qpn = qpn; - err = mlx4_qp_alloc_icm(dev, qpn); + err = mlx4_qp_alloc_icm(dev, qpn, gfp); if (err) return err; @@ -409,6 +419,75 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) EXPORT_SYMBOL_GPL(mlx4_qp_alloc); +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *cmd; + u64 pri_addr_path_mask = 0; + u64 qp_mask = 0; + int err = 0; + + if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS)) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cmd = (struct mlx4_update_qp_context *)mailbox->buf; + + if (attr & MLX4_UPDATE_QP_SMAC) { + pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX; + cmd->qp_context.pri_path.grh_mylmc = params->smac_index; + } + + if (attr & MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB) { + if (!(dev->caps.flags2 + & MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) { + mlx4_warn(dev, + "Trying to set src check LB, but it isn't supported\n"); + err = -ENOTSUPP; + goto out; + } + pri_addr_path_mask |= + 1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB; + if (params->flags & + MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB) { + cmd->qp_context.pri_path.fl |= + MLX4_FL_ETH_SRC_CHECK_MC_LB; + } + } + + if (attr & MLX4_UPDATE_QP_VSD) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_VSD; + if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE) + cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN); + } + + if (attr & MLX4_UPDATE_QP_RATE_LIMIT) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_RATE_LIMIT; + cmd->qp_context.rate_limit_params = cpu_to_be16((params->rate_unit << 14) | params->rate_val); + } + + if (attr & MLX4_UPDATE_QP_QOS_VPORT) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_QOS_VPP; + cmd->qp_context.qos_vport = params->qos_vport; + } + + cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask); + cmd->qp_mask = cpu_to_be64(qp_mask); + + err = mlx4_cmd(dev, mailbox->dma, qpn & 0xffffff, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_update_qp); + void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; @@ -436,6 +515,227 @@ static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn) MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } +#define MLX4_QP_TABLE_RSS_ETH_PRIORITY 2 +#define MLX4_QP_TABLE_RAW_ETH_PRIORITY 1 +#define MLX4_QP_TABLE_RAW_ETH_SIZE 256 + +static int mlx4_create_zones(struct mlx4_dev *dev, + u32 reserved_bottom_general, + u32 reserved_top_general, + u32 reserved_bottom_rss, + u32 start_offset_rss, + u32 max_table_offset) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + struct mlx4_bitmap (*bitmap)[MLX4_QP_TABLE_ZONE_NUM] = NULL; + int bitmap_initialized = 0; + u32 last_offset; + int k; + int err; + + qp_table->zones = mlx4_zone_allocator_create(MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP); + + if (NULL == qp_table->zones) + return -ENOMEM; + + bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); + + if (NULL == bitmap) { + err = 
-ENOMEM; + goto free_zone; + } + + err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_GENERAL, dev->caps.num_qps, + (1 << 23) - 1, reserved_bottom_general, + reserved_top_general); + + if (err) + goto free_bitmap; + + ++bitmap_initialized; + + err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_GENERAL, + MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO | + MLX4_ZONE_USE_RR, 0, + 0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_GENERAL); + + if (err) + goto free_bitmap; + + err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_RSS, + reserved_bottom_rss, + reserved_bottom_rss - 1, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + reserved_bottom_rss - start_offset_rss); + + if (err) + goto free_bitmap; + + ++bitmap_initialized; + + err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_RSS, + MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO | + MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO | + MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RSS_ETH_PRIORITY, + 0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_RSS); + + if (err) + goto free_bitmap; + + last_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + /* We have a single zone for the A0 steering QPs area of the FW. This area + * needs to be split into subareas. One set of subareas is for RSS QPs + * (in which qp number bits 6 and/or 7 are set); the other set of subareas + * is for RAW_ETH QPs, which require that both bits 6 and 7 are zero. + * Currently, the values returned by the FW (A0 steering area starting qp number + * and A0 steering area size) are such that there are only two subareas -- one + * for RSS and one for RAW_ETH. + */ + for (k = MLX4_QP_TABLE_ZONE_RSS + 1; k < sizeof(*bitmap)/sizeof((*bitmap)[0]); + k++) { + int size; + u32 offset = start_offset_rss; + u32 bf_mask; + u32 requested_size; + + /* Assuming MLX4_BF_QP_SKIP_MASK is consecutive ones, this calculates + * a mask of all LSB bits set until (and not including) the first + * set bit of MLX4_BF_QP_SKIP_MASK. For example, if MLX4_BF_QP_SKIP_MASK + * is 0xc0, bf_mask will be 0x3f. + */ + bf_mask = (MLX4_BF_QP_SKIP_MASK & ~(MLX4_BF_QP_SKIP_MASK - 1)) - 1; + requested_size = min((u32)MLX4_QP_TABLE_RAW_ETH_SIZE, bf_mask + 1); + + if (((last_offset & MLX4_BF_QP_SKIP_MASK) && + ((int)(max_table_offset - last_offset)) >= + roundup_pow_of_two(MLX4_BF_QP_SKIP_MASK)) || + (!(last_offset & MLX4_BF_QP_SKIP_MASK) && + !((last_offset + requested_size - 1) & + MLX4_BF_QP_SKIP_MASK))) + size = requested_size; + else { + u32 candidate_offset = + (last_offset | MLX4_BF_QP_SKIP_MASK | bf_mask) + 1; + + if (last_offset & MLX4_BF_QP_SKIP_MASK) + last_offset = candidate_offset; + + /* From this point, the BF bits are 0 */ + + if (last_offset > max_table_offset) { + /* need to skip */ + size = -1; + } else { + size = min3(max_table_offset - last_offset, + bf_mask - (last_offset & bf_mask), + requested_size); + if (size < requested_size) { + int candidate_size; + + candidate_size = min3( + max_table_offset - candidate_offset, + bf_mask - (last_offset & bf_mask), + requested_size); + + /* We will not take this path if last_offset was + * already set above to candidate_offset + */ + if (candidate_size > size) { + last_offset = candidate_offset; + size = candidate_size; + } + } + } + } + + if (size > 0) { + /* mlx4_bitmap_alloc_range will find a contiguous range of "size" + * QPs in which both bits 6 and 7 are zero, because we pass it the + * MLX4_BF_SKIP_MASK). 
+ */ + offset = mlx4_bitmap_alloc_range( + *bitmap + MLX4_QP_TABLE_ZONE_RSS, + size, 1, + MLX4_BF_QP_SKIP_MASK); + + if (offset == (u32)-1) { + err = -ENOMEM; + break; + } + + last_offset = offset + size; + + err = mlx4_bitmap_init(*bitmap + k, roundup_pow_of_two(size), + roundup_pow_of_two(size) - 1, 0, + roundup_pow_of_two(size) - size); + } else { + /* Add an empty bitmap, we'll allocate from different zones (since + * at least one is reserved) + */ + err = mlx4_bitmap_init(*bitmap + k, 1, + MLX4_QP_TABLE_RAW_ETH_SIZE - 1, 0, + 0); + mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0); + } + + if (err) + break; + + ++bitmap_initialized; + + err = mlx4_zone_add_one(qp_table->zones, *bitmap + k, + MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO | + MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO | + MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RAW_ETH_PRIORITY, + offset, qp_table->zones_uids + k); + + if (err) + break; + } + + if (err) + goto free_bitmap; + + qp_table->bitmap_gen = *bitmap; + + return err; + +free_bitmap: + for (k = 0; k < bitmap_initialized; k++) + mlx4_bitmap_cleanup(*bitmap + k); + kfree(bitmap); +free_zone: + mlx4_zone_allocator_destroy(qp_table->zones); + return err; +} + +static void mlx4_cleanup_qp_zones(struct mlx4_dev *dev) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + + if (qp_table->zones) { + int i; + + for (i = 0; + i < sizeof(qp_table->zones_uids)/sizeof(qp_table->zones_uids[0]); + i++) { + struct mlx4_bitmap *bitmap = + mlx4_zone_get_bitmap(qp_table->zones, + qp_table->zones_uids[i]); + + mlx4_zone_remove_one(qp_table->zones, qp_table->zones_uids[i]); + if (NULL == bitmap) + continue; + + mlx4_bitmap_cleanup(bitmap); + } + mlx4_zone_allocator_destroy(qp_table->zones); + kfree(qp_table->bitmap_gen); + qp_table->bitmap_gen = NULL; + qp_table->zones = NULL; + } +} + int mlx4_init_qp_table(struct mlx4_dev *dev) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; @@ -443,49 +743,56 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) int reserved_from_top = 0; int reserved_from_bot; int k; + int fixed_reserved_from_bot_rv = 0; + int bottom_reserved_for_rss_bitmap; + u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base + + dev->caps.dmfs_high_rate_qpn_range; spin_lock_init(&qp_table->lock); INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC); if (mlx4_is_slave(dev)) return 0; - /* - * We reserve 2 extra QPs per port for the special QPs. The + /* We reserve 2 extra QPs per port for the special QPs. The * block of special QPs must be aligned to a multiple of 8, so * round up. * * We also reserve the MSB of the 24-bit QP number to indicate * that a QP is an XRC QP. 
*/ - dev->phys_caps.base_sqpn = - ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8); + for (k = 0; k <= MLX4_QP_REGION_BOTTOM; k++) + fixed_reserved_from_bot_rv += dev->caps.reserved_qps_cnt[k]; + + if (fixed_reserved_from_bot_rv < max_table_offset) + fixed_reserved_from_bot_rv = max_table_offset; + + /* We reserve at least 1 extra for bitmaps that we don't have enough space for*/ + bottom_reserved_for_rss_bitmap = + roundup_pow_of_two(fixed_reserved_from_bot_rv + 1); + dev->phys_caps.base_sqpn = ALIGN(bottom_reserved_for_rss_bitmap, 8); { int sort[MLX4_NUM_QP_REGION]; - int i, j, tmp; + int i, j; int last_base = dev->caps.num_qps; for (i = 1; i < MLX4_NUM_QP_REGION; ++i) sort[i] = i; - for (i = MLX4_NUM_QP_REGION; i > 0; --i) { - for (j = 2; j < i; ++j) { + for (i = MLX4_NUM_QP_REGION; i > MLX4_QP_REGION_BOTTOM; --i) { + for (j = MLX4_QP_REGION_BOTTOM + 2; j < i; ++j) { if (dev->caps.reserved_qps_cnt[sort[j]] > - dev->caps.reserved_qps_cnt[sort[j - 1]]) { - tmp = sort[j]; - sort[j] = sort[j - 1]; - sort[j - 1] = tmp; - } + dev->caps.reserved_qps_cnt[sort[j - 1]]) + swap(sort[j], sort[j - 1]); } } - for (i = 1; i < MLX4_NUM_QP_REGION; ++i) { + for (i = MLX4_QP_REGION_BOTTOM + 1; i < MLX4_NUM_QP_REGION; ++i) { last_base -= dev->caps.reserved_qps_cnt[sort[i]]; dev->caps.reserved_qps_base[sort[i]] = last_base; reserved_from_top += dev->caps.reserved_qps_cnt[sort[i]]; } - } /* Reserve 8 real SQPs in both native and SRIOV modes. @@ -500,14 +807,15 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) */ reserved_from_bot = mlx4_num_reserved_sqps(dev); if (reserved_from_bot + reserved_from_top > dev->caps.num_qps) { - mlx4_err(dev, "Number of reserved QPs is higher than number " - "of QPs, increase the value of log_num_qp\n"); + mlx4_err(dev, "Number of reserved QPs is higher than number of QPs\n"); return -EINVAL; } - err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps, - (1 << 23) - 1, reserved_from_bot, - reserved_from_top); + err = mlx4_create_zones(dev, reserved_from_bot, reserved_from_bot, + bottom_reserved_for_rss_bitmap, + fixed_reserved_from_bot_rv, + max_table_offset); + if (err) return err; @@ -543,7 +851,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn); if (err) goto err_mem; - return 0; + + return err; err_mem: kfree(dev->caps.qp0_tunnel); @@ -552,6 +861,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) kfree(dev->caps.qp1_proxy); dev->caps.qp0_tunnel = dev->caps.qp0_proxy = dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + mlx4_cleanup_qp_zones(dev); return err; } @@ -561,7 +871,8 @@ void mlx4_cleanup_qp_table(struct mlx4_dev *dev) return; mlx4_CONF_SPECIAL_QP(dev, 0); - mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap); + + mlx4_cleanup_qp_zones(dev); } int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, @@ -601,11 +912,12 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, for (i = 0; i < ARRAY_SIZE(states) - 1; i++) { context->flags &= cpu_to_be32(~(0xf << 28)); context->flags |= cpu_to_be32(states[i + 1] << 28); + if (states[i + 1] != MLX4_QP_STATE_RTR) + context->params2 &= ~MLX4_QP_BIT_FPP; err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1], context, 0, 0, qp); if (err) { - mlx4_err(dev, "Failed to bring QP to state: " - "%d with error: %d\n", + mlx4_err(dev, "Failed to bring QP to state: %d with error: %d\n", states[i + 1], err); return err; } @@ -616,3 +928,23 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, return 0; } 
EXPORT_SYMBOL_GPL(mlx4_qp_to_ready); + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn) +{ + struct mlx4_qp_context context; + struct mlx4_qp qp; + int err; + + qp.qpn = qpn; + err = mlx4_qp_query(dev, &qp, &context); + if (!err) { + u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff; + u16 folded_dst = folded_qp(dest_qpn); + u16 folded_src = folded_qp(qpn); + + return (dest_qpn != qpn) ? + ((folded_dst ^ folded_src) | 0xC000) : + folded_src | 0xC000; + } + return 0xdead; +} diff --git a/sys/dev/mlx4/mlx4_core/mlx4_reset.c b/sys/dev/mlx4/mlx4_core/mlx4_reset.c index 44ec1e12b898..dd2a9acfd74f 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_reset.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_reset.c @@ -72,29 +72,29 @@ int mlx4_reset(struct mlx4_dev *dev) hca_header = kmalloc(256, GFP_KERNEL); if (!hca_header) { err = -ENOMEM; - mlx4_err(dev, "Couldn't allocate memory to save HCA " - "PCI header, aborting.\n"); + mlx4_err(dev, "Couldn't allocate memory to save HCA PCI header, aborting\n"); goto out; } - pcie_cap = pci_pcie_cap(dev->pdev); + pcie_cap = pci_pcie_cap(dev->persist->pdev); for (i = 0; i < 64; ++i) { if (i == 22 || i == 23) continue; - if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) { + if (pci_read_config_dword(dev->persist->pdev, i * 4, + hca_header + i)) { err = -ENODEV; - mlx4_err(dev, "Couldn't save HCA " - "PCI header, aborting.\n"); + mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n"); goto out; } } - reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE, + reset = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_RESET_BASE, MLX4_RESET_SIZE); if (!reset) { err = -ENOMEM; - mlx4_err(dev, "Couldn't map HCA reset register, aborting.\n"); + mlx4_err(dev, "Couldn't map HCA reset register, aborting\n"); goto out; } @@ -119,13 +119,13 @@ int mlx4_reset(struct mlx4_dev *dev) writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET); iounmap(reset); - /* wait half a second before accessing device */ - msleep(500); + /* Docs say to wait one second before accessing device */ + msleep(1000); end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES; do { - if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) && - vendor != 0xffff) + if (!pci_read_config_word(dev->persist->pdev, PCI_VENDOR_ID, + &vendor) && vendor != 0xffff) break; msleep(1); @@ -133,27 +133,26 @@ int mlx4_reset(struct mlx4_dev *dev) if (vendor == 0xffff) { err = -ENODEV; - mlx4_err(dev, "PCI device did not come back after reset, " - "aborting.\n"); + mlx4_err(dev, "PCI device did not come back after reset, aborting\n"); goto out; } /* Now restore the PCI headers */ if (pcie_cap) { devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4]; - if (pcie_capability_write_word(dev->pdev, PCI_EXP_DEVCTL, + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_DEVCTL, devctl)) { err = -ENODEV; - mlx4_err(dev, "Couldn't restore HCA PCI Express " - "Device Control register, aborting.\n"); + mlx4_err(dev, "Couldn't restore HCA PCI Express Device Control register, aborting\n"); goto out; } linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4]; - if (pcie_capability_write_word(dev->pdev, PCI_EXP_LNKCTL, + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_LNKCTL, linkctl)) { err = -ENODEV; - mlx4_err(dev, "Couldn't restore HCA PCI Express " - "Link control register, aborting.\n"); + mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n"); goto out; } } @@ -162,19 +161,19 @@ int mlx4_reset(struct mlx4_dev *dev) if (i * 4 == PCI_COMMAND) continue; - if 
(pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) { + if (pci_write_config_dword(dev->persist->pdev, i * 4, + hca_header[i])) { err = -ENODEV; - mlx4_err(dev, "Couldn't restore HCA reg %x, " - "aborting.\n", i); + mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n", + i); goto out; } } - if (pci_write_config_dword(dev->pdev, PCI_COMMAND, + if (pci_write_config_dword(dev->persist->pdev, PCI_COMMAND, hca_header[PCI_COMMAND / 4])) { err = -ENODEV; - mlx4_err(dev, "Couldn't restore HCA COMMAND, " - "aborting.\n"); + mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n"); goto out; } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_resource_tracker.c b/sys/dev/mlx4/mlx4_core/mlx4_resource_tracker.c index 43edcac1bfc9..4e4b04758e11 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_resource_tracker.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_resource_tracker.c @@ -48,6 +48,8 @@ #include "fw.h" #define MLX4_MAC_VALID (1ull << 63) +#define MLX4_PF_COUNTERS_PER_PORT 2 +#define MLX4_VF_COUNTERS_PER_PORT 1 struct mac_res { struct list_head list; @@ -219,13 +221,15 @@ enum res_fs_rule_states { struct res_fs_rule { struct res_common com; int qpn; + /* VF DMFS mbox with port flipped */ + void *mirr_mbox; + /* > 0 --> apply mirror when getting into HA mode */ + /* = 0 --> un-apply mirror when getting out of HA mode */ + u32 mirr_mbox_size; + struct list_head mirr_list; + u64 mirr_rule_id; }; -static int mlx4_is_eth(struct mlx4_dev *dev, int port) -{ - return dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB ? 0 : 1; -} - static void *res_tracker_lookup(struct rb_root *root, u64 res_id) { struct rb_node *node = root->rb_node; @@ -279,7 +283,7 @@ enum qp_transition { }; /* For Debug uses */ -static const char *ResourceType(enum mlx4_resource rt) +static const char *resource_str(enum mlx4_resource rt) { switch (rt) { case RES_QP: return "RES_QP"; @@ -307,10 +311,15 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, &priv->mfunc.master.res_tracker.res_alloc[res_type]; int err = -EINVAL; int allocated, free, reserved, guaranteed, from_free; + int from_rsvd; + + if (slave > dev->persist->num_vfs) + return -EINVAL; spin_lock(&res_alloc->alloc_lock); allocated = (port > 0) ? - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : res_alloc->allocated[slave]; free = (port > 0) ? 
res_alloc->res_port_free[port - 1] : res_alloc->res_free; @@ -318,11 +327,16 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, res_alloc->res_reserved; guaranteed = res_alloc->guaranteed[slave]; - if (allocated + count > res_alloc->quota[slave]) + if (allocated + count > res_alloc->quota[slave]) { + mlx4_warn(dev, "VF %d port %d res %s: quota exceeded, count %d alloc %d quota %d\n", + slave, port, resource_str(res_type), count, + allocated, res_alloc->quota[slave]); goto out; + } if (allocated + count <= guaranteed) { err = 0; + from_rsvd = count; } else { /* portion may need to be obtained from free area */ if (guaranteed - allocated > 0) @@ -330,25 +344,33 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, else from_free = count; - if (free - from_free > reserved) + from_rsvd = count - from_free; + + if (free - from_free >= reserved) err = 0; + else + mlx4_warn(dev, "VF %d port %d res %s: free pool empty, free %d from_free %d rsvd %d\n", + slave, port, resource_str(res_type), free, + from_free, reserved); } if (!err) { /* grant the request */ if (port > 0) { - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count; + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] += count; res_alloc->res_port_free[port - 1] -= count; + res_alloc->res_port_rsvd[port - 1] -= from_rsvd; } else { res_alloc->allocated[slave] += count; res_alloc->res_free -= count; + res_alloc->res_reserved -= from_rsvd; } } out: spin_unlock(&res_alloc->alloc_lock); return err; - } static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, @@ -358,14 +380,38 @@ static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, struct mlx4_priv *priv = mlx4_priv(dev); struct resource_allocator *res_alloc = &priv->mfunc.master.res_tracker.res_alloc[res_type]; + int allocated, guaranteed, from_rsvd; + + if (slave > dev->persist->num_vfs) + return; spin_lock(&res_alloc->alloc_lock); + + allocated = (port > 0) ? 
+ res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : + res_alloc->allocated[slave]; + guaranteed = res_alloc->guaranteed[slave]; + + if (allocated - count >= guaranteed) { + from_rsvd = 0; + } else { + /* portion may need to be returned to reserved area */ + if (allocated - guaranteed > 0) + from_rsvd = count - (allocated - guaranteed); + else + from_rsvd = count; + } + if (port > 0) { - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count; + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] -= count; res_alloc->res_port_free[port - 1] += count; + res_alloc->res_port_rsvd[port - 1] += from_rsvd; } else { res_alloc->allocated[slave] -= count; res_alloc->res_free += count; + res_alloc->res_reserved += from_rsvd; } spin_unlock(&res_alloc->alloc_lock); @@ -377,7 +423,8 @@ static inline void initialize_res_quotas(struct mlx4_dev *dev, enum mlx4_resource res_type, int vf, int num_instances) { - res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1)); + res_alloc->guaranteed[vf] = num_instances / + (2 * (dev->persist->num_vfs + 1)); res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf]; if (vf == mlx4_master_func_num(dev)) { res_alloc->res_free = num_instances; @@ -421,11 +468,21 @@ void mlx4_init_quotas(struct mlx4_dev *dev) dev->quotas.mpt = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf]; } + +static int get_max_gauranteed_vfs_counter(struct mlx4_dev *dev) +{ + /* reduce the sink counter */ + return (dev->caps.max_counters - 1 - + (MLX4_PF_COUNTERS_PER_PORT * MLX4_MAX_PORTS)) + / MLX4_MAX_PORTS; +} + int mlx4_init_resource_tracker(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int i, j; int t; + int max_vfs_guarantee_counter = get_max_gauranteed_vfs_counter(dev); priv->mfunc.master.res_tracker.slave_list = kzalloc(dev->num_slaves * sizeof(struct slave_list), @@ -448,21 +505,31 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev) for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { struct resource_allocator *res_alloc = &priv->mfunc.master.res_tracker.res_alloc[i]; - res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); - res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->quota = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); + res_alloc->guaranteed = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); if (i == RES_MAC || i == RES_VLAN) res_alloc->allocated = kzalloc(MLX4_MAX_PORTS * - (dev->num_vfs + 1) * sizeof(int), - GFP_KERNEL); + (dev->persist->num_vfs + + 1) * + sizeof(int), GFP_KERNEL); else - res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->allocated = kzalloc((dev->persist-> + num_vfs + 1) * + sizeof(int), GFP_KERNEL); + /* Reduce the sink counter */ + if (i == RES_COUNTER) + res_alloc->res_free = dev->caps.max_counters - 1; if (!res_alloc->quota || !res_alloc->guaranteed || !res_alloc->allocated) goto no_mem_err; spin_lock_init(&res_alloc->alloc_lock); - for (t = 0; t < dev->num_vfs + 1; t++) { + for (t = 0; t < dev->persist->num_vfs + 1; t++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, t); switch (i) { case RES_QP: initialize_res_quotas(dev, res_alloc, RES_QP, @@ -492,13 +559,29 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev) break; case RES_MAC: if (t == mlx4_master_func_num(dev)) { + int max_vfs_pport = 0; + /* Calculate the max vfs per port for */ + /* both ports. 
*/ + for (j = 0; j < dev->caps.num_ports; + j++) { + struct mlx4_slaves_pport slaves_pport = + mlx4_phys_to_slaves_pport(dev, j + 1); + unsigned current_slaves = + bitmap_weight(slaves_pport.slaves, + dev->caps.num_ports) - 1; + if (max_vfs_pport < current_slaves) + max_vfs_pport = + current_slaves; + } res_alloc->quota[t] = - MLX4_MAX_MAC_NUM - 2 * dev->num_vfs; - res_alloc->guaranteed[t] = res_alloc->quota[t]; + MLX4_MAX_MAC_NUM - + 2 * max_vfs_pport; + res_alloc->guaranteed[t] = 2; for (j = 0; j < MLX4_MAX_PORTS; j++) - res_alloc->res_port_free[j] = MLX4_MAX_MAC_NUM; + res_alloc->res_port_free[j] = + MLX4_MAX_MAC_NUM; } else { - res_alloc->quota[t] = 2; + res_alloc->quota[t] = MLX4_MAX_MAC_NUM; res_alloc->guaranteed[t] = 2; } break; @@ -516,17 +599,26 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev) break; case RES_COUNTER: res_alloc->quota[t] = dev->caps.max_counters; - res_alloc->guaranteed[t] = 0; if (t == mlx4_master_func_num(dev)) - res_alloc->res_free = res_alloc->quota[t]; + res_alloc->guaranteed[t] = + MLX4_PF_COUNTERS_PER_PORT * + MLX4_MAX_PORTS; + else if (t <= max_vfs_guarantee_counter) + res_alloc->guaranteed[t] = + MLX4_VF_COUNTERS_PER_PORT * + MLX4_MAX_PORTS; + else + res_alloc->guaranteed[t] = 0; + res_alloc->res_free -= res_alloc->guaranteed[t]; break; default: break; } if (i == RES_MAC || i == RES_VLAN) { - for (j = 0; j < MLX4_MAX_PORTS; j++) - res_alloc->res_port_rsvd[j] += - res_alloc->guaranteed[t]; + for (j = 0; j < dev->caps.num_ports; j++) + if (test_bit(j, actv_ports.ports)) + res_alloc->res_port_rsvd[j] += + res_alloc->guaranteed[t]; } else { res_alloc->res_reserved += res_alloc->guaranteed[t]; } @@ -562,6 +654,7 @@ void mlx4_free_resource_tracker(struct mlx4_dev *dev, } /* free master's vlans */ i = dev->caps.function; + mlx4_reset_roce_gids(dev, i); mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); rem_slave_vlans(dev, i); mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); @@ -608,15 +701,17 @@ static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, if (MLX4_QP_ST_UD == ts) { port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; if (mlx4_is_eth(dev, port)) - qp_ctx->pri_path.mgid_index = mlx4_get_base_gid_ix(dev, slave) | 0x80; + qp_ctx->pri_path.mgid_index = + mlx4_get_base_gid_ix(dev, slave, port) | 0x80; else - qp_ctx->pri_path.mgid_index = 0x80 | slave; + qp_ctx->pri_path.mgid_index = slave | 0x80; - } else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_UC == ts) { + } else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_XRC == ts || MLX4_QP_ST_UC == ts) { if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; if (mlx4_is_eth(dev, port)) { - qp_ctx->pri_path.mgid_index += mlx4_get_base_gid_ix(dev, slave); + qp_ctx->pri_path.mgid_index += + mlx4_get_base_gid_ix(dev, slave, port); qp_ctx->pri_path.mgid_index &= 0x7f; } else { qp_ctx->pri_path.mgid_index = slave & 0x7F; @@ -625,7 +720,8 @@ static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; if (mlx4_is_eth(dev, port)) { - qp_ctx->alt_path.mgid_index += mlx4_get_base_gid_ix(dev, slave); + qp_ctx->alt_path.mgid_index += + mlx4_get_base_gid_ix(dev, slave, port); qp_ctx->alt_path.mgid_index &= 0x7f; } else { qp_ctx->alt_path.mgid_index = slave & 0x7F; @@ -634,29 +730,8 @@ static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, } } -static int check_counter_index_validity(struct mlx4_dev 
*dev, int slave, int port, int idx) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - struct counter_index *counter, *tmp_counter; - - if (slave == 0) { - list_for_each_entry_safe(counter, tmp_counter, - &priv->counters_table.global_port_list[port - 1], - list) { - if (counter->index == idx) - return 0; - } - return -EINVAL; - } else { - list_for_each_entry_safe(counter, tmp_counter, - &priv->counters_table.vf_list[slave - 1][port - 1], - list) { - if (counter->index == idx) - return 0; - } - return -EINVAL; - } -} +static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc, + u8 slave, int port); static int update_vport_qp_param(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, @@ -666,33 +741,16 @@ static int update_vport_qp_param(struct mlx4_dev *dev, struct mlx4_vport_oper_state *vp_oper; struct mlx4_priv *priv; u32 qp_type; - int port; + int port, err = 0; port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1; priv = mlx4_priv(dev); vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; qp_type = (be32_to_cpu(qpc->flags) >> 16) & 0xff; - if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH && - qpc->pri_path.counter_index != MLX4_SINK_COUNTER_INDEX) { - if (check_counter_index_validity(dev, slave, port, - qpc->pri_path.counter_index)) - return -EINVAL; - } - - mlx4_dbg(dev, "%s: QP counter_index %d for slave %d port %d\n", - __func__, qpc->pri_path.counter_index, slave, port); - - if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK) && - dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH && - !mlx4_is_qp_reserved(dev, qpn) && - qp_type == MLX4_QP_ST_MLX && - qpc->pri_path.counter_index != 0xFF) { - /* disable multicast loopback to qp with same counter */ - qpc->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB; - qpc->pri_path.vlan_control |= - MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER; - } + err = handle_counter(dev, qpc, slave, port); + if (err) + goto out; if (MLX4_VGT != vp_oper->state.default_vlan) { /* the reserved QPs (special, proxy, tunnel) @@ -701,35 +759,76 @@ static int update_vport_qp_param(struct mlx4_dev *dev, if (mlx4_is_qp_reserved(dev, qpn)) return 0; - /* force strip vlan by clear vsd */ - qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN); + /* force strip vlan by clear vsd, MLX QP refers to Raw Ethernet */ + if (qp_type == MLX4_QP_ST_UD || + (qp_type == MLX4_QP_ST_MLX && mlx4_is_eth(dev, port))) { + if (dev->caps.bmme_flags & MLX4_BMME_FLAG_VSD_INIT2RTR) { + *(__be32 *)inbox->buf = + cpu_to_be32(be32_to_cpu(*(__be32 *)inbox->buf) | + MLX4_QP_OPTPAR_VLAN_STRIPPING); + qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN); + } else { + struct mlx4_update_qp_params params = {.flags = 0}; + + err = mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, ¶ms); + if (err) + goto out; + } + } + /* preserve IF_COUNTER flag */ qpc->pri_path.vlan_control &= - MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER; - if (MLX4_QP_ST_RC != qp_type) { - if (0 != vp_oper->state.default_vlan) { + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; + if (1 /*vp_oper->state.link_state == IFLA_VF_LINK_STATE_DISABLE*/ && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) { + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + } else if (0 != vp_oper->state.default_vlan) { + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) { + /* vst QinQ should block untagged on TX, + * but cvlan is in payload and phv is 
set so + * hw see it as untagged. Block tagged instead. + */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } else { /* vst 802.1Q */ qpc->pri_path.vlan_control |= MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; - } else { /* priority tagged */ - qpc->pri_path.vlan_control |= - MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | - MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; } + } else { /* priority tagged */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; } + qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN; qpc->pri_path.vlan_index = vp_oper->vlan_idx; - qpc->pri_path.fl |= MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qpc->pri_path.fl |= MLX4_FL_ETH_HIDE_CQE_VLAN; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + qpc->pri_path.fl |= MLX4_FL_SV; + else + qpc->pri_path.fl |= MLX4_FL_CV; qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; qpc->pri_path.sched_queue &= 0xC7; qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3; + qpc->qos_vport = vp_oper->state.qos_vport; } if (vp_oper->state.spoofchk) { qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC; qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx; } - return 0; +out: + return err; } static int mpt_mask(struct mlx4_dev *dev) @@ -816,6 +915,85 @@ static void put_res(struct mlx4_dev *dev, int slave, u64 res_id, spin_unlock_irq(mlx4_tlock(dev)); } +static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port); + +static int handle_existing_counter(struct mlx4_dev *dev, u8 slave, int port, + int counter_index) +{ + struct res_common *r; + struct res_counter *counter; + int ret = 0; + + if (counter_index == MLX4_SINK_COUNTER_INDEX(dev)) + return ret; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, counter_index, RES_COUNTER); + if (!r || r->owner != slave) { + ret = -EINVAL; + } else { + counter = container_of(r, struct res_counter, com); + if (!counter->port) + counter->port = port; + } + + spin_unlock_irq(mlx4_tlock(dev)); + return ret; +} + +static int handle_unexisting_counter(struct mlx4_dev *dev, + struct mlx4_qp_context *qpc, u8 slave, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *tmp; + struct res_counter *counter; + u64 counter_idx = MLX4_SINK_COUNTER_INDEX(dev); + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(tmp, + &tracker->slave_list[slave].res_list[RES_COUNTER], + list) { + counter = container_of(tmp, struct res_counter, com); + if (port == counter->port) { + qpc->pri_path.counter_index = counter->com.res_id; + spin_unlock_irq(mlx4_tlock(dev)); + return 0; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + /* No existing counter, need to allocate a new counter */ + err = counter_alloc_res(dev, slave, RES_OP_RESERVE, 0, 0, &counter_idx, + port); + if (err == -ENOENT) { + err = 0; + } else if (err && err != -ENOSPC) { + mlx4_err(dev, "%s: failed to create new counter for slave %d err %d\n", + __func__, slave, err); + } else { + qpc->pri_path.counter_index = counter_idx; + mlx4_dbg(dev, "%s: alloc new counter for slave %d index %d\n", + __func__, slave, qpc->pri_path.counter_index); + err = 0; + } + + return err; +} + 
+static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc, + u8 slave, int port) +{ + if (qpc->pri_path.counter_index != MLX4_SINK_COUNTER_INDEX(dev)) + return handle_existing_counter(dev, slave, port, + qpc->pri_path.counter_index); + + return handle_unexisting_counter(dev, qpc, slave, port); +} + static struct res_common *alloc_qp_tr(int id) { struct res_qp *ret; @@ -909,7 +1087,7 @@ static struct res_common *alloc_srq_tr(int id) return &ret->com; } -static struct res_common *alloc_counter_tr(int id) +static struct res_common *alloc_counter_tr(int id, int port) { struct res_counter *ret; @@ -919,6 +1097,7 @@ static struct res_common *alloc_counter_tr(int id) ret->com.res_id = id; ret->com.state = RES_COUNTER_ALLOCATED; + ret->port = port; return &ret->com; } @@ -976,10 +1155,10 @@ static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave, ret = alloc_srq_tr(id); break; case RES_MAC: - printk(KERN_ERR "implementation missing\n"); + pr_err("implementation missing\n"); return NULL; case RES_COUNTER: - ret = alloc_counter_tr(id); + ret = alloc_counter_tr(id, extra); break; case RES_XRCD: ret = alloc_xrcdn_tr(id); @@ -996,6 +1175,53 @@ static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave, return ret; } +int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port, + struct mlx4_counter *data) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *tmp; + struct res_counter *counter; + int *counters_arr; + int i = 0, err = 0; + + memset(data, 0, sizeof(*data)); + + counters_arr = kmalloc_array(dev->caps.max_counters, + sizeof(*counters_arr), GFP_KERNEL); + if (!counters_arr) + return -ENOMEM; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(tmp, + &tracker->slave_list[slave].res_list[RES_COUNTER], + list) { + counter = container_of(tmp, struct res_counter, com); + if (counter->port == port) { + counters_arr[i] = (int)tmp->res_id; + i++; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + counters_arr[i] = -1; + + i = 0; + + while (counters_arr[i] != -1) { + err = mlx4_get_counter_stats(dev, counters_arr[i], data, + 0); + if (err) { + memset(data, 0, sizeof(*data)); + goto table_changed; + } + i++; + } + +table_changed: + kfree(counters_arr); + return 0; +} + static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, enum mlx4_resource type, int extra) { @@ -1072,10 +1298,10 @@ static int remove_mtt_ok(struct res_mtt *res, int order) { if (res->com.state == RES_MTT_BUSY || atomic_read(&res->ref_count)) { - printk(KERN_DEBUG "%s-%d: state %s, ref_count %d\n", - __func__, __LINE__, - mtt_states_str(res->com.state), - atomic_read(&res->ref_count)); + pr_devel("%s-%d: state %s, ref_count %d\n", + __func__, __LINE__, + mtt_states_str(res->com.state), + atomic_read(&res->ref_count)); return -EBUSY; } else if (res->com.state != RES_MTT_ALLOCATED) return -EPERM; @@ -1378,13 +1604,14 @@ static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, r->com.from_state = r->com.state; r->com.to_state = state; r->com.state = RES_EQ_BUSY; - if (eq) - *eq = r; } } spin_unlock_irq(mlx4_tlock(dev)); + if (!err && eq) + *eq = r; + return err; } @@ -1398,43 +1625,29 @@ static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn, spin_lock_irq(mlx4_tlock(dev)); r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn); - if (!r) + if (!r) { err = -ENOENT; - else if (r->com.owner != slave) + } else if (r->com.owner 
!= slave) { err = -EPERM; - else { - switch (state) { - case RES_CQ_BUSY: - err = -EBUSY; - break; - - case RES_CQ_ALLOCATED: - if (r->com.state != RES_CQ_HW) - err = -EINVAL; - else if (atomic_read(&r->ref_count)) - err = -EBUSY; - else - err = 0; - break; - - case RES_CQ_HW: - if (r->com.state != RES_CQ_ALLOCATED) - err = -EINVAL; - else - err = 0; - break; - - default: + } else if (state == RES_CQ_ALLOCATED) { + if (r->com.state != RES_CQ_HW) err = -EINVAL; - } + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + else + err = 0; + } else if (state != RES_CQ_HW || r->com.state != RES_CQ_ALLOCATED) { + err = -EINVAL; + } else { + err = 0; + } - if (!err) { - r->com.from_state = r->com.state; - r->com.to_state = state; - r->com.state = RES_CQ_BUSY; - if (cq) - *cq = r; - } + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_CQ_BUSY; + if (cq) + *cq = r; } spin_unlock_irq(mlx4_tlock(dev)); @@ -1452,39 +1665,25 @@ static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, spin_lock_irq(mlx4_tlock(dev)); r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index); - if (!r) + if (!r) { err = -ENOENT; - else if (r->com.owner != slave) + } else if (r->com.owner != slave) { err = -EPERM; - else { - switch (state) { - case RES_SRQ_BUSY: + } else if (state == RES_SRQ_ALLOCATED) { + if (r->com.state != RES_SRQ_HW) err = -EINVAL; - break; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + } else if (state != RES_SRQ_HW || r->com.state != RES_SRQ_ALLOCATED) { + err = -EINVAL; + } - case RES_SRQ_ALLOCATED: - if (r->com.state != RES_SRQ_HW) - err = -EINVAL; - else if (atomic_read(&r->ref_count)) - err = -EBUSY; - break; - - case RES_SRQ_HW: - if (r->com.state != RES_SRQ_ALLOCATED) - err = -EINVAL; - break; - - default: - err = -EINVAL; - } - - if (!err) { - r->com.from_state = r->com.state; - r->com.to_state = state; - r->com.state = RES_SRQ_BUSY; - if (srq) - *srq = r; - } + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_SRQ_BUSY; + if (srq) + *srq = r; } spin_unlock_irq(mlx4_tlock(dev)); @@ -1581,7 +1780,7 @@ static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, return err; if (!fw_reserved(dev, qpn)) { - err = __mlx4_qp_alloc_icm(dev, qpn); + err = __mlx4_qp_alloc_icm(dev, qpn, GFP_KERNEL); if (err) { res_abort_move(dev, slave, RES_QP, qpn); return err; @@ -1624,8 +1823,9 @@ static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, if (err) { mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); __mlx4_free_mtt_range(dev, base, order); - } else + } else { set_param_l(out_param, base); + } return err; } @@ -1667,7 +1867,7 @@ static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, if (err) return err; - err = __mlx4_mpt_alloc_icm(dev, mpt->key); + err = __mlx4_mpt_alloc_icm(dev, mpt->key, GFP_KERNEL); if (err) { res_abort_move(dev, slave, RES_MPT, id); return err; @@ -1799,7 +1999,6 @@ static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, return 0; } - static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac, int port) { @@ -1852,6 +2051,11 @@ static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, return err; port = !in_port ? 
get_param_l(out_param) : in_port; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; mac = in_param; err = __mlx4_register_mac(dev, port, mac); @@ -1949,19 +2153,21 @@ static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; - int err = -EINVAL; + int err; u16 vlan; int vlan_index; int port; port = !in_port ? get_param_l(out_param) : in_port; - if (!port) - return err; + if (!port || op != RES_OP_RESERVE_AND_MAP) + return -EINVAL; - if (op != RES_OP_RESERVE_AND_MAP) - return err; + port = mlx4_slave_convert_port( + dev, slave, port); + if (port < 0) + return -EINVAL; /* upstream kernels had NOP for reg/unreg vlan. Continue this. */ if (!in_port && port > 0 && port <= dev->caps.num_ports) { slave_state[slave].old_vlan_api = true; @@ -1989,9 +2195,23 @@ static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, if (op != RES_OP_RESERVE) return -EINVAL; - err = __mlx4_counter_alloc(dev, slave, port, &index); - if (!err) + err = mlx4_grant_resource(dev, slave, RES_COUNTER, 1, 0); + if (err) + return err; + + err = __mlx4_counter_alloc(dev, &index); + if (err) { + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + return err; + } + + err = add_res_range(dev, slave, index, 1, RES_COUNTER, port); + if (err) { + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } else { set_param_l(out_param, index); + } return err; } @@ -2067,8 +2287,7 @@ int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, case RES_COUNTER: err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop, - vhcr->in_param, &vhcr->out_param, - (vhcr->in_modifier >> 8) & 0xFF); + vhcr->in_param, &vhcr->out_param, 0); break; case RES_XRCD: @@ -2169,16 +2388,15 @@ static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, __mlx4_mpt_release(dev, index); break; case RES_OP_MAP_ICM: - index = get_param_l(&in_param); - id = index & mpt_mask(dev); - err = mr_res_start_move_to(dev, slave, id, - RES_MPT_RESERVED, &mpt); - if (err) - return err; - - __mlx4_mpt_free_icm(dev, mpt->key); - res_end_move(dev, slave, RES_MPT, id); + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_RESERVED, &mpt); + if (err) return err; + + __mlx4_mpt_free_icm(dev, mpt->key); + res_end_move(dev, slave, RES_MPT, id); break; default: err = -EINVAL; @@ -2246,6 +2464,11 @@ static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, switch (op) { case RES_OP_RESERVE_AND_MAP: port = !in_port ? 
get_param_l(out_param) : in_port; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; mac_del_from_slave(dev, slave, in_param, port); __mlx4_unregister_mac(dev, port, in_param); break; @@ -2265,9 +2488,14 @@ static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; int err = 0; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; switch (op) { case RES_OP_RESERVE_AND_MAP: - if (slave_state[slave].old_vlan_api == true) + if (slave_state[slave].old_vlan_api) return 0; if (!port) return -EINVAL; @@ -2283,18 +2511,26 @@ static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, } static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, - u64 in_param, u64 *out_param, int port) + u64 in_param, u64 *out_param) { int index; + int err; if (op != RES_OP_RESERVE) return -EINVAL; index = get_param_l(&in_param); + if (index == MLX4_SINK_COUNTER_INDEX(dev)) + return 0; - __mlx4_counter_free(dev, slave, port, index); + err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) + return err; - return 0; + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + + return err; } static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, @@ -2365,8 +2601,7 @@ int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, case RES_COUNTER: err = counter_free_res(dev, slave, vhcr->op_modifier, alop, - vhcr->in_param, &vhcr->out_param, - (vhcr->in_modifier >> 8) & 0xFF); + vhcr->in_param, &vhcr->out_param); break; case RES_XRCD: @@ -2484,18 +2719,16 @@ int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; - /* Currently disable memory windows since this feature isn't tested yet - * under virtualization. - */ + /* Disable memory windows for VFs. */ if (!mr_is_region(inbox->buf)) { - err = -ENOSYS; + err = -EPERM; goto ex_abort; } /* Make sure that the PD bits related to the slave id are zeros. */ pd = mr_get_pd(inbox->buf); pd_slave = (pd >> 17) & 0x7f; - if (pd_slave != 0 && pd_slave != slave) { + if (pd_slave != 0 && --pd_slave != slave) { err = -EPERM; goto ex_abort; } @@ -2596,12 +2829,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; - if (mpt->com.from_state != RES_MPT_HW) { + if (mpt->com.from_state == RES_MPT_MAPPED) { + /* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do + * that, the VF must read the MPT. But since the MPT entry memory is not + * in the VF's virtual memory space, it must use QUERY_MPT to obtain the + * entry contents. To guarantee that the MPT cannot be changed, the driver + * must perform HW2SW_MPT before this query and return the MPT entry to HW + * ownership fofollowing the change. The change here allows the VF to + * perform QUERY_MPT also when the entry is in SW ownership. 
+ */ + struct mlx4_mpt_entry *mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + mpt->key, NULL); + + if (NULL == mpt_entry || NULL == outbox->buf) { + err = -EINVAL; + goto out; + } + + memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry)); + + err = 0; + } else if (mpt->com.from_state == RES_MPT_HW) { + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + } else { err = -EBUSY; goto out; } - err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); out: put_res(dev, slave, id, RES_MPT); @@ -2636,6 +2891,10 @@ static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr, context->qkey = cpu_to_be32(qkey); } +static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox); + int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -2658,6 +2917,10 @@ int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, struct res_srq *srq; int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) + return err; + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0); if (err) return err; @@ -2778,7 +3041,7 @@ int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, { int err; int eqn = vhcr->in_modifier; - int res_id = (slave << 8) | eqn; + int res_id = (slave << 10) | eqn; struct mlx4_eq_context *eqc = inbox->buf; int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz; int mtt_size = eq_get_mtt_size(eqc); @@ -2819,6 +3082,23 @@ int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, return err; } +int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + u8 get = vhcr->op_modifier; + + if (get != 1) + return -EPERM; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + + return err; +} + static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, int len, struct res_mtt **res) { @@ -2844,10 +3124,12 @@ static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, } static int verify_qp_parameters(struct mlx4_dev *dev, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, enum qp_transition transition, u8 slave) { u32 qp_type; + u32 qpn; struct mlx4_qp_context *qp_ctx; enum mlx4_qp_optpar optpar; int port; @@ -2857,8 +3139,16 @@ static int verify_qp_parameters(struct mlx4_dev *dev, qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; optpar = be32_to_cpu(*(__be32 *) inbox->buf); + if (slave != mlx4_master_func_num(dev)) { + qp_ctx->params2 &= ~MLX4_QP_BIT_FPP; + /* setting QP rate-limit is disallowed for VFs */ + if (qp_ctx->rate_limit_params) + return -EPERM; + } + switch (qp_type) { case MLX4_QP_ST_RC: + case MLX4_QP_ST_XRC: case MLX4_QP_ST_UC: switch (transition) { case QP_TRANS_INIT2RTR: @@ -2866,11 +3156,11 @@ static int verify_qp_parameters(struct mlx4_dev *dev, case QP_TRANS_RTS2RTS: case QP_TRANS_SQD2SQD: case QP_TRANS_SQD2RTS: - if (slave != mlx4_master_func_num(dev)) + if (slave != mlx4_master_func_num(dev)) { if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) - num_gids = mlx4_get_slave_num_gids(dev, slave); + num_gids = mlx4_get_slave_num_gids(dev, slave, port); else num_gids = 1; if (qp_ctx->pri_path.mgid_index >= num_gids) @@ -2879,18 
+3169,33 @@ static int verify_qp_parameters(struct mlx4_dev *dev, if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) - num_gids = mlx4_get_slave_num_gids(dev, slave); + num_gids = mlx4_get_slave_num_gids(dev, slave, port); else num_gids = 1; if (qp_ctx->alt_path.mgid_index >= num_gids) return -EINVAL; } + } break; default: break; } - break; + + case MLX4_QP_ST_MLX: + qpn = vhcr->in_modifier & 0x7fffff; + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (transition == QP_TRANS_INIT2RTR && + slave != mlx4_master_func_num(dev) && + mlx4_is_qp_reserved(dev, qpn) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + /* only enabled VFs may create MLX proxy QPs */ + mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", + __func__, slave, port); + return -EPERM; + } + break; + default: break; } @@ -2943,7 +3248,7 @@ int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { int eqn = vhcr->in_modifier; - int res_id = eqn | (slave << 8); + int res_id = eqn | (slave << 10); struct res_eq *eq; int err; @@ -2988,7 +3293,7 @@ int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) return -EINVAL; /* check for slave valid, slave not PF, and slave active */ - if (slave < 0 || slave >= dev->num_slaves || + if (slave < 0 || slave > dev->persist->num_vfs || slave == dev->caps.function || !priv->mfunc.master.slave_state[slave].active) return 0; @@ -3000,7 +3305,7 @@ int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) return 0; mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]); - res_id = (slave << 8) | event_eq->eqn; + res_id = (slave << 10) | event_eq->eqn; err = get_res(dev, slave, res_id, RES_EQ, &req); if (err) goto unlock; @@ -3023,7 +3328,7 @@ int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) memcpy(mailbox->buf, (u8 *) eqe, 28); - in_modifier = (slave & 0xff) | ((event_eq->eqn & 0xff) << 16); + in_modifier = (slave & 0xff) | ((event_eq->eqn & 0x3ff) << 16); err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0, MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B, @@ -3049,7 +3354,7 @@ int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { int eqn = vhcr->in_modifier; - int res_id = eqn | (slave << 8); + int res_id = eqn | (slave << 10); struct res_eq *eq; int err; @@ -3079,7 +3384,7 @@ int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, int cqn = vhcr->in_modifier; struct mlx4_cq_context *cqc = inbox->buf; int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; - struct res_cq *cq; + struct res_cq *cq = NULL; struct res_mtt *mtt; err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq); @@ -3115,7 +3420,7 @@ int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, { int err; int cqn = vhcr->in_modifier; - struct res_cq *cq; + struct res_cq *cq = NULL; err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq); if (err) @@ -3254,7 +3559,7 @@ int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, int err; int srqn = vhcr->in_modifier; struct res_mtt *mtt; - struct res_srq *srq; + struct res_srq *srq = NULL; struct mlx4_srq_context *srqc = inbox->buf; int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz; @@ -3298,7 +3603,7 @@ int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, { int err; int srqn = vhcr->in_modifier; - struct res_srq *srq; + struct res_srq *srq = NULL; err = srq_res_start_move_to(dev, 
slave, srqn, RES_SRQ_ALLOCATED, &srq); if (err) @@ -3403,6 +3708,39 @@ int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); } +static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *)inbox->buf); + u8 pri_sched_queue; + int port = mlx4_slave_convert_port( + dev, slave, (qpc->pri_path.sched_queue >> 6 & 1) + 1) - 1; + + if (port < 0) + return -EINVAL; + + pri_sched_queue = (qpc->pri_path.sched_queue & ~(1 << 6)) | + ((port & 1) << 6); + + if (optpar & (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE) || + qpc->pri_path.sched_queue || mlx4_is_eth(dev, port + 1)) { + qpc->pri_path.sched_queue = pri_sched_queue; + } + + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = mlx4_slave_convert_port( + dev, slave, (qpc->alt_path.sched_queue >> 6 & 1) + + 1) - 1; + if (port < 0) + return -EINVAL; + qpc->alt_path.sched_queue = + (qpc->alt_path.sched_queue & ~(1 << 6)) | + (port & 1) << 6; + } + return 0; +} + static int roce_verify_mac(struct mlx4_dev *dev, int slave, struct mlx4_qp_context *qpc, struct mlx4_cmd_mailbox *inbox) @@ -3440,7 +3778,10 @@ int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, u8 orig_vlan_index = qpc->pri_path.vlan_index; u8 orig_feup = qpc->pri_path.feup; - err = verify_qp_parameters(dev, inbox, QP_TRANS_INIT2RTR, slave); + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave); if (err) return err; @@ -3460,12 +3801,9 @@ int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, goto out; } - /* do not modify vport QP params for RSS QPs */ - if (!(qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET))) { - err = update_vport_qp_param(dev, inbox, slave, qpn); - if (err) - goto out; - } + err = update_vport_qp_param(dev, inbox, slave, qpn); + if (err) + goto out; err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); out: @@ -3495,7 +3833,10 @@ int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, int err; struct mlx4_qp_context *context = inbox->buf + 8; - err = verify_qp_parameters(dev, inbox, QP_TRANS_RTR2RTS, slave); + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave); if (err) return err; @@ -3514,7 +3855,10 @@ int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, int err; struct mlx4_qp_context *context = inbox->buf + 8; - err = verify_qp_parameters(dev, inbox, QP_TRANS_RTS2RTS, slave); + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave); if (err) return err; @@ -3532,6 +3876,9 @@ int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { struct mlx4_qp_context *context = inbox->buf + 8; + int err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; adjust_proxy_tun_qkey(dev, vhcr, context); return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); } @@ -3545,7 +3892,10 @@ int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, int err; struct mlx4_qp_context *context = inbox->buf + 8; - err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2SQD, slave); + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = 
verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave); if (err) return err; @@ -3564,7 +3914,10 @@ int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, int err; struct mlx4_qp_context *context = inbox->buf + 8; - err = verify_qp_parameters(dev, inbox, QP_TRANS_SQD2RTS, slave); + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave); if (err) return err; @@ -3667,16 +4020,26 @@ static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, return err; } -static int qp_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_loopback, enum mlx4_protocol prot, +static int qp_attach(struct mlx4_dev *dev, int slave, struct mlx4_qp *qp, + u8 gid[16], int block_loopback, enum mlx4_protocol prot, enum mlx4_steer_type type, u64 *reg_id) { switch (dev->caps.steering_mode) { - case MLX4_STEERING_MODE_DEVICE_MANAGED: - return mlx4_trans_to_dmfs_attach(dev, qp, gid, gid[5], + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + int port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (port < 0) + return port; + return mlx4_trans_to_dmfs_attach(dev, qp, gid, port, block_loopback, prot, reg_id); + } case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) { + int port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (port < 0) + return port; + gid[5] = port; + } return mlx4_qp_attach_common(dev, qp, gid, block_loopback, prot, type); default: @@ -3684,9 +4047,9 @@ static int qp_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } } -static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - enum mlx4_protocol prot, enum mlx4_steer_type type, - u64 reg_id) +static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], enum mlx4_protocol prot, + enum mlx4_steer_type type, u64 reg_id) { switch (dev->caps.steering_mode) { case MLX4_STEERING_MODE_DEVICE_MANAGED: @@ -3698,6 +4061,25 @@ static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } } +static int mlx4_adjust_port(struct mlx4_dev *dev, int slave, + u8 *gid, enum mlx4_protocol prot) +{ + int real_port; + + if (prot != MLX4_PROT_ETH) + return 0; + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0 || + dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + real_port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (real_port < 0) + return -EINVAL; + gid[5] = real_port; + } + + return 0; +} + int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -3723,7 +4105,7 @@ int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, qp.qpn = qpn; if (attach) { - err = qp_attach(dev, &qp, gid, block_loopback, prot, + err = qp_attach(dev, slave, &qp, gid, block_loopback, prot, type, ®_id); if (err) { pr_err("Fail to attach rule to qp 0x%x\n", qpn); @@ -3733,6 +4115,10 @@ int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, if (err) goto ex_detach; } else { + err = mlx4_adjust_port(dev, slave, gid, prot); + if (err) + goto ex_put; + err = rem_mcg_res(dev, slave, rqp, gid, prot, type, ®_id); if (err) goto ex_put; @@ -3767,7 +4153,7 @@ static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header, !is_broadcast_ether_addr(eth_header->eth.dst_mac)) { list_for_each_entry_safe(res, tmp, rlist, list) { be_mac = cpu_to_be64(res->mac << 16); - if (!memcmp(&be_mac, eth_header->eth.dst_mac, ETH_ALEN)) + if (ether_addr_equal((u8 *)&be_mac, eth_header->eth.dst_mac)) return 0; 
} pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n", @@ -3777,6 +4163,22 @@ static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header, return 0; } +static void handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl, + struct _rule_hw *eth_header) +{ + if (is_multicast_ether_addr(eth_header->eth.dst_mac) || + is_broadcast_ether_addr(eth_header->eth.dst_mac)) { + struct mlx4_net_trans_rule_hw_eth *eth = + (struct mlx4_net_trans_rule_hw_eth *)eth_header; + struct _rule_hw *next_rule = (struct _rule_hw *)(eth + 1); + bool last_rule = next_rule->size == 0 && next_rule->id == 0 && + next_rule->rsvd == 0; + + if (last_rule) + ctrl->prio = cpu_to_be16(MLX4_DOMAIN_NIC); + } +} + /* * In case of missing eth header, append eth header with a MAC address * assigned to the VF. @@ -3822,7 +4224,7 @@ static int add_eth_header(struct mlx4_dev *dev, int slave, } } if (!be_mac) { - pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d .\n", + pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d\n", port); return -EINVAL; } @@ -3834,9 +4236,93 @@ static int add_eth_header(struct mlx4_dev *dev, int slave, memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN); return 0; - } +#define MLX4_UPD_QP_PATH_MASK_SUPPORTED ( \ + 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX |\ + 1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB) +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd_info) +{ + int err; + u32 qpn = vhcr->in_modifier & 0xffffff; + struct res_qp *rqp; + u64 mac; + unsigned port; + u64 pri_addr_path_mask; + struct mlx4_update_qp_context *cmd; + int smac_index; + + cmd = (struct mlx4_update_qp_context *)inbox->buf; + + pri_addr_path_mask = be64_to_cpu(cmd->primary_addr_path_mask); + if (cmd->qp_mask || cmd->secondary_addr_path_mask || + (pri_addr_path_mask & ~MLX4_UPD_QP_PATH_MASK_SUPPORTED)) + return -EPERM; + + if ((pri_addr_path_mask & + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB)) && + !(dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) { + mlx4_warn(dev, "Src check LB for slave %d isn't supported\n", + slave); + return -ENOTSUPP; + } + + /* Just change the smac for the QP */ + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + mlx4_err(dev, "Updating qpn 0x%x for slave %d rejected\n", qpn, slave); + return err; + } + + port = (rqp->sched_queue >> 6 & 1) + 1; + + if (pri_addr_path_mask & (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX)) { + smac_index = cmd->qp_context.pri_path.grh_mylmc; + err = mac_find_smac_ix_in_slave(dev, slave, port, + smac_index, &mac); + + if (err) { + mlx4_err(dev, "Failed to update qpn 0x%x, MAC is invalid. 
smac_ix: %d\n", + qpn, smac_index); + goto err_mac; + } + } + + err = mlx4_cmd(dev, inbox->dma, + vhcr->in_modifier, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to update qpn on qpn 0x%x, command failed\n", qpn); + goto err_mac; + } + +err_mac: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +static u32 qp_attach_mbox_size(void *mbox) +{ + u32 size = sizeof(struct mlx4_net_trans_rule_hw_ctrl); + struct _rule_hw *rule_header; + + rule_header = (struct _rule_hw *)(mbox + size); + + while (rule_header->size) { + size += rule_header->size * sizeof(u32); + rule_header += 1; + } + return size; +} + +static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule); + int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -3853,26 +4339,38 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_net_trans_rule_hw_ctrl *ctrl; struct _rule_hw *rule_header; int header_id; + struct res_fs_rule *rrule; + u32 mbox_size; if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return -EOPNOTSUPP; ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + err = mlx4_slave_convert_port(dev, slave, ctrl->port); + if (err <= 0) + return -EINVAL; + ctrl->port = err; qpn = be32_to_cpu(ctrl->qpn) & 0xffffff; err = get_res(dev, slave, qpn, RES_QP, &rqp); if (err) { - pr_err("Steering rule with qpn 0x%x rejected.\n", qpn); + pr_err("Steering rule with qpn 0x%x rejected\n", qpn); return err; } rule_header = (struct _rule_hw *)(ctrl + 1); header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id)); + if (header_id == MLX4_NET_TRANS_RULE_ID_ETH) + handle_eth_header_mcast_prio(ctrl, rule_header); + + if (slave == dev->caps.function) + goto execute; + switch (header_id) { case MLX4_NET_TRANS_RULE_ID_ETH: if (validate_eth_header_mac(slave, rule_header, rlist)) { err = -EINVAL; - goto err_put; + goto err_put_qp; } break; case MLX4_NET_TRANS_RULE_ID_IB: @@ -3880,42 +4378,89 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, case MLX4_NET_TRANS_RULE_ID_IPV4: case MLX4_NET_TRANS_RULE_ID_TCP: case MLX4_NET_TRANS_RULE_ID_UDP: - pr_warn("Can't attach FS rule without L2 headers, adding L2 header.\n"); + pr_warn("Can't attach FS rule without L2 headers, adding L2 header\n"); if (add_eth_header(dev, slave, inbox, rlist, header_id)) { err = -EINVAL; - goto err_put; + goto err_put_qp; } vhcr->in_modifier += sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2; break; default: - pr_err("Corrupted mailbox.\n"); + pr_err("Corrupted mailbox\n"); err = -EINVAL; - goto err_put; + goto err_put_qp; } +execute: err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param, vhcr->in_modifier, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) - goto err_put; + goto err_put_qp; + err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn); if (err) { - mlx4_err(dev, "Fail to add flow steering resources.\n "); - /* detach rule*/ + mlx4_err(dev, "Fail to add flow steering resources\n"); + goto err_detach; + } + + err = get_res(dev, slave, vhcr->out_param, RES_FS_RULE, &rrule); + if (err) + goto err_detach; + + mbox_size = qp_attach_mbox_size(inbox->buf); + rrule->mirr_mbox = kmalloc(mbox_size, GFP_KERNEL); + if (!rrule->mirr_mbox) { + err = -ENOMEM; + goto err_put_rule; + } + rrule->mirr_mbox_size = mbox_size; + rrule->mirr_rule_id = 0; + memcpy(rrule->mirr_mbox, inbox->buf, mbox_size); + + /* 
set different port */ + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)rrule->mirr_mbox; + if (ctrl->port == 1) + ctrl->port = 2; + else + ctrl->port = 1; + + if (mlx4_is_bonded(dev)) + mlx4_do_mirror_rule(dev, rrule); + + atomic_inc(&rqp->ref_count); + +err_put_rule: + put_res(dev, slave, vhcr->out_param, RES_FS_RULE); +err_detach: + /* detach rule on error */ + if (err) mlx4_cmd(dev, vhcr->out_param, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); - goto err_put; - } - atomic_inc(&rqp->ref_count); -err_put: +err_put_qp: put_res(dev, slave, qpn, RES_QP); return err; } +static int mlx4_undo_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule) +{ + int err; + + err = rem_res_range(dev, fs_rule->com.owner, fs_rule->com.res_id, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to remove flow steering resources\n"); + return err; + } + + mlx4_cmd(dev, fs_rule->com.res_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + return 0; +} + int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -3925,6 +4470,7 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, int err; struct res_qp *rqp; struct res_fs_rule *rrule; + u64 mirr_reg_id; if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) @@ -3933,26 +4479,41 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule); if (err) return err; + + if (!rrule->mirr_mbox) { + mlx4_err(dev, "Mirror rules cannot be removed explicitly\n"); + put_res(dev, slave, vhcr->in_param, RES_FS_RULE); + return -EINVAL; + } + mirr_reg_id = rrule->mirr_rule_id; + kfree(rrule->mirr_mbox); + /* Release the rule form busy state before removal */ put_res(dev, slave, vhcr->in_param, RES_FS_RULE); err = get_res(dev, slave, rrule->qpn, RES_QP, &rqp); if (err) return err; + if (mirr_reg_id && mlx4_is_bonded(dev)) { + err = get_res(dev, slave, mirr_reg_id, RES_FS_RULE, &rrule); + if (err) { + mlx4_err(dev, "Fail to get resource of mirror rule\n"); + } else { + put_res(dev, slave, mirr_reg_id, RES_FS_RULE); + mlx4_undo_mirror_rule(dev, rrule); + } + } + err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to remove flow steering resources\n"); + goto out; + } + err = mlx4_cmd(dev, vhcr->in_param, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); - if (!err) { - err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, - 0); + if (!err) atomic_dec(&rqp->ref_count); - - if (err) { - mlx4_err(dev, "Fail to remove flow steering resources.\n "); - goto out; - } - } - out: put_res(dev, slave, rrule->qpn, RES_QP); return err; @@ -3969,9 +4530,14 @@ int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { int err; + int index = vhcr->in_modifier & 0xffff; + + err = get_res(dev, slave, index, RES_COUNTER, NULL); + if (err) + return err; err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); - + put_res(dev, slave, index, RES_COUNTER); return err; } @@ -4017,8 +4583,8 @@ static int _move_all_busy(struct mlx4_dev *dev, int slave, if (print) mlx4_dbg(dev, "%s id 0x%llx is busy\n", - ResourceType(type), - (unsigned long long)r->res_id); + resource_str(type), + (long long)r->res_id); ++busy; } else { r->from_state = r->state; @@ -4068,8 +4634,8 @@ static void rem_slave_qps(struct mlx4_dev *dev, int slave) 
err = move_all_busy(dev, slave, RES_QP); if (err) - mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy" - "for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(qp, tmp, qp_list, com.list) { @@ -4107,10 +4673,8 @@ static void rem_slave_qps(struct mlx4_dev *dev, int slave) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) - mlx4_dbg(dev, "rem_slave_qps: failed" - " to move slave %d qpn %d to" - " reset\n", slave, - qp->local_qpn); + mlx4_dbg(dev, "rem_slave_qps: failed to move slave %d qpn %d to reset\n", + slave, qp->local_qpn); atomic_dec(&qp->rcq->ref_count); atomic_dec(&qp->scq->ref_count); atomic_dec(&qp->mtt->ref_count); @@ -4144,8 +4708,8 @@ static void rem_slave_srqs(struct mlx4_dev *dev, int slave) err = move_all_busy(dev, slave, RES_SRQ); if (err) - mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs to " - "busy for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs - too busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(srq, tmp, srq_list, com.list) { @@ -4175,9 +4739,7 @@ static void rem_slave_srqs(struct mlx4_dev *dev, int slave) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) - mlx4_dbg(dev, "rem_slave_srqs: failed" - " to move slave %d srq %d to" - " SW ownership\n", + mlx4_dbg(dev, "rem_slave_srqs: failed to move slave %d srq %d to SW ownership\n", slave, srqn); atomic_dec(&srq->mtt->ref_count); @@ -4212,8 +4774,8 @@ static void rem_slave_cqs(struct mlx4_dev *dev, int slave) err = move_all_busy(dev, slave, RES_CQ); if (err) - mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs to " - "busy for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs - too busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(cq, tmp, cq_list, com.list) { @@ -4243,9 +4805,7 @@ static void rem_slave_cqs(struct mlx4_dev *dev, int slave) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) - mlx4_dbg(dev, "rem_slave_cqs: failed" - " to move slave %d cq %d to" - " SW ownership\n", + mlx4_dbg(dev, "rem_slave_cqs: failed to move slave %d cq %d to SW ownership\n", slave, cqn); atomic_dec(&cq->mtt->ref_count); state = RES_CQ_ALLOCATED; @@ -4277,8 +4837,8 @@ static void rem_slave_mrs(struct mlx4_dev *dev, int slave) err = move_all_busy(dev, slave, RES_MPT); if (err) - mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts to " - "busy for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts - too busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) { @@ -4313,9 +4873,7 @@ static void rem_slave_mrs(struct mlx4_dev *dev, int slave) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) - mlx4_dbg(dev, "rem_slave_mrs: failed" - " to move slave %d mpt %d to" - " SW ownership\n", + mlx4_dbg(dev, "rem_slave_mrs: failed to move slave %d mpt %d to SW ownership\n", slave, mptn); if (mpt->mtt) atomic_dec(&mpt->mtt->ref_count); @@ -4347,8 +4905,8 @@ static void rem_slave_mtts(struct mlx4_dev *dev, int slave) err = move_all_busy(dev, slave, RES_MTT); if (err) - mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts to " - "busy for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts - too busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) { @@ -4382,6 +4940,91 @@ static void 
rem_slave_mtts(struct mlx4_dev *dev, int slave) spin_unlock_irq(mlx4_tlock(dev)); } +static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct res_fs_rule *mirr_rule; + u64 reg_id; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (!fs_rule->mirr_mbox) { + mlx4_err(dev, "rule mirroring mailbox is null\n"); + return -EINVAL; + } + memcpy(mailbox->buf, fs_rule->mirr_mbox, fs_rule->mirr_mbox_size); + err = mlx4_cmd_imm(dev, mailbox->dma, ®_id, fs_rule->mirr_mbox_size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + mlx4_free_cmd_mailbox(dev, mailbox); + + if (err) + goto err; + + err = add_res_range(dev, fs_rule->com.owner, reg_id, 1, RES_FS_RULE, fs_rule->qpn); + if (err) + goto err_detach; + + err = get_res(dev, fs_rule->com.owner, reg_id, RES_FS_RULE, &mirr_rule); + if (err) + goto err_rem; + + fs_rule->mirr_rule_id = reg_id; + mirr_rule->mirr_rule_id = 0; + mirr_rule->mirr_mbox_size = 0; + mirr_rule->mirr_mbox = NULL; + put_res(dev, fs_rule->com.owner, reg_id, RES_FS_RULE); + + return 0; +err_rem: + rem_res_range(dev, fs_rule->com.owner, reg_id, 1, RES_FS_RULE, 0); +err_detach: + mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +err: + return err; +} + +static int mlx4_mirror_fs_rules(struct mlx4_dev *dev, bool bond) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct rb_root *root = &tracker->res_tree[RES_FS_RULE]; + struct rb_node *p; + struct res_fs_rule *fs_rule; + int err = 0; + LIST_HEAD(mirr_list); + + for (p = rb_first(root); p; p = rb_next(p)) { + fs_rule = rb_entry(p, struct res_fs_rule, com.node); + if ((bond && fs_rule->mirr_mbox_size) || + (!bond && !fs_rule->mirr_mbox_size)) + list_add_tail(&fs_rule->mirr_list, &mirr_list); + } + + list_for_each_entry(fs_rule, &mirr_list, mirr_list) { + if (bond) + err += mlx4_do_mirror_rule(dev, fs_rule); + else + err += mlx4_undo_mirror_rule(dev, fs_rule); + } + return err; +} + +int mlx4_bond_fs_rules(struct mlx4_dev *dev) +{ + return mlx4_mirror_fs_rules(dev, true); +} + +int mlx4_unbond_fs_rules(struct mlx4_dev *dev) +{ + return mlx4_mirror_fs_rules(dev, false); +} + static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -4446,12 +5089,11 @@ static void rem_slave_eqs(struct mlx4_dev *dev, int slave) int state; LIST_HEAD(tlist); int eqn; - struct mlx4_cmd_mailbox *mailbox; err = move_all_busy(dev, slave, RES_EQ); if (err) - mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs to " - "busy for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs - too busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(eq, tmp, eq_list, com.list) { @@ -4472,21 +5114,13 @@ static void rem_slave_eqs(struct mlx4_dev *dev, int slave) break; case RES_EQ_HW: - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) { - cond_resched(); - continue; - } - err = mlx4_cmd_box(dev, slave, 0, - eqn & 0xff, 0, - MLX4_CMD_HW2SW_EQ, - MLX4_CMD_TIME_CLASS_A, - MLX4_CMD_NATIVE); + err = mlx4_cmd(dev, slave, eqn & 0x3ff, + 1, MLX4_CMD_HW2SW_EQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); if (err) - mlx4_dbg(dev, "rem_slave_eqs: failed" - " to move slave %d eqs %d to" - " SW ownership\n", slave, eqn); - mlx4_free_cmd_mailbox(dev, mailbox); + mlx4_dbg(dev, 
"rem_slave_eqs: failed to move slave %d eqs %d to SW ownership\n", + slave, eqn & 0x3ff); atomic_dec(&eq->mtt->ref_count); state = RES_EQ_RESERVED; break; @@ -4503,7 +5137,48 @@ static void rem_slave_eqs(struct mlx4_dev *dev, int slave) static void rem_slave_counters(struct mlx4_dev *dev, int slave) { - __mlx4_slave_counters_free(dev, slave); + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *counter_list = + &tracker->slave_list[slave].res_list[RES_COUNTER]; + struct res_counter *counter; + struct res_counter *tmp; + int err; + int *counters_arr = NULL; + int i, j; + + err = move_all_busy(dev, slave, RES_COUNTER); + if (err) + mlx4_warn(dev, "rem_slave_counters: Could not move all counters - too busy for slave %d\n", + slave); + + counters_arr = kmalloc_array(dev->caps.max_counters, + sizeof(*counters_arr), GFP_KERNEL); + if (!counters_arr) + return; + + do { + i = 0; + j = 0; + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(counter, tmp, counter_list, com.list) { + if (counter->com.owner == slave) { + counters_arr[i++] = counter->com.res_id; + rb_erase(&counter->com.node, + &tracker->res_tree[RES_COUNTER]); + list_del(&counter->com.list); + kfree(counter); + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + while (j < i) { + __mlx4_counter_free(dev, counters_arr[j++]); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } + } while (i); + + kfree(counters_arr); } static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) @@ -4519,8 +5194,8 @@ static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) err = move_all_busy(dev, slave, RES_XRCD); if (err) - mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns to " - "busy for slave %d\n", slave); + mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns - too busy for slave %d\n", + slave); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) { @@ -4538,10 +5213,10 @@ static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = mlx4_priv(dev); - + mlx4_reset_roce_gids(dev, slave); mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); - rem_slave_macs(dev, slave); rem_slave_vlans(dev, slave); + rem_slave_macs(dev, slave); rem_slave_fs_rule(dev, slave); rem_slave_qps(dev, slave); rem_slave_srqs(dev, slave); @@ -4578,6 +5253,7 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) | (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) | (1ULL << MLX4_UPD_QP_PATH_MASK_CV) | + (1ULL << MLX4_UPD_QP_PATH_MASK_SV) | (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) | (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) | (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) | @@ -4596,17 +5272,28 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) goto out; - - if (!work->vlan_id) + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE) /* block all */ + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + else if (!work->vlan_id) vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; - else + else if (work->vlan_proto == htons(ETH_P_8021AD)) + 
vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + else /* vst 802.1Q */ vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; upd_context = mailbox->buf; - upd_context->qp_mask = cpu_to_be64(MLX4_UPD_QP_MASK_VSD); + upd_context->qp_mask = cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_VSD); spin_lock_irq(mlx4_tlock(dev)); list_for_each_entry_safe(qp, tmp, qp_list, com.list) { @@ -4645,13 +5332,22 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN; upd_context->qp_context.pri_path.fl = - qp->pri_path_fl | MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qp->pri_path_fl | MLX4_FL_ETH_HIDE_CQE_VLAN; + if (work->vlan_proto == htons(ETH_P_8021AD)) + upd_context->qp_context.pri_path.fl |= MLX4_FL_SV; + else + upd_context->qp_context.pri_path.fl |= MLX4_FL_CV; upd_context->qp_context.pri_path.feup = qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; upd_context->qp_context.pri_path.sched_queue = qp->sched_queue & 0xC7; upd_context->qp_context.pri_path.sched_queue |= ((work->qos & 0x7) << 3); + upd_context->qp_mask |= + cpu_to_be64(1ULL << + MLX4_UPD_QP_MASK_QOS_VPP); + upd_context->qp_context.qos_vport = + work->qos_vport; } err = mlx4_cmd(dev, mailbox->dma, @@ -4659,10 +5355,8 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) 0, MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (err) { - mlx4_info(dev, "UPDATE_QP failed for slave %d, " - "port %d, qpn %d (%d)\n", - work->slave, port, qp->local_qpn, - err); + mlx4_info(dev, "UPDATE_QP failed for slave %d, port %d, qpn %d (%d)\n", + work->slave, port, qp->local_qpn, err); errors++; } } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_sense.c b/sys/dev/mlx4/mlx4_core/mlx4_sense.c index 5d2ee392d348..f11ea11f81da 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_sense.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_sense.c @@ -53,7 +53,7 @@ int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, } if (out_param > 2) { - mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", (unsigned long long)out_param); + mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", (long long)out_param); return -EINVAL; } @@ -98,6 +98,10 @@ static void mlx4_sense_port(struct work_struct *work) enum mlx4_port_type stype[MLX4_MAX_PORTS]; mutex_lock(&priv->port_mutex); + if (sense->gone != 0) { + mutex_unlock(&priv->port_mutex); + return; + } mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]); if (mlx4_check_port_params(dev, stype)) @@ -107,9 +111,9 @@ static void mlx4_sense_port(struct work_struct *work) mlx4_err(dev, "Failed to change port_types\n"); sense_again: - mutex_unlock(&priv->port_mutex); queue_delayed_work(mlx4_wq , &sense->sense_poll, round_jiffies_relative(MLX4_SENSE_RANGE)); + mutex_unlock(&priv->port_mutex); } void mlx4_start_sense(struct mlx4_dev *dev) @@ -120,12 +124,22 @@ void mlx4_start_sense(struct mlx4_dev *dev) if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) return; + mutex_lock(&priv->port_mutex); + sense->gone = 0; queue_delayed_work(mlx4_wq , &sense->sense_poll, round_jiffies_relative(MLX4_SENSE_RANGE)); + mutex_unlock(&priv->port_mutex); } void mlx4_stop_sense(struct mlx4_dev *dev) { + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + + mutex_lock(&priv->port_mutex); + sense->gone = 1; + 
mutex_unlock(&priv->port_mutex); + cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll); } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_srq.c b/sys/dev/mlx4/mlx4_core/mlx4_srq.c index 877ee67918ee..1a3c66065204 100644 --- a/sys/dev/mlx4/mlx4_core/mlx4_srq.c +++ b/sys/dev/mlx4/mlx4_core/mlx4_srq.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "mlx4.h" #include "icm.h" @@ -44,15 +45,12 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; struct mlx4_srq *srq; - spin_lock(&srq_table->lock); - + rcu_read_lock(); srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); + rcu_read_unlock(); if (srq) atomic_inc(&srq->refcount); - - spin_unlock(&srq_table->lock); - - if (!srq) { + else { mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); return; } @@ -102,11 +100,11 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) if (*srqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &srq_table->table, *srqn); + err = mlx4_table_get(dev, &srq_table->table, *srqn, GFP_KERNEL); if (err) goto err_out; - err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn); + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, GFP_KERNEL); if (err) goto err_put; return 0; @@ -187,8 +185,6 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, } srq_context = mailbox->buf; - memset(srq_context, 0, sizeof *srq_context); - srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) | srq->srqn); srq_context->logstride = srq->wqe_shift - 4; @@ -302,12 +298,11 @@ struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; struct mlx4_srq *srq; - unsigned long flags; - spin_lock_irqsave(&srq_table->lock, flags); + rcu_read_lock(); srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); - spin_unlock_irqrestore(&srq_table->lock, flags); + rcu_read_unlock(); return srq; } diff --git a/sys/dev/mlx4/mlx4_core/mlx4_sys_tune.c b/sys/dev/mlx4/mlx4_core/mlx4_sys_tune.c deleted file mode 100644 index 87eb30e6ccfe..000000000000 --- a/sys/dev/mlx4/mlx4_core/mlx4_sys_tune.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2010, 2014 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include -#include - -#include "mlx4.h" - -#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) - -/* Each CPU is put into a group. In most cases, the group number is - * equal to the CPU number of one of the CPUs in the group. The - * exception is group NR_CPUS which is the default group. This is - * protected by sys_tune_startup_mutex. */ -DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS; - -/* For each group, a count of the number of CPUs in the group which - * are known to be busy. A busy CPU might be running the busy loop - * below or general kernel code. The count is decremented on entry to - * the old pm_idle handler and incremented on exit. The aim is to - * avoid the count going to zero or negative. This situation can - * occur temporarily during module unload or CPU hot-plug but - * normality will be restored when the affected CPUs next exit the - * idle loop. */ -static atomic_t busy_cpu_count[NR_CPUS+1]; - -/* A workqueue item to be executed to cause the CPU to exit from the - * idle loop. */ -DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work); - -#define sys_tune_set_state(CPU,STATE) \ - do { } while(0) - - -/* A mutex to protect most of the module datastructures. */ -static DEFINE_MUTEX(sys_tune_startup_mutex); - -/* The old pm_idle handler. */ -static void (*old_pm_idle)(void) = NULL; - -static void sys_tune_pm_idle(void) -{ - atomic_t *busy_cpus_ptr; - int busy_cpus; - int cpu = smp_processor_id(); - - busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]); - - sys_tune_set_state(cpu, 2); - - local_irq_enable(); - while (!need_resched()) { - busy_cpus = atomic_read(busy_cpus_ptr); - - /* If other CPUs in this group are busy then let this - * CPU go idle. We mustn't let the number of busy - * CPUs drop below 1. */ - if ( busy_cpus > 1 && - old_pm_idle != NULL && - ( atomic_cmpxchg(busy_cpus_ptr, busy_cpus, - busy_cpus-1) == busy_cpus ) ) { - local_irq_disable(); - sys_tune_set_state(cpu, 3); - /* This check might not be necessary, but it - * seems safest to include it because there - * might be a kernel version which requires - * it. */ - if (need_resched()) - local_irq_enable(); - else - old_pm_idle(); - /* This CPU is busy again. */ - sys_tune_set_state(cpu, 1); - atomic_add(1, busy_cpus_ptr); - return; - } - - cpu_relax(); - } - sys_tune_set_state(cpu, 0); -} - - -void sys_tune_work_func(struct work_struct *work) -{ - /* Do nothing. Since this function is running in process - * context, the idle thread isn't running on this CPU. */ -} - - -#ifdef CONFIG_SMP -static void sys_tune_smp_call(void *info) -{ - schedule_work(&get_cpu_var(sys_tune_cpu_work)); - put_cpu_var(sys_tune_cpu_work); -} -#endif - - -#ifdef CONFIG_SMP -static void sys_tune_refresh(void) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) - on_each_cpu(&sys_tune_smp_call, NULL, 0, 1); -#else - on_each_cpu(&sys_tune_smp_call, NULL, 1); -#endif -} -#else -static void sys_tune_refresh(void) -{ - /* The current thread is executing on the one and only CPU so - * the idle thread isn't running. 
*/ -} -#endif - - - -static int sys_tune_cpu_group(int cpu) -{ -#ifdef CONFIG_SMP - const cpumask_t *mask; - int other_cpu; - int group; - -#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP) - /* Keep one hyperthread busy per core. */ - mask = topology_thread_cpumask(cpu); -#else - return cpu; -#endif - for_each_cpu_mask(cpu, *(mask)) { - group = per_cpu(idle_cpu_group, other_cpu); - if (group != NR_CPUS) - return group; - } -#endif - - return cpu; -} - - -static void sys_tune_add_cpu(int cpu) -{ - int group; - - /* Do nothing if this CPU has already been added. */ - if (per_cpu(idle_cpu_group, cpu) != NR_CPUS) - return; - - group = sys_tune_cpu_group(cpu); - per_cpu(idle_cpu_group, cpu) = group; - atomic_inc(&(busy_cpu_count[group])); - -} - -static void sys_tune_del_cpu(int cpu) -{ - - int group; - - if (per_cpu(idle_cpu_group, cpu) == NR_CPUS) - return; - - group = per_cpu(idle_cpu_group, cpu); - /* If the CPU was busy, this can cause the count to drop to - * zero. To rectify this, we need to cause one of the other - * CPUs in the group to exit the idle loop. If the CPU was - * not busy then this causes the contribution for this CPU to - * go to -1 which can cause the overall count to drop to zero - * or go negative. To rectify this situation we need to cause - * this CPU to exit the idle loop. */ - atomic_dec(&(busy_cpu_count[group])); - per_cpu(idle_cpu_group, cpu) = NR_CPUS; - -} - - -static int sys_tune_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch(action) { -#ifdef CPU_ONLINE_FROZEN - case CPU_ONLINE_FROZEN: -#endif - case CPU_ONLINE: - mutex_lock(&sys_tune_startup_mutex); - sys_tune_add_cpu(cpu); - mutex_unlock(&sys_tune_startup_mutex); - /* The CPU might have already entered the idle loop in - * the wrong group. Make sure it exits the idle loop - * so that it picks up the correct group. */ - sys_tune_refresh(); - break; - -#ifdef CPU_DEAD_FROZEN - case CPU_DEAD_FROZEN: -#endif - case CPU_DEAD: - mutex_lock(&sys_tune_startup_mutex); - sys_tune_del_cpu(cpu); - mutex_unlock(&sys_tune_startup_mutex); - /* The deleted CPU may have been the only busy CPU in - * the group. Make sure one of the other CPUs in the - * group exits the idle loop. */ - sys_tune_refresh(); - break; - } - return NOTIFY_OK; -} - - -static struct notifier_block sys_tune_cpu_nb = { - .notifier_call = sys_tune_cpu_notify, -}; - - -static void sys_tune_ensure_init(void) -{ - BUG_ON (old_pm_idle != NULL); - - /* Atomically update pm_idle to &sys_tune_pm_idle. The old value - * is stored in old_pm_idle before installing the new - * handler. */ - do { - old_pm_idle = pm_idle; - } while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) != - old_pm_idle); -} -#endif - -void sys_tune_fini(void) -{ -#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) - void (*old)(void); - int cpu; - - unregister_cpu_notifier(&sys_tune_cpu_nb); - - mutex_lock(&sys_tune_startup_mutex); - - - old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle); - - for_each_online_cpu(cpu) - sys_tune_del_cpu(cpu); - - mutex_unlock(&sys_tune_startup_mutex); - - /* Our handler may still be executing on other CPUs. - * Schedule this thread on all CPUs to make sure all - * idle threads get interrupted. */ - sys_tune_refresh(); - - /* Make sure the work item has finished executing on all CPUs. - * This in turn ensures that all idle threads have been - * interrupted. 
*/ - flush_scheduled_work(); -#endif /* CONFIG_X86 */ -} - -void sys_tune_init(void) -{ -#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) - int cpu; - - for_each_possible_cpu(cpu) { - INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu), - sys_tune_work_func); - } - - /* Start by registering the handler to ensure we don't miss - * any updates. */ - register_cpu_notifier(&sys_tune_cpu_nb); - - mutex_lock(&sys_tune_startup_mutex); - - for_each_online_cpu(cpu) - sys_tune_add_cpu(cpu); - - sys_tune_ensure_init(); - - - mutex_unlock(&sys_tune_startup_mutex); - - /* Ensure our idle handler starts to run. */ - sys_tune_refresh(); -#endif -} - diff --git a/sys/dev/mlx4/mlx4_en/en.h b/sys/dev/mlx4/mlx4_en/en.h index 5e4b4fa60841..ae6e1821014a 100644 --- a/sys/dev/mlx4/mlx4_en/en.h +++ b/sys/dev/mlx4/mlx4_en/en.h @@ -75,6 +75,7 @@ #define MIN_RX_RINGS 4 #define TXBB_SIZE 64 #define HEADROOM (2048 / TXBB_SIZE + 1) +#define INIT_OWNER_BIT 0xffffffff #define STAMP_STRIDE 64 #define STAMP_DWORDS (STAMP_STRIDE / 4) #define STAMP_SHIFT 31 @@ -131,11 +132,13 @@ enum mlx4_en_alloc_type { #define MAX_TX_RINGS (MLX4_EN_MAX_TX_RING_P_UP * \ MLX4_EN_NUM_UP) +#define MLX4_EN_NO_VLAN 0xffff + #define MLX4_EN_DEF_TX_RING_SIZE 1024 #define MLX4_EN_DEF_RX_RING_SIZE 1024 /* Target number of bytes to coalesce with interrupt moderation */ -#define MLX4_EN_RX_COAL_TARGET 0x20000 +#define MLX4_EN_RX_COAL_TARGET 44 #define MLX4_EN_RX_COAL_TIME 0x10 #define MLX4_EN_TX_COAL_PKTS 64 @@ -192,6 +195,13 @@ enum mlx4_en_alloc_type { #define GET_AVG_PERF_COUNTER(cnt) (0) #endif /* MLX4_EN_PERF_STAT */ +/* Constants for TX flow */ +enum { + MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ + MAX_BF = 256, + MIN_PKT_LEN = 17, +}; + /* * Configurables */ @@ -264,7 +274,6 @@ struct mlx4_en_tx_ring { int blocked; struct mlx4_en_tx_info *tx_info; u8 queue_index; - cpuset_t affinity_mask; struct buf_ring *br; u32 last_nr_txbb; struct mlx4_qp qp; @@ -272,14 +281,14 @@ struct mlx4_en_tx_ring { int qpn; enum mlx4_qp_state qp_state; struct mlx4_srq dummy; - unsigned long bytes; - unsigned long packets; - unsigned long tx_csum; - unsigned long queue_stopped; - unsigned long oversized_packets; - unsigned long wake_queue; - unsigned long tso_packets; - unsigned long defrag_attempts; + u64 bytes; + u64 packets; + u64 tx_csum; + u64 queue_stopped; + u64 oversized_packets; + u64 wake_queue; + u64 tso_packets; + u64 defrag_attempts; struct mlx4_bf bf; bool bf_enabled; int hwtstamp_tx_type; @@ -322,16 +331,16 @@ struct mlx4_en_rx_ring { int qpn; u8 *buf; struct mlx4_en_rx_mbuf *mbuf; - unsigned long errors; - unsigned long bytes; - unsigned long packets; + u64 errors; + u64 bytes; + u64 packets; #ifdef LL_EXTENDED_STATS - unsigned long yields; - unsigned long misses; - unsigned long cleaned; + u64 yields; + u64 misses; + u64 cleaned; #endif - unsigned long csum_ok; - unsigned long csum_none; + u64 csum_ok; + u64 csum_none; int hwtstamp_rx_filter; int numa_node; struct lro_ctrl lro; @@ -385,14 +394,14 @@ struct mlx4_en_cq { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int state; -#define MLX4_EN_CQ_STATEIDLE 0 -#define MLX4_EN_CQ_STATENAPI 1 /* NAPI owns this CQ */ -#define MLX4_EN_CQ_STATEPOLL 2 /* poll owns this CQ */ -#define MLX4_CQ_LOCKED (MLX4_EN_CQ_STATENAPI | MLX4_EN_CQ_STATEPOLL) -#define MLX4_EN_CQ_STATENAPI_YIELD 4 /* NAPI yielded this CQ */ -#define MLX4_EN_CQ_STATEPOLL_YIELD 8 /* poll yielded this CQ */ -#define CQ_YIELD (MLX4_EN_CQ_STATENAPI_YIELD | MLX4_EN_CQ_STATEPOLL_YIELD) -#define CQ_USER_PEND (MLX4_EN_CQ_STATEPOLL | MLX4_EN_CQ_STATEPOLL_YIELD) 
+#define MLX4_EN_CQ_STATE_IDLE 0 +#define MLX4_EN_CQ_STATE_NAPI 1 /* NAPI owns this CQ */ +#define MLX4_EN_CQ_STATE_POLL 2 /* poll owns this CQ */ +#define MLX4_CQ_LOCKED (MLX4_EN_CQ_STATE_NAPI | MLX4_EN_CQ_STATE_POLL) +#define MLX4_EN_CQ_STATE_NAPI_YIELD 4 /* NAPI yielded this CQ */ +#define MLX4_EN_CQ_STATE_POLL_YIELD 8 /* poll yielded this CQ */ +#define CQ_YIELD (MLX4_EN_CQ_STATE_NAPI_YIELD | MLX4_EN_CQ_STATE_POLL_YIELD) +#define CQ_USER_PEND (MLX4_EN_CQ_STATE_POLL | MLX4_EN_CQ_STATE_POLL_YIELD) spinlock_t poll_lock; /* protects from LLS/napi conflicts */ #endif /* CONFIG_NET_RX_BUSY_POLL */ }; @@ -408,6 +417,7 @@ struct mlx4_en_port_profile { u8 tx_pause; u8 tx_ppp; int rss_rings; + int inline_thold; }; struct mlx4_en_profile { @@ -451,24 +461,30 @@ struct mlx4_en_rss_map { enum mlx4_qp_state indir_state; }; +enum mlx4_en_port_flag { + MLX4_EN_PORT_ANC = 1<<0, /* Auto-negotiation complete */ + MLX4_EN_PORT_ANE = 1<<1, /* Auto-negotiation enabled */ +}; + struct mlx4_en_port_state { int link_state; int link_speed; - int transciver; - int autoneg; + int transceiver; + u32 flags; }; -enum mlx4_en_mclist_act { - MCLIST_NONE, - MCLIST_REM, - MCLIST_ADD, +enum mlx4_en_addr_list_act { + MLX4_ADDR_LIST_NONE, + MLX4_ADDR_LIST_REM, + MLX4_ADDR_LIST_ADD, }; -struct mlx4_en_mc_list { +struct mlx4_en_addr_list { struct list_head list; - enum mlx4_en_mclist_act action; + enum mlx4_en_addr_list_act action; u8 addr[ETH_ALEN]; u64 reg_id; + u64 tunnel_reg_id; }; #ifdef CONFIG_MLX4_EN_DCB @@ -476,6 +492,7 @@ struct mlx4_en_mc_list { #define MLX4_EN_BW_MIN 1 #define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */ +#define MLX4_EN_TC_VENDOR 0 #define MLX4_EN_TC_ETS 7 #endif @@ -541,6 +558,7 @@ struct mlx4_en_priv { bool port_up; int port; int registered; + int gone; int allocated; int stride; unsigned char current_mac[ETH_ALEN + 2]; @@ -570,12 +588,17 @@ struct mlx4_en_priv { struct mlx4_en_perf_stats pstats; struct mlx4_en_pkt_stats pkstats; struct mlx4_en_pkt_stats pkstats_last; - struct mlx4_en_flow_stats flowstats[MLX4_NUM_PRIORITIES]; + struct mlx4_en_flow_stats_rx rx_priority_flowstats[MLX4_NUM_PRIORITIES]; + struct mlx4_en_flow_stats_tx tx_priority_flowstats[MLX4_NUM_PRIORITIES]; + struct mlx4_en_flow_stats_rx rx_flowstats; + struct mlx4_en_flow_stats_tx tx_flowstats; struct mlx4_en_port_stats port_stats; struct mlx4_en_vport_stats vport_stats; struct mlx4_en_vf_stats vf_stats; struct list_head mc_list; - struct list_head curr_list; + struct list_head uc_list; + struct list_head curr_mc_list; + struct list_head curr_uc_list; u64 broadcast_id; struct mlx4_en_stat_out_mbox hw_stats; int vids[128]; @@ -592,8 +615,6 @@ struct mlx4_en_priv { struct sysctl_oid *stat_sysctl; struct sysctl_ctx_list conf_ctx; struct sysctl_ctx_list stat_ctx; -#define MLX4_EN_MAC_HASH_IDX 5 - struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE]; #ifdef CONFIG_MLX4_EN_DCB struct ieee_ets ets; @@ -606,6 +627,7 @@ struct mlx4_en_priv { struct list_head filters; struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT]; #endif + u64 tunnel_reg_id; struct en_port *vf_ports[MLX4_MAX_NUM_VF]; unsigned long last_ifq_jiffies; u64 if_counters_rx_errors; @@ -623,11 +645,16 @@ struct mlx4_mac_entry { u64 reg_id; }; +static inline struct mlx4_cqe *mlx4_en_get_cqe(u8 *buf, int idx, int cqe_sz) +{ + return (struct mlx4_cqe *)(buf + idx * cqe_sz); +} + #ifdef CONFIG_NET_RX_BUSY_POLL static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq) { spin_lock_init(&cq->poll_lock); - cq->state = MLX4_EN_CQ_STATEIDLE; + cq->state = 
MLX4_EN_CQ_STATE_IDLE; } /* called from the device poll rutine to get ownership of a cq */ @@ -636,12 +663,12 @@ static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq) int rc = true; spin_lock(&cq->poll_lock); if (cq->state & MLX4_CQ_LOCKED) { - WARN_ON(cq->state & MLX4_EN_CQ_STATENAPI); - cq->state |= MLX4_EN_CQ_STATENAPI_YIELD; + WARN_ON(cq->state & MLX4_EN_CQ_STATE_NAPI); + cq->state |= MLX4_EN_CQ_STATE_NAPI_YIELD; rc = false; } else /* we don't care if someone yielded */ - cq->state = MLX4_EN_CQ_STATENAPI; + cq->state = MLX4_EN_CQ_STATE_NAPI; spin_unlock(&cq->poll_lock); return rc; } @@ -651,12 +678,12 @@ static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq) { int rc = false; spin_lock(&cq->poll_lock); - WARN_ON(cq->state & (MLX4_EN_CQ_STATEPOLL | - MLX4_EN_CQ_STATENAPI_YIELD)); + WARN_ON(cq->state & (MLX4_EN_CQ_STATE_POLL | + MLX4_EN_CQ_STATE_NAPI_YIELD)); - if (cq->state & MLX4_EN_CQ_STATEPOLL_YIELD) + if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD) rc = true; - cq->state = MLX4_EN_CQ_STATEIDLE; + cq->state = MLX4_EN_CQ_STATE_IDLE; spin_unlock(&cq->poll_lock); return rc; } @@ -671,14 +698,14 @@ static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq) struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring]; - cq->state |= MLX4_EN_CQ_STATEPOLL_YIELD; + cq->state |= MLX4_EN_CQ_STATE_POLL_YIELD; rc = false; #ifdef LL_EXTENDED_STATS rx_ring->yields++; #endif } else /* preserve yield marks */ - cq->state |= MLX4_EN_CQ_STATEPOLL; + cq->state |= MLX4_EN_CQ_STATE_POLL; spin_unlock_bh(&cq->poll_lock); return rc; } @@ -688,17 +715,17 @@ static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq) { int rc = false; spin_lock_bh(&cq->poll_lock); - WARN_ON(cq->state & (MLX4_EN_CQ_STATENAPI)); + WARN_ON(cq->state & (MLX4_EN_CQ_STATE_NAPI)); - if (cq->state & MLX4_EN_CQ_STATEPOLL_YIELD) + if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD) rc = true; - cq->state = MLX4_EN_CQ_STATEIDLE; + cq->state = MLX4_EN_CQ_STATE_IDLE; spin_unlock_bh(&cq->poll_lock); return rc; } /* true if a socket is polling, even if it did not get the lock */ -static inline bool mlx4_en_cq_ll_polling(struct mlx4_en_cq *cq) +static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq) { WARN_ON(!(cq->state & MLX4_CQ_LOCKED)); return cq->state & CQ_USER_PEND; @@ -728,7 +755,7 @@ static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq) return false; } -static inline bool mlx4_en_cq_ll_polling(struct mlx4_en_cq *cq) +static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq) { return false; } @@ -770,6 +797,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, int cq, int user_prio); void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring); +void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev); void mlx4_en_qflush(struct ifnet *dev); int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, @@ -883,10 +911,10 @@ enum { { \ if ((priv)->registered) \ printk(level "%s: %s: " format, DRV_NAME, \ - (priv->dev)->if_xname, ## arg); \ + (priv)->dev->if_xname, ## arg); \ else \ printk(level "%s: %s: Port %d: " format, \ - DRV_NAME, dev_name(&priv->mdev->pdev->dev), \ + DRV_NAME, dev_name(&(priv)->mdev->pdev->dev), \ (priv)->port, ## arg); \ } @@ -905,12 +933,12 @@ do { \ #define mlx4_err(mdev, format, arg...) \ pr_err("%s %s: " format, DRV_NAME, \ - dev_name(&mdev->pdev->dev), ##arg) + dev_name(&(mdev)->pdev->dev), ##arg) #define mlx4_info(mdev, format, arg...) 
\ pr_info("%s %s: " format, DRV_NAME, \ - dev_name(&mdev->pdev->dev), ##arg) + dev_name(&(mdev)->pdev->dev), ##arg) #define mlx4_warn(mdev, format, arg...) \ pr_warning("%s %s: " format, DRV_NAME, \ - dev_name(&mdev->pdev->dev), ##arg) + dev_name(&(mdev)->pdev->dev), ##arg) #endif diff --git a/sys/dev/mlx4/mlx4_en/en_port.h b/sys/dev/mlx4/mlx4_en/en_port.h index 6301717e1814..59b9a2598f34 100644 --- a/sys/dev/mlx4/mlx4_en/en_port.h +++ b/sys/dev/mlx4/mlx4_en/en_port.h @@ -53,7 +53,30 @@ enum { MLX4_MCAST_ENABLE = 2, }; +enum mlx4_link_mode { + MLX4_1000BASE_CX_SGMII = 0, + MLX4_1000BASE_KX = 1, + MLX4_10GBASE_CX4 = 2, + MLX4_10GBASE_KX4 = 3, + MLX4_10GBASE_KR = 4, + MLX4_20GBASE_KR2 = 5, + MLX4_40GBASE_CR4 = 6, + MLX4_40GBASE_KR4 = 7, + MLX4_56GBASE_KR4 = 8, + MLX4_10GBASE_CR = 12, + MLX4_10GBASE_SR = 13, + MLX4_40GBASE_SR4 = 15, + MLX4_56GBASE_CR4 = 17, + MLX4_56GBASE_SR4 = 18, + MLX4_100BASE_TX = 24, + MLX4_1000BASE_T = 25, + MLX4_10GBASE_T = 26, +}; + +#define MLX4_PROT_MASK(link_mode) (1<<(link_mode)) + enum { + MLX4_EN_100M_SPEED = 0x04, MLX4_EN_10G_SPEED_XAUI = 0x00, MLX4_EN_10G_SPEED_XFI = 0x01, MLX4_EN_1G_SPEED = 0x02, @@ -66,12 +89,13 @@ enum { struct mlx4_en_query_port_context { u8 link_up; #define MLX4_EN_LINK_UP_MASK 0x80 +#define MLX4_EN_ANC_MASK 0x40 u8 autoneg; #define MLX4_EN_AUTONEG_MASK 0x80 __be16 mtu; u8 reserved2; u8 link_speed; -#define MLX4_EN_SPEED_MASK 0x6b +#define MLX4_EN_SPEED_MASK 0x6f u16 reserved3[5]; __be64 mac; u8 transceiver; @@ -559,5 +583,4 @@ struct mlx4_en_stat_out_mbox { __be32 TDROP; }; - #endif diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_cq.c b/sys/dev/mlx4/mlx4_en/mlx4_en_cq.c index febef4bebc0e..f19ce2f1e1ef 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_cq.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_cq.c @@ -31,13 +31,14 @@ * */ +#include + #include #include #include #include "en.h" - static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event) { return; @@ -53,11 +54,11 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq; int err; - cq = kzalloc_node(sizeof(struct mlx4_en_cq), GFP_KERNEL, node); + cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, node); if (!cq) { - cq = kzalloc(sizeof(struct mlx4_en_cq), GFP_KERNEL); + cq = kzalloc(sizeof(*cq), GFP_KERNEL); if (!cq) { - en_err(priv, "Failed to allocate CW struture\n"); + en_err(priv, "Failed to allocate CQ structure\n"); return -ENOMEM; } } @@ -80,6 +81,7 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, cq->ring = ring; cq->is_tx = mode; + cq->vector = mdev->dev->caps.num_comp_vectors; spin_lock_init(&cq->lock); err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres, @@ -91,7 +93,7 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, if (err) goto err_res; - cq->buf = (struct mlx4_cqe *) cq->wqres.buf.direct.buf; + cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf; *pcq = cq; return 0; @@ -100,17 +102,17 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); err_cq: kfree(cq); + *pcq = NULL; return err; } - int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, int cq_idx) { struct mlx4_en_dev *mdev = priv->mdev; int err = 0; - char name[25]; int timestamp_en = 0; + bool assigned_eq = false; cq->dev = mdev->pndev[priv->port]; cq->mcq.set_ci_db = cq->wqres.db.db; @@ -120,22 +122,19 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, memset(cq->buf, 0, cq->buf_size); if (cq->is_tx == RX) { - if (mdev->dev->caps.comp_pool) { - if (!cq->vector) { - sprintf(name, "%s-%d", if_name(priv->dev), - 
cq->ring); - /* Set IRQ for specific name (per ring) */ - if (mlx4_assign_eq(mdev->dev, name, &cq->vector)) { - cq->vector = (cq->ring + 1 + priv->port) - % mdev->dev->caps.num_comp_vectors; - mlx4_warn(mdev, "Failed Assigning an EQ to " - "%s ,Falling back to legacy EQ's\n", - name); - } + if (!mlx4_is_eq_vector_valid(mdev->dev, priv->port, + cq->vector)) { + cq->vector = cq_idx % mdev->dev->caps.num_comp_vectors; + + err = mlx4_assign_eq(mdev->dev, priv->port, + &cq->vector); + if (err) { + mlx4_err(mdev, "Failed assigning an EQ to CQ vector %d\n", + cq->vector); + goto free_eq; } - } else { - cq->vector = (cq->ring + 1 + priv->port) % - mdev->dev->caps.num_comp_vectors; + + assigned_eq = true; } } else { struct mlx4_en_cq *rx_cq; @@ -150,11 +149,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, if (!cq->is_tx) cq->size = priv->rx_ring[cq->ring]->actual_size; + err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq, cq->vector, 0, timestamp_en); if (err) - return err; + goto free_eq; cq->mcq.comp = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq; cq->mcq.event = mlx4_en_cq_event; @@ -167,6 +167,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, return 0; + +free_eq: + if (assigned_eq) + mlx4_release_eq(mdev->dev, cq->vector); + cq->vector = mdev->dev->caps.num_comp_vectors; + return err; } void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq) @@ -178,24 +184,28 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq) taskqueue_free(cq->tq); mlx4_en_unmap_buffer(&cq->wqres.buf); mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); - if (priv->mdev->dev->caps.comp_pool && cq->vector) + if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) && + cq->is_tx == RX) mlx4_release_eq(priv->mdev->dev, cq->vector); + cq->vector = 0; + cq->buf_size = 0; + cq->buf = NULL; kfree(cq); *pcq = NULL; } void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) { - struct mlx4_en_dev *mdev = priv->mdev; + taskqueue_drain(cq->tq, &cq->cq_task); + if (!cq->is_tx) { + synchronize_rcu(); + } else { + del_timer_sync(&cq->timer); + } - taskqueue_drain(cq->tq, &cq->cq_task); - if (cq->is_tx) - del_timer_sync(&cq->timer); - - mlx4_cq_free(mdev->dev, &cq->mcq); + mlx4_cq_free(priv->mdev->dev, &cq->mcq); } - /* Set rx cq moderation parameters */ int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) { diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_main.c b/sys/dev/mlx4/mlx4_en/mlx4_en_main.c index bacef4ada6bc..3f57a038ec53 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_main.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_main.c @@ -58,7 +58,7 @@ /* Enable RSS UDP traffic */ MLX4_EN_PARM_INT(udp_rss, 1, - "Enable RSS for incoming UDP traffic"); + "Enable RSS for incoming UDP traffic or disabled (0)"); /* Priority pausing */ MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." @@ -66,9 +66,11 @@ MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." 
" Per priority bit mask"); -#define MAX_PFC_TX 0xff -#define MAX_PFC_RX 0xff +MLX4_EN_PARM_INT(inline_thold, MAX_INLINE, + "Threshold for using inline data (range: 17-104, default: 104)"); +#define MAX_PFC_TX 0xff +#define MAX_PFC_RX 0xff static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) { @@ -93,6 +95,7 @@ static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) params->prof[i].tx_ring_num = params->num_tx_rings_p_up * MLX4_EN_NUM_UP; params->prof[i].rss_rings = 0; + params->prof[i].inline_thold = inline_thold; } return 0; @@ -142,7 +145,7 @@ static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) { struct mlx4_en_dev *mdev = endev_ptr; - int i, ret; + int i; mutex_lock(&mdev->state_lock); mdev->device_up = false; @@ -154,28 +157,34 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) flush_workqueue(mdev->workqueue); destroy_workqueue(mdev->workqueue); - ret = mlx4_mr_free(dev, &mdev->mr); - if (ret) - mlx4_err(mdev, "Error deregistering MR. The system may have become unstable."); + (void) mlx4_mr_free(dev, &mdev->mr); iounmap(mdev->uar_map); mlx4_uar_free(dev, &mdev->priv_uar); mlx4_pd_free(dev, mdev->priv_pdn); kfree(mdev); } +static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx) +{ + int i; + struct mlx4_en_dev *mdev = ctx; + + /* Create a netdev for each port */ + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + mlx4_info(mdev, "Activating port:%d\n", i); + if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) + mdev->pndev[i] = NULL; + } +} + static void *mlx4_en_add(struct mlx4_dev *dev) { struct mlx4_en_dev *mdev; int i; - int err; - mdev = kzalloc(sizeof *mdev, GFP_KERNEL); - if (!mdev) { - dev_err(&dev->pdev->dev, "Device struct alloc failed, " - "aborting.\n"); - err = -ENOMEM; + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) goto err_free_res; - } if (mlx4_pd_alloc(dev, &mdev->priv_pdn)) goto err_free_dev; @@ -190,8 +199,8 @@ static void *mlx4_en_add(struct mlx4_dev *dev) spin_lock_init(&mdev->uar_lock); mdev->dev = dev; - mdev->dma_device = &(dev->pdev->dev); - mdev->pdev = dev->pdev; + mdev->dma_device = &dev->persist->pdev->dev; + mdev->pdev = dev->persist->pdev; mdev->device_up = false; mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); @@ -211,9 +220,8 @@ static void *mlx4_en_add(struct mlx4_dev *dev) } /* Build device profile according to supplied module parameters */ - err = mlx4_en_get_profile(mdev); - if (err) { - mlx4_err(mdev, "Bad module parameters, aborting.\n"); + if (mlx4_en_get_profile(mdev)) { + mlx4_err(mdev, "Bad module parameters, aborting\n"); goto err_mr; } @@ -222,50 +230,25 @@ static void *mlx4_en_add(struct mlx4_dev *dev) mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) mdev->port_cnt++; - - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { - if (!dev->caps.comp_pool) { - mdev->profile.prof[i].rx_ring_num = - rounddown_pow_of_two(max_t(int, MIN_RX_RINGS, - min_t(int, - dev->caps.num_comp_vectors, - DEF_RX_RINGS))); - } else { - mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two( - min_t(int, dev->caps.comp_pool / - dev->caps.num_ports, MAX_MSIX_P_PORT)); - } - } + /* Set default number of RX rings*/ + mlx4_en_set_num_rx_rings(mdev); /* Create our own workqueue for reset/multicast tasks * Note: we cannot use the shared workqueue because of deadlocks caused * by the rtnl lock */ mdev->workqueue = create_singlethread_workqueue("mlx4_en"); - if (!mdev->workqueue) { - err = -ENOMEM; + if (!mdev->workqueue) goto err_mr; - } /* At this 
stage all non-port specific tasks are complete: * mark the card state as up */ mutex_init(&mdev->state_lock); mdev->device_up = true; - /* Setup ports */ - - /* Create a netdev for each port */ - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { - mlx4_info(mdev, "Activating port:%d\n", i); - if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) - mdev->pndev[i] = NULL; - } - return mdev; err_mr: - err = mlx4_mr_free(dev, &mdev->mr); - if (err) - mlx4_err(mdev, "Error deregistering MR. The system may have become unstable."); + (void) mlx4_mr_free(dev, &mdev->mr); err_map: if (mdev->uar_map) iounmap(mdev->uar_map); @@ -285,45 +268,40 @@ static struct mlx4_interface mlx4_en_interface = { .event = mlx4_en_event, .get_dev = mlx4_en_get_netdev, .protocol = MLX4_PROT_ETH, + .activate = mlx4_en_activate, }; static void mlx4_en_verify_params(void) { - if (pfctx > MAX_PFC_TX) { - pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - " - "should be in range 0-0x%x, will be changed to default (0)\n", - pfctx, MAX_PFC_TX); - pfctx = 0; - } + if (pfctx > MAX_PFC_TX) { + pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", + pfctx, MAX_PFC_TX); + pfctx = 0; + } - if (pfcrx > MAX_PFC_RX) { - pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - " - "should be in range 0-0x%x, will be changed to default (0)\n", - pfcrx, MAX_PFC_RX); - pfcrx = 0; - } + if (pfcrx > MAX_PFC_RX) { + pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", + pfcrx, MAX_PFC_RX); + pfcrx = 0; + } + + if (inline_thold < MIN_PKT_LEN || inline_thold > MAX_INLINE) { + pr_warn("mlx4_en: WARNING: illegal module parameter inline_thold %d - should be in range %d-%d, will be changed to default (%d)\n", + inline_thold, MIN_PKT_LEN, MAX_INLINE, MAX_INLINE); + inline_thold = MAX_INLINE; + } } - static int __init mlx4_en_init(void) { - mlx4_en_verify_params(); + mlx4_en_verify_params(); -#ifdef CONFIG_DEBUG_FS - int err = 0; - err = mlx4_en_register_debugfs(); - if (err) - pr_err(KERN_ERR "Failed to register debugfs\n"); -#endif return mlx4_register_interface(&mlx4_en_interface); } static void __exit mlx4_en_cleanup(void) { mlx4_unregister_interface(&mlx4_en_interface); -#ifdef CONFIG_DEBUG_FS - mlx4_en_unregister_debugfs(); -#endif } module_init(mlx4_en_init); diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c b/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c index c7f169aa549b..72fd57a7ba88 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c @@ -34,6 +34,7 @@ #include #include #include +#include #ifdef CONFIG_NET_RX_BUSY_POLL #include #endif @@ -73,7 +74,7 @@ static int mlx4_en_low_latency_recv(struct napi_struct *napi) done = mlx4_en_process_rx_cq(dev, cq, 4); #ifdef LL_EXTENDED_STATS - if (done) + if (likely(done)) rx_ring->cleaned += done; else rx_ring->misses++; @@ -118,7 +119,7 @@ static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto) case IPPROTO_TCP: return MLX4_NET_TRANS_RULE_ID_TCP; default: - return -EPROTONOSUPPORT; + return MLX4_NET_TRANS_RULE_NUM; } }; @@ -165,7 +166,7 @@ static void mlx4_en_filter_work(struct work_struct *work) int rc; __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); - if (spec_tcp_udp.id < 0) { + if (spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM) { en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n", filter->ip_proto); goto ignore; @@ -344,8 +345,7 @@ mlx4_en_filter_rfs(struct 
net_device *net_dev, const struct sk_buff *skb, return ret; } -void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv, - struct mlx4_en_rx_ring *rx_ring) +void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv) { struct mlx4_en_filter *filter, *tmp; LIST_HEAD(del_list); @@ -450,6 +450,25 @@ static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid) } +static int mlx4_en_tunnel_steer_add(struct mlx4_en_priv *priv, unsigned char *addr, + int qpn, u64 *reg_id) +{ + int err; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || + priv->mdev->dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn, + MLX4_DOMAIN_NIC, reg_id); + if (err) { + en_err(priv, "failed to add vxlan steering rule, err %d\n", err); + return err; + } + en_dbg(DRV, priv, "added vxlan steering rule, mac %pM reg_id %llx\n", addr, (long long)*reg_id); + return 0; +} + static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv, unsigned char *mac, int *qpn, u64 *reg_id) { @@ -533,10 +552,8 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_dev *dev = mdev->dev; - struct mlx4_mac_entry *entry; int index = 0; int err = 0; - u64 reg_id; int *qpn = &priv->base_qpn; u64 mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev)); @@ -556,39 +573,15 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv) return 0; } - err = mlx4_qp_reserve_range(dev, 1, 1, qpn, 0); + err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP); en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); if (err) { en_err(priv, "Failed to reserve qp for mac registration\n"); - goto qp_err; + mlx4_unregister_mac(dev, priv->port, mac); + return err; } - err = mlx4_en_uc_steer_add(priv, IF_LLADDR(priv->dev), qpn, ®_id); - if (err) - goto steer_err; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - err = -ENOMEM; - goto alloc_err; - } - memcpy(entry->mac, IF_LLADDR(priv->dev), sizeof(entry->mac)); - entry->reg_id = reg_id; - - hlist_add_head(&entry->hlist, - &priv->mac_hash[entry->mac[MLX4_EN_MAC_HASH_IDX]]); - return 0; - -alloc_err: - mlx4_en_uc_steer_release(priv, IF_LLADDR(priv->dev), *qpn, reg_id); - -steer_err: - mlx4_qp_release_range(dev, *qpn, 1); - -qp_err: - mlx4_unregister_mac(dev, priv->port, mac); - return err; } static void mlx4_en_put_qp(struct mlx4_en_priv *priv) @@ -596,34 +589,13 @@ static void mlx4_en_put_qp(struct mlx4_en_priv *priv) struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_dev *dev = mdev->dev; int qpn = priv->base_qpn; - u64 mac; if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { - mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev)); + u64 mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev)); en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n", IF_LLADDR(priv->dev)); mlx4_unregister_mac(dev, priv->port, mac); } else { - struct mlx4_mac_entry *entry; - struct hlist_node *tmp; - struct hlist_head *bucket; - unsigned int i; - - for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) { - bucket = &priv->mac_hash[i]; - hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { - mac = mlx4_mac_to_u64(entry->mac); - en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n", - entry->mac); - mlx4_en_uc_steer_release(priv, entry->mac, - qpn, entry->reg_id); - - mlx4_unregister_mac(dev, priv->port, mac); - hlist_del(&entry->hlist); - kfree(entry); - } - } - en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n", priv->port, qpn); 
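
The RFS hunk above changes mlx4_ip_proto_to_trans_rule_id() to report an unsupported IP protocol with the enum terminator MLX4_NET_TRANS_RULE_NUM rather than -EPROTONOSUPPORT, and the caller in mlx4_en_filter_work() now tests spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM instead of a sign check. The sketch below only illustrates that sentinel pattern with a reduced mock enum (the real values live in the mlx4 device headers); the likely motivation, stated here as an assumption, is that the rule id field is enum-typed, so an in-range sentinel compares more safely than a negative errno stored into it.

/*
 * Illustration only, not part of the patch: report "unsupported" with the
 * enum's own terminator instead of a negative errno.  The enum below is a
 * reduced mock, not the real mlx4 rule-id enum.
 */
enum mock_trans_rule_id {
	MOCK_RULE_ID_TCP,
	MOCK_RULE_ID_UDP,
	MOCK_RULE_NUM		/* sentinel: one past the last valid id */
};

static enum mock_trans_rule_id
mock_ip_proto_to_rule_id(unsigned char ip_proto)
{
	switch (ip_proto) {
	case 6:		/* IPPROTO_TCP */
		return MOCK_RULE_ID_TCP;
	case 17:	/* IPPROTO_UDP */
		return MOCK_RULE_ID_UDP;
	default:	/* unsupported: use the sentinel, not -errno */
		return MOCK_RULE_NUM;
	}
}

A caller then checks id >= MOCK_RULE_NUM, which is the same shape as the patched spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM test above.
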
mlx4_qp_release_range(dev, qpn, 1); @@ -631,10 +603,48 @@ static void mlx4_en_put_qp(struct mlx4_en_priv *priv) } } -static void mlx4_en_clear_list(struct net_device *dev) +static void mlx4_en_clear_uclist(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_mc_list *tmp, *mc_to_del; + struct mlx4_en_addr_list *tmp, *uc_to_del; + + list_for_each_entry_safe(uc_to_del, tmp, &priv->uc_list, list) { + list_del(&uc_to_del->list); + kfree(uc_to_del); + } +} + +static void mlx4_en_cache_uclist(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_addr_list *tmp; + struct ifaddr *ifa; + + mlx4_en_clear_uclist(dev); + + if_addr_rlock(dev); + TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_LINK) + continue; + if (((struct sockaddr_dl *)ifa->ifa_addr)->sdl_alen != + ETHER_ADDR_LEN) + continue; + tmp = kzalloc(sizeof(struct mlx4_en_addr_list), GFP_ATOMIC); + if (tmp == NULL) { + en_err(priv, "Failed to allocate address list\n"); + break; + } + memcpy(tmp->addr, + LLADDR((struct sockaddr_dl *)ifa->ifa_addr), ETH_ALEN); + list_add_tail(&tmp->list, &priv->uc_list); + } + if_addr_runlock(dev); +} + +static void mlx4_en_clear_mclist(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_addr_list *tmp, *mc_to_del; list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) { list_del(&mc_to_del->list); @@ -644,35 +654,36 @@ static void mlx4_en_clear_list(struct net_device *dev) static void mlx4_en_cache_mclist(struct net_device *dev) { - struct ifmultiaddr *ifma; - struct mlx4_en_mc_list *tmp; struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_addr_list *tmp; + struct ifmultiaddr *ifma; - if_maddr_rlock(dev); - TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { - if (ifma->ifma_addr->sa_family != AF_LINK) - continue; - if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen != - ETHER_ADDR_LEN) - continue; - /* Make sure the list didn't grow. 
*/ - tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC); + mlx4_en_clear_mclist(dev); + + if_maddr_rlock(dev); + TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen != + ETHER_ADDR_LEN) + continue; + tmp = kzalloc(sizeof(struct mlx4_en_addr_list), GFP_ATOMIC); if (tmp == NULL) { - en_err(priv, "Failed to allocate multicast list\n"); + en_err(priv, "Failed to allocate address list\n"); break; } memcpy(tmp->addr, LLADDR((struct sockaddr_dl *)ifma->ifma_addr), ETH_ALEN); list_add_tail(&tmp->list, &priv->mc_list); - } - if_maddr_runlock(dev); + } + if_maddr_runlock(dev); } -static void update_mclist_flags(struct mlx4_en_priv *priv, +static void update_addr_list_flags(struct mlx4_en_priv *priv, struct list_head *dst, struct list_head *src) { - struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc; + struct mlx4_en_addr_list *dst_tmp, *src_tmp, *new_mc; bool found; /* Find all the entries that should be removed from dst, @@ -687,7 +698,7 @@ static void update_mclist_flags(struct mlx4_en_priv *priv, } } if (!found) - dst_tmp->action = MCLIST_REM; + dst_tmp->action = MLX4_ADDR_LIST_REM; } /* Add entries that exist in src but not in dst @@ -697,21 +708,21 @@ static void update_mclist_flags(struct mlx4_en_priv *priv, found = false; list_for_each_entry(dst_tmp, dst, list) { if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) { - dst_tmp->action = MCLIST_NONE; + dst_tmp->action = MLX4_ADDR_LIST_NONE; found = true; break; } } if (!found) { - new_mc = kmalloc(sizeof(struct mlx4_en_mc_list), + new_mc = kmalloc(sizeof(struct mlx4_en_addr_list), GFP_KERNEL); if (!new_mc) { en_err(priv, "Failed to allocate current multicast list\n"); return; } memcpy(new_mc, src_tmp, - sizeof(struct mlx4_en_mc_list)); - new_mc->action = MCLIST_ADD; + sizeof(struct mlx4_en_addr_list)); + new_mc->action = MLX4_ADDR_LIST_ADD; list_add_tail(&new_mc->list, dst); } } @@ -731,6 +742,7 @@ static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv, struct mlx4_en_dev *mdev) { int err = 0; + if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) { priv->flags |= MLX4_EN_FLAG_PROMISC; @@ -833,7 +845,7 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, struct net_device *dev, struct mlx4_en_dev *mdev) { - struct mlx4_en_mc_list *mclist, *tmp; + struct mlx4_en_addr_list *addr_list, *tmp; u8 mc_list[16] = {0}; int err = 0; u64 mcast_addr = 0; @@ -893,6 +905,28 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; } + /* Update unicast list */ + mlx4_en_cache_uclist(dev); + + update_addr_list_flags(priv, &priv->curr_uc_list, &priv->uc_list); + + list_for_each_entry_safe(addr_list, tmp, &priv->curr_uc_list, list) { + if (addr_list->action == MLX4_ADDR_LIST_REM) { + mlx4_en_uc_steer_release(priv, addr_list->addr, + priv->rss_map.indir_qp.qpn, + addr_list->reg_id); + /* remove from list */ + list_del(&addr_list->list); + kfree(addr_list); + } else if (addr_list->action == MLX4_ADDR_LIST_ADD) { + err = mlx4_en_uc_steer_add(priv, addr_list->addr, + &priv->rss_map.indir_qp.qpn, + &addr_list->reg_id); + if (err) + en_err(priv, "Fail to add unicast address\n"); + } + } + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) @@ -905,8 +939,8 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, /* Update multicast list - we cache all addresses so they won't * change while HW is updated holding the command semaphor */ 
mlx4_en_cache_mclist(dev); - list_for_each_entry(mclist, &priv->mc_list, list) { - mcast_addr = mlx4_mac_to_u64(mclist->addr); + list_for_each_entry(addr_list, &priv->mc_list, list) { + mcast_addr = mlx4_mac_to_u64(addr_list->addr); mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, mcast_addr, 0, MLX4_MCAST_CONFIG); } @@ -915,26 +949,33 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, if (err) en_err(priv, "Failed enabling multicast filter\n"); - update_mclist_flags(priv, &priv->curr_list, &priv->mc_list); - list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) { - if (mclist->action == MCLIST_REM) { + update_addr_list_flags(priv, &priv->curr_mc_list, &priv->mc_list); + + list_for_each_entry_safe(addr_list, tmp, &priv->curr_mc_list, list) { + if (addr_list->action == MLX4_ADDR_LIST_REM) { /* detach this address and delete from list */ - memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + memcpy(&mc_list[10], addr_list->addr, ETH_ALEN); mc_list[5] = priv->port; err = mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, MLX4_PROT_ETH, - mclist->reg_id); + addr_list->reg_id); if (err) en_err(priv, "Fail to detach multicast address\n"); + if (addr_list->tunnel_reg_id) { + err = mlx4_flow_detach(priv->mdev->dev, addr_list->tunnel_reg_id); + if (err) + en_err(priv, "Failed to detach multicast address\n"); + } + /* remove from list */ - list_del(&mclist->list); - kfree(mclist); - } else if (mclist->action == MCLIST_ADD) { + list_del(&addr_list->list); + kfree(addr_list); + } else if (addr_list->action == MLX4_ADDR_LIST_ADD) { /* attach the address */ - memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + memcpy(&mc_list[10], addr_list->addr, ETH_ALEN); /* needed for B0 steering support */ mc_list[5] = priv->port; err = mlx4_multicast_attach(mdev->dev, @@ -942,10 +983,14 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, mc_list, priv->port, 0, MLX4_PROT_ETH, - &mclist->reg_id); + &addr_list->reg_id); if (err) en_err(priv, "Fail to attach multicast address\n"); + err = mlx4_en_tunnel_steer_add(priv, &mc_list[10], priv->base_qpn, + &addr_list->tunnel_reg_id); + if (err) + en_err(priv, "Failed to attach multicast address\n"); } } } @@ -958,7 +1003,6 @@ static void mlx4_en_do_set_rx_mode(struct work_struct *work) struct mlx4_en_dev *mdev = priv->mdev; struct net_device *dev = priv->dev; - mutex_lock(&mdev->state_lock); if (!mdev->device_up) { en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n"); @@ -998,24 +1042,6 @@ static void mlx4_en_do_set_rx_mode(struct work_struct *work) mutex_unlock(&mdev->state_lock); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void mlx4_en_netpoll(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_cq *cq; - unsigned long flags; - int i; - - for (i = 0; i < priv->rx_ring_num; i++) { - cq = priv->rx_cq[i]; - spin_lock_irqsave(&cq->lock, flags); - napi_synchronize(&cq->napi); - mlx4_en_process_rx_cq(dev, cq, 0); - spin_unlock_irqrestore(&cq->lock, flags); - } -} -#endif - static void mlx4_en_watchdog_timeout(void *arg) { struct mlx4_en_priv *priv = arg; @@ -1038,10 +1064,10 @@ static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) /* If we haven't received a specific coalescing setting * (module param), we set the moderation parameters as follows: * - moder_cnt is set to the number of mtu sized packets to - * satisfy our coelsing target. + * satisfy our coalescing target. * - moder_time is set to a fixed value. 
*/ - priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->if_mtu + 1; + priv->rx_frames = MLX4_EN_RX_COAL_TARGET; priv->rx_usecs = MLX4_EN_RX_COAL_TIME; priv->tx_frames = MLX4_EN_TX_COAL_PKTS; priv->tx_usecs = MLX4_EN_TX_COAL_TIME; @@ -1126,6 +1152,7 @@ static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) priv->last_moder_time[ring] = moder_time; cq = priv->rx_cq[ring]; cq->moder_time = moder_time; + cq->moder_cnt = priv->rx_frames; err = mlx4_en_set_cq_moder(priv, cq); if (err) en_err(priv, "Failed modifying moderation for cq:%d\n", @@ -1149,7 +1176,10 @@ static void mlx4_en_do_get_stats(struct work_struct *work) mutex_lock(&mdev->state_lock); if (mdev->device_up) { if (priv->port_up) { - err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); + if (mlx4_is_slave(mdev->dev)) + err = mlx4_en_get_vport_stats(mdev, priv->port); + else + err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); if (err) en_dbg(HW, priv, "Could not update stats\n"); @@ -1236,7 +1266,9 @@ int mlx4_en_start_port(struct net_device *dev) } INIT_LIST_HEAD(&priv->mc_list); - INIT_LIST_HEAD(&priv->curr_list); + INIT_LIST_HEAD(&priv->uc_list); + INIT_LIST_HEAD(&priv->curr_mc_list); + INIT_LIST_HEAD(&priv->curr_uc_list); INIT_LIST_HEAD(&priv->ethtool_list); /* Calculate Rx buf size */ @@ -1281,12 +1313,8 @@ int mlx4_en_start_port(struct net_device *dev) } mdev->mac_removed[priv->port] = 0; - /* gets default allocated counter index from func cap */ - /* or sink counter index if no resources */ - priv->counter_index = mdev->dev->caps.def_counter_index[priv->port - 1]; - - en_dbg(DRV, priv, "%s: default counter index %d for port %d\n", - __func__, priv->counter_index, priv->port); + priv->counter_index = + mlx4_get_default_counter_index(mdev->dev, priv->port); err = mlx4_en_config_rss_steer(priv); if (err) { @@ -1332,7 +1360,7 @@ int mlx4_en_start_port(struct net_device *dev) /* Set initial ownership of all Tx TXBBs to SW (1) */ for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE) - *((u32 *) (tx_ring->buf + j)) = 0xffffffff; + *((u32 *) (tx_ring->buf + j)) = INIT_OWNER_BIT; ++tx_index; } @@ -1415,7 +1443,7 @@ void mlx4_en_stop_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; - struct mlx4_en_mc_list *mclist, *tmp; + struct mlx4_en_addr_list *addr_list, *tmp; int i; u8 mc_list[16] = {0}; @@ -1433,10 +1461,7 @@ void mlx4_en_stop_port(struct net_device *dev) /* Set port as not active */ priv->port_up = false; - if (priv->counter_index != 0xff) { - mlx4_counter_free(mdev->dev, priv->port, priv->counter_index); - priv->counter_index = 0xff; - } + priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev); /* Promsicuous mode */ if (mdev->dev->caps.steering_mode == @@ -1464,21 +1489,33 @@ void mlx4_en_stop_port(struct net_device *dev) } } + /* Detach All unicasts */ + list_for_each_entry(addr_list, &priv->curr_uc_list, list) { + mlx4_en_uc_steer_release(priv, addr_list->addr, + priv->rss_map.indir_qp.qpn, + addr_list->reg_id); + } + mlx4_en_clear_uclist(dev); + list_for_each_entry_safe(addr_list, tmp, &priv->curr_uc_list, list) { + list_del(&addr_list->list); + kfree(addr_list); + } + /* Detach All multicasts */ memset(&mc_list[10], 0xff, ETH_ALEN); mc_list[5] = priv->port; /* needed for B0 steering support */ mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, MLX4_PROT_ETH, priv->broadcast_id); - list_for_each_entry(mclist, &priv->curr_list, list) { - memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + list_for_each_entry(addr_list, 
&priv->curr_mc_list, list) { + memcpy(&mc_list[10], addr_list->addr, ETH_ALEN); mc_list[5] = priv->port; mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, - mc_list, MLX4_PROT_ETH, mclist->reg_id); + mc_list, MLX4_PROT_ETH, addr_list->reg_id); } - mlx4_en_clear_list(dev); - list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) { - list_del(&mclist->list); - kfree(mclist); + mlx4_en_clear_mclist(dev); + list_for_each_entry_safe(addr_list, tmp, &priv->curr_mc_list, list) { + list_del(&addr_list->list); + kfree(addr_list); } /* Flush multicast filter */ @@ -1716,10 +1753,16 @@ void mlx4_en_destroy_netdev(struct net_device *dev) en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); - if (priv->vlan_attach != NULL) - EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); - if (priv->vlan_detach != NULL) - EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); + /* don't allow more IOCTLs */ + priv->gone = 1; + + /* XXX wait a bit to allow IOCTL handlers to complete */ + pause("W", hz); + + if (priv->vlan_attach != NULL) + EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); + if (priv->vlan_detach != NULL) + EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); /* Unregister device - this will close the port if it was up */ if (priv->registered) { @@ -1805,9 +1848,12 @@ static int mlx4_en_calc_media(struct mlx4_en_priv *priv) if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN) return (active); active |= IFM_FDX; - trans_type = priv->port_state.transciver; + trans_type = priv->port_state.transceiver; /* XXX I don't know all of the transceiver values. */ switch (priv->port_state.link_speed) { + case 100: + active |= IFM_100_T; + break; case 1000: active |= IFM_1000_T; break; @@ -1904,10 +1950,15 @@ static int mlx4_en_ioctl(struct ifnet *dev, u_long command, caddr_t data) error = 0; mask = 0; priv = dev->if_softc; + + /* check if detaching */ + if (priv == NULL || priv->gone != 0) + return (ENXIO); + mdev = priv->mdev; ifr = (struct ifreq *) data; - switch (command) { + switch (command) { case SIOCSIFMTU: error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu); break; @@ -2156,9 +2207,6 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, } #endif - for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) - INIT_HLIST_HEAD(&priv->mac_hash[i]); - /* Query for default mac and max mtu */ priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; priv->mac = mdev->dev->caps.def_mac[priv->port]; @@ -2602,7 +2650,6 @@ static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv) struct sysctl_oid *coal; struct sysctl_oid_list *coal_list; const char *pnameunit; - dev = priv->dev; ctx = &priv->conf_ctx; pnameunit = device_get_nameunit(priv->mdev->pdev->dev.bsddev); @@ -2641,7 +2688,6 @@ static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv) SYSCTL_ADD_STRING(ctx, node_list, OID_AUTO, "device_name", CTLFLAG_RD, __DECONST(void *, pnameunit), 0, "PCI device name"); - /* Add coalescer configuration. 
*/ coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration"); @@ -2700,124 +2746,123 @@ static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv) &priv->pstats.rx_coal_avg, "RX average coalesced completions"); #endif - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD, - &priv->port_stats.tso_packets, "TSO packets sent"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD, - &priv->port_stats.queue_stopped, "Queue full"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD, - &priv->port_stats.wake_queue, "Queue resumed after full"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD, - &priv->port_stats.tx_timeout, "Transmit timeouts"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_oversized_packets", CTLFLAG_RD, - &priv->port_stats.oversized_packets, "TX oversized packets, m_defrag failed"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD, - &priv->port_stats.rx_alloc_failed, "RX failed to allocate mbuf"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD, - &priv->port_stats.rx_chksum_good, "RX checksum offload success"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD, - &priv->port_stats.rx_chksum_none, "RX without checksum offload"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_chksum_offload", - CTLFLAG_RD, &priv->port_stats.tx_chksum_offload, + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD, + &priv->port_stats.tso_packets, 0, "TSO packets sent"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD, + &priv->port_stats.queue_stopped, 0, "Queue full"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD, + &priv->port_stats.wake_queue, 0, "Queue resumed after full"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD, + &priv->port_stats.tx_timeout, 0, "Transmit timeouts"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_oversized_packets", CTLFLAG_RD, + &priv->port_stats.oversized_packets, 0, "TX oversized packets, m_defrag failed"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD, + &priv->port_stats.rx_alloc_failed, 0, "RX failed to allocate mbuf"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD, + &priv->port_stats.rx_chksum_good, 0, "RX checksum offload success"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD, + &priv->port_stats.rx_chksum_none, 0, "RX without checksum offload"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_chksum_offload", + CTLFLAG_RD, &priv->port_stats.tx_chksum_offload, 0, "TX checksum offloads"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "defrag_attempts", CTLFLAG_RD, - &priv->port_stats.defrag_attempts, "Oversized chains defragged"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "defrag_attempts", + CTLFLAG_RD, &priv->port_stats.defrag_attempts, 0, + "Oversized chains defragged"); /* Could strdup the names and add in a loop. This is simpler. 
*/ - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, - &priv->pkstats.rx_bytes, "RX Bytes"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_packets", CTLFLAG_RD, - &priv->pkstats.rx_packets, "RX packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_multicast_packets", CTLFLAG_RD, - &priv->pkstats.rx_multicast_packets, "RX Multicast Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_broadcast_packets", CTLFLAG_RD, - &priv->pkstats.rx_broadcast_packets, "RX Broadcast Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_errors", CTLFLAG_RD, - &priv->pkstats.rx_errors, "RX Errors"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_dropped", CTLFLAG_RD, - &priv->pkstats.rx_dropped, "RX Dropped"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_length_errors", CTLFLAG_RD, - &priv->pkstats.rx_length_errors, "RX Length Errors"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_over_errors", CTLFLAG_RD, - &priv->pkstats.rx_over_errors, "RX Over Errors"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_crc_errors", CTLFLAG_RD, - &priv->pkstats.rx_crc_errors, "RX CRC Errors"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_jabbers", CTLFLAG_RD, - &priv->pkstats.rx_jabbers, "RX Jabbers"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, + &priv->pkstats.rx_bytes, 0, "RX Bytes"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_packets", CTLFLAG_RD, + &priv->pkstats.rx_packets, 0, "RX packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_multicast_packets", CTLFLAG_RD, + &priv->pkstats.rx_multicast_packets, 0, "RX Multicast Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_broadcast_packets", CTLFLAG_RD, + &priv->pkstats.rx_broadcast_packets, 0, "RX Broadcast Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_errors", CTLFLAG_RD, + &priv->pkstats.rx_errors, 0, "RX Errors"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_dropped", CTLFLAG_RD, + &priv->pkstats.rx_dropped, 0, "RX Dropped"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_length_errors", CTLFLAG_RD, + &priv->pkstats.rx_length_errors, 0, "RX Length Errors"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_over_errors", CTLFLAG_RD, + &priv->pkstats.rx_over_errors, 0, "RX Over Errors"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_crc_errors", CTLFLAG_RD, + &priv->pkstats.rx_crc_errors, 0, "RX CRC Errors"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_jabbers", CTLFLAG_RD, + &priv->pkstats.rx_jabbers, 0, "RX Jabbers"); - - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_in_range_length_error", CTLFLAG_RD, - &priv->pkstats.rx_in_range_length_error, "RX IN_Range Length Error"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_out_range_length_error", - CTLFLAG_RD, &priv->pkstats.rx_out_range_length_error, - "RX Out Range Length Error"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_lt_64_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_lt_64_bytes_packets, "RX Lt 64 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_127_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_127_bytes_packets, "RX 127 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_255_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_255_bytes_packets, "RX 255 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_511_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_511_bytes_packets, "RX 511 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1023_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_1023_bytes_packets, "RX 1023 bytes Packets"); - 
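
The surrounding hunks convert the per-port and per-ring statistics sysctls from SYSCTL_ADD_ULONG to SYSCTL_ADD_U64, matching counter fields that are now 64 bits wide; SYSCTL_ADD_U64 takes one extra constant-value argument (the 0 added in the new lines), which is unused when a valid pointer is supplied. A minimal sketch of that pattern follows, with made-up struct and field names.

/*
 * Minimal sketch of the sysctl pattern these hunks switch to; the struct
 * and names are illustrative only, not taken from the driver.
 */
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

struct example_stats {
	uint64_t	rx_packets;	/* 64-bit counter, no 32-bit wrap */
};

static void
example_attach_sysctl(struct sysctl_ctx_list *ctx,
    struct sysctl_oid_list *parent, struct example_stats *st)
{
	/* Read-only 64-bit export; the trailing 0 is the constant fallback
	 * value and is ignored here because a pointer is given. */
	SYSCTL_ADD_U64(ctx, parent, OID_AUTO, "rx_packets", CTLFLAG_RD,
	    &st->rx_packets, 0, "RX packets");
}
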
SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1518_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_1518_bytes_packets, "RX 1518 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1522_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_1522_bytes_packets, "RX 1522 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_1548_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_1548_bytes_packets, "RX 1548 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_gt_1548_bytes_packets", CTLFLAG_RD, - &priv->pkstats.rx_gt_1548_bytes_packets, + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_in_range_length_error", CTLFLAG_RD, + &priv->pkstats.rx_in_range_length_error, 0, "RX IN_Range Length Error"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_out_range_length_error", + CTLFLAG_RD, &priv->pkstats.rx_out_range_length_error, 0, + "RX Out Range Length Error"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_lt_64_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_lt_64_bytes_packets, 0, "RX Lt 64 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_127_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_127_bytes_packets, 0, "RX 127 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_255_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_255_bytes_packets, 0, "RX 255 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_511_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_511_bytes_packets, 0, "RX 511 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1023_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_1023_bytes_packets, 0, "RX 1023 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1518_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_1518_bytes_packets, 0, "RX 1518 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1522_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_1522_bytes_packets, 0, "RX 1522 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1548_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_1548_bytes_packets, 0, "RX 1548 bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_gt_1548_bytes_packets", CTLFLAG_RD, + &priv->pkstats.rx_gt_1548_bytes_packets, 0, "RX Greater Then 1548 bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_packets", CTLFLAG_RD, - &priv->pkstats.tx_packets, "TX packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, - &priv->pkstats.tx_bytes, "TX Bytes"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_multicast_packets", CTLFLAG_RD, - &priv->pkstats.tx_multicast_packets, "TX Multicast Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_broadcast_packets", CTLFLAG_RD, - &priv->pkstats.tx_broadcast_packets, "TX Broadcast Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_errors", CTLFLAG_RD, - &priv->pkstats.tx_errors, "TX Errors"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_dropped", CTLFLAG_RD, - &priv->pkstats.tx_dropped, "TX Dropped"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_lt_64_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_lt_64_bytes_packets, "TX Less Then 64 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_127_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_127_bytes_packets, "TX 127 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_255_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_255_bytes_packets, "TX 255 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_511_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_511_bytes_packets, 
"TX 511 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1023_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_1023_bytes_packets, "TX 1023 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1518_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_1518_bytes_packets, "TX 1518 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1522_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_1522_bytes_packets, "TX 1522 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_1548_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_1548_bytes_packets, "TX 1548 Bytes Packets"); - SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_gt_1548_bytes_packets", CTLFLAG_RD, - &priv->pkstats.tx_gt_1548_bytes_packets, + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_packets", CTLFLAG_RD, + &priv->pkstats.tx_packets, 0, "TX packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, + &priv->pkstats.tx_bytes, 0, "TX Bytes"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_multicast_packets", CTLFLAG_RD, + &priv->pkstats.tx_multicast_packets, 0, "TX Multicast Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_broadcast_packets", CTLFLAG_RD, + &priv->pkstats.tx_broadcast_packets, 0, "TX Broadcast Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_errors", CTLFLAG_RD, + &priv->pkstats.tx_errors, 0, "TX Errors"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_dropped", CTLFLAG_RD, + &priv->pkstats.tx_dropped, 0, "TX Dropped"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_lt_64_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_lt_64_bytes_packets, 0, "TX Less Then 64 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_127_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_127_bytes_packets, 0, "TX 127 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_255_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_255_bytes_packets, 0, "TX 255 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_511_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_511_bytes_packets, 0, "TX 511 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1023_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_1023_bytes_packets, 0, "TX 1023 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1518_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_1518_bytes_packets, 0, "TX 1518 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1522_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_1522_bytes_packets, 0, "TX 1522 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1548_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_1548_bytes_packets, 0, "TX 1548 Bytes Packets"); + SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_gt_1548_bytes_packets", CTLFLAG_RD, + &priv->pkstats.tx_gt_1548_bytes_packets, 0, "TX Greater Then 1548 Bytes Packets"); - - for (i = 0; i < priv->tx_ring_num; i++) { tx_ring = priv->tx_ring[i]; snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i); ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "TX Ring"); ring_list = SYSCTL_CHILDREN(ring_node); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", - CTLFLAG_RD, &tx_ring->packets, "TX packets"); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", - CTLFLAG_RD, &tx_ring->bytes, "TX bytes"); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "tso_packets", - CTLFLAG_RD, &tx_ring->tso_packets, "TSO packets"); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "defrag_attempts", - CTLFLAG_RD, &tx_ring->defrag_attempts, "Oversized chains 
defragged"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "packets", + CTLFLAG_RD, &tx_ring->packets, 0, "TX packets"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "bytes", + CTLFLAG_RD, &tx_ring->bytes, 0, "TX bytes"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "tso_packets", + CTLFLAG_RD, &tx_ring->tso_packets, 0, "TSO packets"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "defrag_attempts", + CTLFLAG_RD, &tx_ring->defrag_attempts, 0, + "Oversized chains defragged"); } for (i = 0; i < priv->rx_ring_num; i++) { @@ -2826,11 +2871,11 @@ static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv) ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "RX Ring"); ring_list = SYSCTL_CHILDREN(ring_node); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", - CTLFLAG_RD, &rx_ring->packets, "RX packets"); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", - CTLFLAG_RD, &rx_ring->bytes, "RX bytes"); - SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error", - CTLFLAG_RD, &rx_ring->errors, "RX soft errors"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "packets", + CTLFLAG_RD, &rx_ring->packets, 0, "RX packets"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "bytes", + CTLFLAG_RD, &rx_ring->bytes, 0, "RX bytes"); + SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "error", + CTLFLAG_RD, &rx_ring->errors, 0, "RX soft errors"); } } diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_port.c b/sys/dev/mlx4/mlx4_en/mlx4_en_port.c index 07fcd6b0b4c3..4f3b29b5b296 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_port.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_port.c @@ -39,7 +39,6 @@ #include "en_port.h" #include "en.h" -#define EN_IFQ_MIN_INTERVAL 3000 int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv) @@ -57,7 +56,6 @@ int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv) return PTR_ERR(mailbox); filter = mailbox->buf; - memset(filter, 0, sizeof(*filter)); for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { entry = 0; for (j = 0; j < 32; j++) { @@ -84,7 +82,6 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, sizeof(*qport_context)); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); @@ -96,6 +93,9 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) * already synchronized, no need in locking */ state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK); switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) { + case MLX4_EN_100M_SPEED: + state->link_speed = 100; + break; case MLX4_EN_1G_SPEED: state->link_speed = 1000; break; @@ -116,14 +116,39 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) state->link_speed = -1; break; } - state->transciver = qport_context->transceiver; - state->autoneg = !!(qport_context->autoneg & MLX4_EN_AUTONEG_MASK); + + state->transceiver = qport_context->transceiver; + + state->flags = 0; /* Reset and recalculate the port flags */ + state->flags |= (qport_context->link_up & MLX4_EN_ANC_MASK) ? + MLX4_EN_PORT_ANC : 0; + state->flags |= (qport_context->autoneg & MLX4_EN_AUTONEG_MASK) ? + MLX4_EN_PORT_ANE : 0; out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } +/* Each counter set is located in struct mlx4_en_stat_out_mbox + * with a const offset between its prio components. + * This function runs over a counter set and sum all of it's prio components. 
+ */ +static u64 en_stats_adder(__be64 *start, __be64 *next, int num) +{ + __be64 *curr = start; + u64 ret = 0; + int i; + int offset = next - start; + + for (i = 0; i < num; i++) { + ret += be64_to_cpu(*curr); + curr += offset; + } + + return ret; +} + static void mlx4_en_fold_software_stats(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -159,51 +184,20 @@ static void mlx4_en_fold_software_stats(struct net_device *dev) int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) { + struct mlx4_counter tmp_vport_stats; struct mlx4_en_stat_out_mbox *mlx4_en_stats; struct mlx4_en_stat_out_flow_control_mbox *flowstats; struct net_device *dev = mdev->pndev[port]; struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_vport_stats *vport_stats = &priv->vport_stats; - struct mlx4_cmd_mailbox *mailbox = NULL; - struct mlx4_cmd_mailbox *mailbox_flow = NULL; + struct mlx4_cmd_mailbox *mailbox; u64 in_mod = reset << 8 | port; int err; - int i; - int do_if_stat = 1; - unsigned long period = (unsigned long) (jiffies - priv->last_ifq_jiffies); - struct mlx4_en_vport_stats tmp_vport_stats; - - if (jiffies_to_msecs(period) < EN_IFQ_MIN_INTERVAL || - priv->counter_index == 0xff) - do_if_stat = 0; + int i, counter_index; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); - if (IS_ERR(mailbox)) { - err = PTR_ERR(mailbox); - goto mailbox_out; - } - - mailbox_flow = mlx4_alloc_cmd_mailbox(mdev->dev); - if (IS_ERR(mailbox_flow)) { - mlx4_free_cmd_mailbox(mdev->dev, mailbox); - err = PTR_ERR(mailbox_flow); - goto mailbox_out; - } - - /* 0xffs indicates invalid value */ - memset(mailbox_flow->buf, 0xff, sizeof(*flowstats) * - MLX4_NUM_PRIORITIES); - - if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) { - memset(mailbox_flow->buf, 0, sizeof(*flowstats)); - err = mlx4_cmd_box(mdev->dev, 0, mailbox_flow->dma, - in_mod | 1<<12, 0, MLX4_CMD_DUMP_ETH_STATS, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); - - if (err) - goto out; - } - + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); @@ -220,292 +214,83 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok; priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none; } - priv->port_stats.tx_chksum_offload = 0; priv->port_stats.queue_stopped = 0; priv->port_stats.wake_queue = 0; priv->port_stats.oversized_packets = 0; priv->port_stats.tso_packets = 0; priv->port_stats.defrag_attempts = 0; + for (i = 0; i < priv->tx_ring_num; i++) { - priv->port_stats.tx_chksum_offload += priv->tx_ring[i]->tx_csum; - priv->port_stats.queue_stopped += priv->tx_ring[i]->queue_stopped; - priv->port_stats.wake_queue += priv->tx_ring[i]->wake_queue; - priv->port_stats.oversized_packets += priv->tx_ring[i]->oversized_packets; - priv->port_stats.tso_packets += priv->tx_ring[i]->tso_packets; - priv->port_stats.defrag_attempts += priv->tx_ring[i]->defrag_attempts; + const struct mlx4_en_tx_ring *ring; + ring = priv->tx_ring[i]; + + priv->port_stats.tx_chksum_offload += ring->tx_csum; + priv->port_stats.queue_stopped += ring->queue_stopped; + priv->port_stats.wake_queue += ring->wake_queue; + priv->port_stats.oversized_packets += ring->oversized_packets; + priv->port_stats.tso_packets += ring->tso_packets; + priv->port_stats.defrag_attempts += ring->defrag_attempts; } - /* RX Statistics */ - priv->pkstats.rx_packets = 
be64_to_cpu(mlx4_en_stats->RTOT_prio_0) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_1) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_2) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_3) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_4) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_5) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_6) + - be64_to_cpu(mlx4_en_stats->RTOT_prio_7) + - be64_to_cpu(mlx4_en_stats->RTOT_novlan); - priv->pkstats.rx_bytes = be64_to_cpu(mlx4_en_stats->ROCT_prio_0) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_1) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_2) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_3) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_4) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_5) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_6) + - be64_to_cpu(mlx4_en_stats->ROCT_prio_7) + - be64_to_cpu(mlx4_en_stats->ROCT_novlan); - priv->pkstats.rx_multicast_packets = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->MCAST_novlan); - priv->pkstats.rx_broadcast_packets = be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->RBCAST_novlan); - priv->pkstats.rx_errors = be64_to_cpu(mlx4_en_stats->PCS) + - be32_to_cpu(mlx4_en_stats->RJBBR) + - be32_to_cpu(mlx4_en_stats->RCRC) + - be32_to_cpu(mlx4_en_stats->RRUNT) + - be64_to_cpu(mlx4_en_stats->RInRangeLengthErr) + - be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr) + - be32_to_cpu(mlx4_en_stats->RSHORT) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_0) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_1) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_2) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_3) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_4) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_5) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_6) + - be64_to_cpu(mlx4_en_stats->RGIANT_prio_7) + - be64_to_cpu(mlx4_en_stats->RGIANT_novlan); - priv->pkstats.rx_dropped = be32_to_cpu(mlx4_en_stats->RdropOvflw); + + priv->pkstats.rx_errors = + be64_to_cpu(mlx4_en_stats->PCS) + + be32_to_cpu(mlx4_en_stats->RJBBR) + + be32_to_cpu(mlx4_en_stats->RCRC) + + be32_to_cpu(mlx4_en_stats->RRUNT) + + be64_to_cpu(mlx4_en_stats->RInRangeLengthErr) + + be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr) + + be32_to_cpu(mlx4_en_stats->RSHORT) + + en_stats_adder(&mlx4_en_stats->RGIANT_prio_0, + &mlx4_en_stats->RGIANT_prio_1, + NUM_PRIORITIES); + priv->pkstats.tx_errors = + en_stats_adder(&mlx4_en_stats->TGIANT_prio_0, + &mlx4_en_stats->TGIANT_prio_1, + NUM_PRIORITIES); + priv->pkstats.rx_multicast_packets = + en_stats_adder(&mlx4_en_stats->MCAST_prio_0, + &mlx4_en_stats->MCAST_prio_1, + NUM_PRIORITIES); + priv->pkstats.rx_dropped = be32_to_cpu(mlx4_en_stats->RDROP); priv->pkstats.rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength); priv->pkstats.rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); priv->pkstats.rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC); - priv->pkstats.rx_jabbers = be32_to_cpu(mlx4_en_stats->RJBBR); - 
priv->pkstats.rx_in_range_length_error = be64_to_cpu(mlx4_en_stats->RInRangeLengthErr); - priv->pkstats.rx_out_range_length_error = be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr); - priv->pkstats.rx_lt_64_bytes_packets = be64_to_cpu(mlx4_en_stats->R64_prio_0) + - be64_to_cpu(mlx4_en_stats->R64_prio_1) + - be64_to_cpu(mlx4_en_stats->R64_prio_2) + - be64_to_cpu(mlx4_en_stats->R64_prio_3) + - be64_to_cpu(mlx4_en_stats->R64_prio_4) + - be64_to_cpu(mlx4_en_stats->R64_prio_5) + - be64_to_cpu(mlx4_en_stats->R64_prio_6) + - be64_to_cpu(mlx4_en_stats->R64_prio_7) + - be64_to_cpu(mlx4_en_stats->R64_novlan); - priv->pkstats.rx_127_bytes_packets = be64_to_cpu(mlx4_en_stats->R127_prio_0) + - be64_to_cpu(mlx4_en_stats->R127_prio_1) + - be64_to_cpu(mlx4_en_stats->R127_prio_2) + - be64_to_cpu(mlx4_en_stats->R127_prio_3) + - be64_to_cpu(mlx4_en_stats->R127_prio_4) + - be64_to_cpu(mlx4_en_stats->R127_prio_5) + - be64_to_cpu(mlx4_en_stats->R127_prio_6) + - be64_to_cpu(mlx4_en_stats->R127_prio_7) + - be64_to_cpu(mlx4_en_stats->R127_novlan); - priv->pkstats.rx_255_bytes_packets = be64_to_cpu(mlx4_en_stats->R255_prio_0) + - be64_to_cpu(mlx4_en_stats->R255_prio_1) + - be64_to_cpu(mlx4_en_stats->R255_prio_2) + - be64_to_cpu(mlx4_en_stats->R255_prio_3) + - be64_to_cpu(mlx4_en_stats->R255_prio_4) + - be64_to_cpu(mlx4_en_stats->R255_prio_5) + - be64_to_cpu(mlx4_en_stats->R255_prio_6) + - be64_to_cpu(mlx4_en_stats->R255_prio_7) + - be64_to_cpu(mlx4_en_stats->R255_novlan); - priv->pkstats.rx_511_bytes_packets = be64_to_cpu(mlx4_en_stats->R511_prio_0) + - be64_to_cpu(mlx4_en_stats->R511_prio_1) + - be64_to_cpu(mlx4_en_stats->R511_prio_2) + - be64_to_cpu(mlx4_en_stats->R511_prio_3) + - be64_to_cpu(mlx4_en_stats->R511_prio_4) + - be64_to_cpu(mlx4_en_stats->R511_prio_5) + - be64_to_cpu(mlx4_en_stats->R511_prio_6) + - be64_to_cpu(mlx4_en_stats->R511_prio_7) + - be64_to_cpu(mlx4_en_stats->R511_novlan); - priv->pkstats.rx_1023_bytes_packets = be64_to_cpu(mlx4_en_stats->R1023_prio_0) + - be64_to_cpu(mlx4_en_stats->R1023_prio_1) + - be64_to_cpu(mlx4_en_stats->R1023_prio_2) + - be64_to_cpu(mlx4_en_stats->R1023_prio_3) + - be64_to_cpu(mlx4_en_stats->R1023_prio_4) + - be64_to_cpu(mlx4_en_stats->R1023_prio_5) + - be64_to_cpu(mlx4_en_stats->R1023_prio_6) + - be64_to_cpu(mlx4_en_stats->R1023_prio_7) + - be64_to_cpu(mlx4_en_stats->R1023_novlan); - priv->pkstats.rx_1518_bytes_packets = be64_to_cpu(mlx4_en_stats->R1518_prio_0) + - be64_to_cpu(mlx4_en_stats->R1518_prio_1) + - be64_to_cpu(mlx4_en_stats->R1518_prio_2) + - be64_to_cpu(mlx4_en_stats->R1518_prio_3) + - be64_to_cpu(mlx4_en_stats->R1518_prio_4) + - be64_to_cpu(mlx4_en_stats->R1518_prio_5) + - be64_to_cpu(mlx4_en_stats->R1518_prio_6) + - be64_to_cpu(mlx4_en_stats->R1518_prio_7) + - be64_to_cpu(mlx4_en_stats->R1518_novlan); - priv->pkstats.rx_1522_bytes_packets = be64_to_cpu(mlx4_en_stats->R1522_prio_0) + - be64_to_cpu(mlx4_en_stats->R1522_prio_1) + - be64_to_cpu(mlx4_en_stats->R1522_prio_2) + - be64_to_cpu(mlx4_en_stats->R1522_prio_3) + - be64_to_cpu(mlx4_en_stats->R1522_prio_4) + - be64_to_cpu(mlx4_en_stats->R1522_prio_5) + - be64_to_cpu(mlx4_en_stats->R1522_prio_6) + - be64_to_cpu(mlx4_en_stats->R1522_prio_7) + - be64_to_cpu(mlx4_en_stats->R1522_novlan); - priv->pkstats.rx_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->R1548_prio_0) + - be64_to_cpu(mlx4_en_stats->R1548_prio_1) + - be64_to_cpu(mlx4_en_stats->R1548_prio_2) + - be64_to_cpu(mlx4_en_stats->R1548_prio_3) + - be64_to_cpu(mlx4_en_stats->R1548_prio_4) + - be64_to_cpu(mlx4_en_stats->R1548_prio_5) + - 
be64_to_cpu(mlx4_en_stats->R1548_prio_6) + - be64_to_cpu(mlx4_en_stats->R1548_prio_7) + - be64_to_cpu(mlx4_en_stats->R1548_novlan); - priv->pkstats.rx_gt_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->R2MTU_prio_0) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_1) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_2) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_3) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_4) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_5) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_6) + - be64_to_cpu(mlx4_en_stats->R2MTU_prio_7) + - be64_to_cpu(mlx4_en_stats->R2MTU_novlan); + priv->pkstats.rx_dropped = be32_to_cpu(mlx4_en_stats->RdropOvflw); + priv->pkstats.tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP); - /* Tx Stats */ - priv->pkstats.tx_packets = be64_to_cpu(mlx4_en_stats->TTOT_prio_0) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_1) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_2) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_3) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_4) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_5) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_6) + - be64_to_cpu(mlx4_en_stats->TTOT_prio_7) + - be64_to_cpu(mlx4_en_stats->TTOT_novlan); - priv->pkstats.tx_bytes = be64_to_cpu(mlx4_en_stats->TOCT_prio_0) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_1) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_2) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_3) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_4) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_5) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_6) + - be64_to_cpu(mlx4_en_stats->TOCT_prio_7) + - be64_to_cpu(mlx4_en_stats->TOCT_novlan); - priv->pkstats.tx_multicast_packets = be64_to_cpu(mlx4_en_stats->TMCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->TMCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->TMCAST_novlan); - priv->pkstats.tx_broadcast_packets = be64_to_cpu(mlx4_en_stats->TBCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->TBCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->TBCAST_novlan); - priv->pkstats.tx_errors = be64_to_cpu(mlx4_en_stats->TGIANT_prio_0) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_1) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_2) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_3) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_4) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_5) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_6) + - be64_to_cpu(mlx4_en_stats->TGIANT_prio_7) + - be64_to_cpu(mlx4_en_stats->TGIANT_novlan); - priv->pkstats.tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP) - - priv->pkstats.tx_errors; - priv->pkstats.tx_lt_64_bytes_packets = be64_to_cpu(mlx4_en_stats->T64_prio_0) + - be64_to_cpu(mlx4_en_stats->T64_prio_1) + - be64_to_cpu(mlx4_en_stats->T64_prio_2) + - be64_to_cpu(mlx4_en_stats->T64_prio_3) + - be64_to_cpu(mlx4_en_stats->T64_prio_4) + - be64_to_cpu(mlx4_en_stats->T64_prio_5) + - be64_to_cpu(mlx4_en_stats->T64_prio_6) + - be64_to_cpu(mlx4_en_stats->T64_prio_7) + - be64_to_cpu(mlx4_en_stats->T64_novlan); - priv->pkstats.tx_127_bytes_packets = be64_to_cpu(mlx4_en_stats->T127_prio_0) + - be64_to_cpu(mlx4_en_stats->T127_prio_1) + - 
be64_to_cpu(mlx4_en_stats->T127_prio_2) + - be64_to_cpu(mlx4_en_stats->T127_prio_3) + - be64_to_cpu(mlx4_en_stats->T127_prio_4) + - be64_to_cpu(mlx4_en_stats->T127_prio_5) + - be64_to_cpu(mlx4_en_stats->T127_prio_6) + - be64_to_cpu(mlx4_en_stats->T127_prio_7) + - be64_to_cpu(mlx4_en_stats->T127_novlan); - priv->pkstats.tx_255_bytes_packets = be64_to_cpu(mlx4_en_stats->T255_prio_0) + - be64_to_cpu(mlx4_en_stats->T255_prio_1) + - be64_to_cpu(mlx4_en_stats->T255_prio_2) + - be64_to_cpu(mlx4_en_stats->T255_prio_3) + - be64_to_cpu(mlx4_en_stats->T255_prio_4) + - be64_to_cpu(mlx4_en_stats->T255_prio_5) + - be64_to_cpu(mlx4_en_stats->T255_prio_6) + - be64_to_cpu(mlx4_en_stats->T255_prio_7) + - be64_to_cpu(mlx4_en_stats->T255_novlan); - priv->pkstats.tx_511_bytes_packets = be64_to_cpu(mlx4_en_stats->T511_prio_0) + - be64_to_cpu(mlx4_en_stats->T511_prio_1) + - be64_to_cpu(mlx4_en_stats->T511_prio_2) + - be64_to_cpu(mlx4_en_stats->T511_prio_3) + - be64_to_cpu(mlx4_en_stats->T511_prio_4) + - be64_to_cpu(mlx4_en_stats->T511_prio_5) + - be64_to_cpu(mlx4_en_stats->T511_prio_6) + - be64_to_cpu(mlx4_en_stats->T511_prio_7) + - be64_to_cpu(mlx4_en_stats->T511_novlan); - priv->pkstats.tx_1023_bytes_packets = be64_to_cpu(mlx4_en_stats->T1023_prio_0) + - be64_to_cpu(mlx4_en_stats->T1023_prio_1) + - be64_to_cpu(mlx4_en_stats->T1023_prio_2) + - be64_to_cpu(mlx4_en_stats->T1023_prio_3) + - be64_to_cpu(mlx4_en_stats->T1023_prio_4) + - be64_to_cpu(mlx4_en_stats->T1023_prio_5) + - be64_to_cpu(mlx4_en_stats->T1023_prio_6) + - be64_to_cpu(mlx4_en_stats->T1023_prio_7) + - be64_to_cpu(mlx4_en_stats->T1023_novlan); - priv->pkstats.tx_1518_bytes_packets = be64_to_cpu(mlx4_en_stats->T1518_prio_0) + - be64_to_cpu(mlx4_en_stats->T1518_prio_1) + - be64_to_cpu(mlx4_en_stats->T1518_prio_2) + - be64_to_cpu(mlx4_en_stats->T1518_prio_3) + - be64_to_cpu(mlx4_en_stats->T1518_prio_4) + - be64_to_cpu(mlx4_en_stats->T1518_prio_5) + - be64_to_cpu(mlx4_en_stats->T1518_prio_6) + - be64_to_cpu(mlx4_en_stats->T1518_prio_7) + - be64_to_cpu(mlx4_en_stats->T1518_novlan); - priv->pkstats.tx_1522_bytes_packets = be64_to_cpu(mlx4_en_stats->T1522_prio_0) + - be64_to_cpu(mlx4_en_stats->T1522_prio_1) + - be64_to_cpu(mlx4_en_stats->T1522_prio_2) + - be64_to_cpu(mlx4_en_stats->T1522_prio_3) + - be64_to_cpu(mlx4_en_stats->T1522_prio_4) + - be64_to_cpu(mlx4_en_stats->T1522_prio_5) + - be64_to_cpu(mlx4_en_stats->T1522_prio_6) + - be64_to_cpu(mlx4_en_stats->T1522_prio_7) + - be64_to_cpu(mlx4_en_stats->T1522_novlan); - priv->pkstats.tx_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->T1548_prio_0) + - be64_to_cpu(mlx4_en_stats->T1548_prio_1) + - be64_to_cpu(mlx4_en_stats->T1548_prio_2) + - be64_to_cpu(mlx4_en_stats->T1548_prio_3) + - be64_to_cpu(mlx4_en_stats->T1548_prio_4) + - be64_to_cpu(mlx4_en_stats->T1548_prio_5) + - be64_to_cpu(mlx4_en_stats->T1548_prio_6) + - be64_to_cpu(mlx4_en_stats->T1548_prio_7) + - be64_to_cpu(mlx4_en_stats->T1548_novlan); - priv->pkstats.tx_gt_1548_bytes_packets = be64_to_cpu(mlx4_en_stats->T2MTU_prio_0) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_1) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_2) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_3) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_4) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_5) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_6) + - be64_to_cpu(mlx4_en_stats->T2MTU_prio_7) + - be64_to_cpu(mlx4_en_stats->T2MTU_novlan); + /* RX stats */ + priv->pkstats.rx_packets = en_stats_adder(&mlx4_en_stats->RTOT_prio_0, + &mlx4_en_stats->RTOT_prio_1, + NUM_PRIORITIES); + priv->pkstats.rx_bytes = 
en_stats_adder(&mlx4_en_stats->ROCT_prio_0, + &mlx4_en_stats->ROCT_prio_1, + NUM_PRIORITIES); + priv->pkstats.rx_broadcast_packets = + en_stats_adder(&mlx4_en_stats->RBCAST_prio_0, + &mlx4_en_stats->RBCAST_prio_1, + NUM_PRIORITIES); + priv->pkstats.rx_jabbers = be32_to_cpu(mlx4_en_stats->RJBBR); + priv->pkstats.rx_in_range_length_error = + be64_to_cpu(mlx4_en_stats->RInRangeLengthErr); + priv->pkstats.rx_out_range_length_error = + be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr); + + /* Tx stats */ + priv->pkstats.tx_packets = en_stats_adder(&mlx4_en_stats->TTOT_prio_0, + &mlx4_en_stats->TTOT_prio_1, + NUM_PRIORITIES); + priv->pkstats.tx_bytes = en_stats_adder(&mlx4_en_stats->TOCT_prio_0, + &mlx4_en_stats->TOCT_prio_1, + NUM_PRIORITIES); + priv->pkstats.tx_multicast_packets = + en_stats_adder(&mlx4_en_stats->TMCAST_prio_0, + &mlx4_en_stats->TMCAST_prio_1, + NUM_PRIORITIES); + priv->pkstats.tx_broadcast_packets = + en_stats_adder(&mlx4_en_stats->TBCAST_prio_0, + &mlx4_en_stats->TBCAST_prio_1, + NUM_PRIORITIES); priv->pkstats.rx_prio[0][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0); priv->pkstats.rx_prio[0][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_0); @@ -544,49 +329,22 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan); priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan); - flowstats = mailbox_flow->buf; - - for (i = 0; i < MLX4_NUM_PRIORITIES; i++) { - priv->flowstats[i].rx_pause = - be64_to_cpu(flowstats[i].rx_pause); - priv->flowstats[i].rx_pause_duration = - be64_to_cpu(flowstats[i].rx_pause_duration); - priv->flowstats[i].rx_pause_transition = - be64_to_cpu(flowstats[i].rx_pause_transition); - priv->flowstats[i].tx_pause = - be64_to_cpu(flowstats[i].tx_pause); - priv->flowstats[i].tx_pause_duration = - be64_to_cpu(flowstats[i].tx_pause_duration); - priv->flowstats[i].tx_pause_transition = - be64_to_cpu(flowstats[i].tx_pause_transition); - } - mlx4_en_fold_software_stats(dev); spin_unlock(&priv->stats_lock); memset(&tmp_vport_stats, 0, sizeof(tmp_vport_stats)); + counter_index = mlx4_get_default_counter_index(mdev->dev, port); + err = mlx4_get_counter_stats(mdev->dev, counter_index, + &tmp_vport_stats, reset); - err = mlx4_get_vport_ethtool_stats(mdev->dev, port, - &tmp_vport_stats, reset); spin_lock(&priv->stats_lock); if (!err) { /* ethtool stats format */ - vport_stats->rx_unicast_packets = tmp_vport_stats.rx_unicast_packets; - vport_stats->rx_unicast_bytes = tmp_vport_stats.rx_unicast_bytes; - vport_stats->rx_multicast_packets = tmp_vport_stats.rx_multicast_packets; - vport_stats->rx_multicast_bytes = tmp_vport_stats.rx_multicast_bytes; - vport_stats->rx_broadcast_packets = tmp_vport_stats.rx_broadcast_packets; - vport_stats->rx_broadcast_bytes = tmp_vport_stats.rx_broadcast_bytes; - vport_stats->rx_dropped = tmp_vport_stats.rx_dropped; - vport_stats->rx_errors = tmp_vport_stats.rx_errors; - vport_stats->tx_unicast_packets = tmp_vport_stats.tx_unicast_packets; - vport_stats->tx_unicast_bytes = tmp_vport_stats.tx_unicast_bytes; - vport_stats->tx_multicast_packets = tmp_vport_stats.tx_multicast_packets; - vport_stats->tx_multicast_bytes = tmp_vport_stats.tx_multicast_bytes; - vport_stats->tx_broadcast_packets = tmp_vport_stats.tx_broadcast_packets; - vport_stats->tx_broadcast_bytes = tmp_vport_stats.tx_broadcast_bytes; - vport_stats->tx_errors = tmp_vport_stats.tx_errors; + vport_stats->rx_bytes = be64_to_cpu(tmp_vport_stats.rx_bytes); + vport_stats->rx_frames = 
be64_to_cpu(tmp_vport_stats.rx_frames); + vport_stats->tx_bytes = be64_to_cpu(tmp_vport_stats.tx_bytes); + vport_stats->tx_frames = be64_to_cpu(tmp_vport_stats.tx_frames); } #if __FreeBSD_version >= 1100000 @@ -623,13 +381,128 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) spin_unlock(&priv->stats_lock); -out: - mlx4_free_cmd_mailbox(mdev->dev, mailbox_flow); - mlx4_free_cmd_mailbox(mdev->dev, mailbox); + /* 0xffs indicates invalid value */ + memset(mailbox->buf, 0xff, sizeof(*flowstats) * MLX4_NUM_PRIORITIES); -mailbox_out: - if (do_if_stat) - priv->last_ifq_jiffies = jiffies; + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) { + memset(mailbox->buf, 0, + sizeof(*flowstats) * MLX4_NUM_PRIORITIES); + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, + in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL, + 0, MLX4_CMD_DUMP_ETH_STATS, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + } + + flowstats = mailbox->buf; + + spin_lock(&priv->stats_lock); + + for (i = 0; i < MLX4_NUM_PRIORITIES; i++) { + priv->rx_priority_flowstats[i].rx_pause = + be64_to_cpu(flowstats[i].rx_pause); + priv->rx_priority_flowstats[i].rx_pause_duration = + be64_to_cpu(flowstats[i].rx_pause_duration); + priv->rx_priority_flowstats[i].rx_pause_transition = + be64_to_cpu(flowstats[i].rx_pause_transition); + priv->tx_priority_flowstats[i].tx_pause = + be64_to_cpu(flowstats[i].tx_pause); + priv->tx_priority_flowstats[i].tx_pause_duration = + be64_to_cpu(flowstats[i].tx_pause_duration); + priv->tx_priority_flowstats[i].tx_pause_transition = + be64_to_cpu(flowstats[i].tx_pause_transition); + } + + /* if pfc is not in use, all priorities counters have the same value */ + priv->rx_flowstats.rx_pause = + be64_to_cpu(flowstats[0].rx_pause); + priv->rx_flowstats.rx_pause_duration = + be64_to_cpu(flowstats[0].rx_pause_duration); + priv->rx_flowstats.rx_pause_transition = + be64_to_cpu(flowstats[0].rx_pause_transition); + priv->tx_flowstats.tx_pause = + be64_to_cpu(flowstats[0].tx_pause); + priv->tx_flowstats.tx_pause_duration = + be64_to_cpu(flowstats[0].tx_pause_duration); + priv->tx_flowstats.tx_pause_transition = + be64_to_cpu(flowstats[0].tx_pause_transition); + + spin_unlock(&priv->stats_lock); + +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int mlx4_en_get_vport_stats(struct mlx4_en_dev *mdev, u8 port) +{ + struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); + struct mlx4_counter tmp_vport_stats; + struct mlx4_en_vf_stats *vf_stats = &priv->vf_stats; + int err, i, counter_index; + + spin_lock(&priv->stats_lock); + + priv->pkstats.rx_packets = 0; + priv->pkstats.rx_bytes = 0; + priv->port_stats.rx_chksum_good = 0; + priv->port_stats.rx_chksum_none = 0; + for (i = 0; i < priv->rx_ring_num; i++) { + priv->pkstats.rx_packets += priv->rx_ring[i]->packets; + priv->pkstats.rx_bytes += priv->rx_ring[i]->bytes; + priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok; + priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none; + } + priv->pkstats.tx_packets = 0; + priv->pkstats.tx_bytes = 0; + priv->port_stats.tx_chksum_offload = 0; + priv->port_stats.queue_stopped = 0; + priv->port_stats.wake_queue = 0; + + for (i = 0; i < priv->tx_ring_num; i++) { + const struct mlx4_en_tx_ring *ring = priv->tx_ring[i]; + + priv->pkstats.tx_packets += ring->packets; + priv->pkstats.tx_bytes += ring->bytes; + priv->port_stats.tx_chksum_offload += ring->tx_csum; + priv->port_stats.queue_stopped += ring->queue_stopped; + priv->port_stats.wake_queue += 
ring->wake_queue; + priv->port_stats.oversized_packets += priv->tx_ring[i]->oversized_packets; + } + + spin_unlock(&priv->stats_lock); + + memset(&tmp_vport_stats, 0, sizeof(tmp_vport_stats)); + + counter_index = mlx4_get_default_counter_index(mdev->dev, port); + err = mlx4_get_counter_stats(mdev->dev, counter_index, + &tmp_vport_stats, 0); + + if (!err) { + spin_lock(&priv->stats_lock); + + vf_stats->rx_bytes = be64_to_cpu(tmp_vport_stats.rx_bytes); + vf_stats->rx_frames = be64_to_cpu(tmp_vport_stats.rx_frames); + vf_stats->tx_bytes = be64_to_cpu(tmp_vport_stats.tx_bytes); + vf_stats->tx_frames = be64_to_cpu(tmp_vport_stats.tx_frames); + + priv->pkstats.rx_packets = vf_stats->rx_frames; + priv->pkstats.rx_bytes = vf_stats->rx_bytes; + priv->pkstats.tx_packets = vf_stats->tx_frames; + priv->pkstats.tx_bytes = vf_stats->tx_bytes; + + /* PF&VFs are not expected to report errors in ifconfig. + * rx_errors will be reprted in PF's ethtool statistics, + * see: mlx4_en_DUMP_ETH_STATS + */ + priv->pkstats.rx_errors = 0; + priv->pkstats.rx_dropped = 0; + priv->pkstats.tx_dropped = 0; + priv->pkstats.rx_multicast_packets = 0; + + spin_unlock(&priv->stats_lock); + } return err; } diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c b/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c index fd270ede1ff1..db8beb501594 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c @@ -70,7 +70,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, /* disable multicast loopback to qp with same counter */ context->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB; context->pri_path.vlan_control |= - MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER; + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; } context->cqn_send = cpu_to_be32(cqn); diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c b/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c index 300478205920..7266cfa83ad6 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c @@ -252,6 +252,26 @@ static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, } } +void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev) +{ + int i; + int num_of_eqs; + int num_rx_rings; + struct mlx4_dev *dev = mdev->dev; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + num_of_eqs = max_t(int, MIN_RX_RINGS, + min_t(int, + mlx4_get_eqs_per_port(mdev->dev, i), + DEF_RX_RINGS)); + + num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS : + num_of_eqs; + mdev->profile.prof[i].rx_ring_num = + rounddown_pow_of_two(num_rx_rings); + } +} + void mlx4_en_calc_rx_buf(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -259,7 +279,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev) MLX4_NET_IP_ALIGN; if (eff_mtu > MJUM16BYTES) { - en_err(priv, "MTU(%d) is too big\n", dev->if_mtu); + en_err(priv, "MTU(%u) is too big\n", (unsigned)dev->if_mtu); eff_mtu = MJUM16BYTES; } else if (eff_mtu > MJUM9BYTES) { eff_mtu = MJUM16BYTES; @@ -399,7 +419,7 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) __be32 *ptr = (__be32 *)ring->buf; __be32 stamp = cpu_to_be32(1 << STAMP_SHIFT); *ptr = stamp; - /* Move pointer to start of rx section */ + /* Move pointer to start of rx section */ ring->buf += TXBB_SIZE; } @@ -607,7 +627,7 @@ mlx4_en_rss_hash(__be16 status, int udp_rss) * was added in the beginning of each cqe (the real data is in the corresponding 32B). 
* The following calc ensures that when factor==1, it means we are aligned to 64B * and we get the real cqe data*/ -#define CQE_FACTOR_INDEX(index, factor) ((index << factor) + factor) +#define CQE_FACTOR_INDEX(index, factor) (((index) << (factor)) + (factor)) int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -676,7 +696,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud M_HASHTYPE_SET(mb, mlx4_en_rss_hash(cqe->status, udp_rss)); mb->m_pkthdr.rcvif = dev; if (be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK) { + MLX4_CQE_CVLAN_PRESENT_MASK) { mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid); mb->m_flags |= M_VLANTAG; } @@ -802,7 +822,7 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, return -ENOMEM; } - err = mlx4_qp_alloc(mdev->dev, qpn, qp); + err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); goto out; @@ -842,7 +862,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) en_err(priv, "Failed reserving drop qpn\n"); return err; } - err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp); + err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating drop qp\n"); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); @@ -930,7 +950,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) } /* Configure RSS indirection qp */ - err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp); + err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); goto rss_err; diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c b/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c index ad0b3ccefdcf..f3d700502977 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c @@ -52,17 +52,6 @@ #include "en.h" -enum { - MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ - MAX_BF = 256, - MIN_PKT_LEN = 17, -}; - -static int inline_thold __read_mostly = MAX_INLINE; - -module_param_named(inline_thold, inline_thold, uint, 0444); -MODULE_PARM_DESC(inline_thold, "threshold for using inline data"); - int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, u32 size, u16 stride, int node, int queue_idx) @@ -101,7 +90,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->size = size; ring->size_mask = size - 1; ring->stride = stride; - ring->inline_thold = MAX(MIN_PKT_LEN, MIN(inline_thold, MAX_INLINE)); + ring->inline_thold = MAX(MIN_PKT_LEN, MIN(priv->prof->inline_thold, MAX_INLINE)); mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); @@ -163,13 +152,13 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, - MLX4_RESERVE_BF_QP); + MLX4_RESERVE_ETH_BF_QP); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_map; } - err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; @@ -185,8 +174,6 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, } else ring->bf_enabled = true; ring->queue_index = queue_idx; - if (queue_idx < priv->num_tx_rings_p_up ) - CPU_SET(queue_idx, 
&ring->affinity_mask); *pring = ring; return 0; @@ -447,8 +434,8 @@ static int mlx4_en_process_tx_cq(struct net_device *dev, ring->blocked = 0; if (atomic_fetchadd_int(&priv->blocked, -1) == 1) atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); - ring->wake_queue++; priv->port_stats.wake_queue++; + ring->wake_queue++; } return (0); } @@ -752,7 +739,7 @@ static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp /* check for VLAN tag */ if (mb->m_flags & M_VLANTAG) { tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag); - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN; + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN; } else { tx_desc->ctrl.vlan_tag = 0; tx_desc->ctrl.ins_vlan = 0; @@ -930,7 +917,7 @@ static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp ring->prod += tx_info->nr_txbb; if (ring->bf_enabled && bf_size <= MAX_BF && - (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_VLAN)) { + (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_CVLAN)) { /* store doorbell number */ *(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn); diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib.h b/sys/dev/mlx4/mlx4_ib/mlx4_ib.h index e395d6d5f499..d04bd8cdb797 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib.h +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib.h @@ -771,9 +771,6 @@ int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast, u8 port); -int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, - union mlx4_counter *counter, u8 clear); - static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_cq.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_cq.c index cec37169f3d5..11aab31f9e31 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_cq.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_cq.c @@ -105,12 +105,8 @@ int mlx4_ib_modify_cq(struct ib_cq *cq, if (cq_attr->cq_cap_flags & IB_CQ_TIMESTAMP) return -ENOTSUPP; - if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) { - if (dev->dev->caps.cq_flags & MLX4_DEV_CAP_CQ_FLAG_IO) - err = mlx4_cq_ignore_overrun(dev->dev, &mcq->mcq); - else - err = -ENOSYS; - } + if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) + return -ENOSYS; } if (!err) @@ -127,7 +123,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * int err; err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, - PAGE_SIZE * 2, &buf->buf); + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); if (err) goto out; @@ -138,7 +134,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -248,7 +244,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1); + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); if (err) goto err_cq; @@ -509,13 +505,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) int mlx4_ib_ignore_overrun_cq(struct ib_cq *ibcq) { - struct mlx4_ib_dev *dev = to_mdev(ibcq->device); - struct mlx4_ib_cq *cq = to_mcq(ibcq); - - if (dev->dev->caps.fw_ver < MLX4_FW_VER_IGNORE_OVERRUN_CQ) - return -ENOSYS; - - return mlx4_cq_ignore_overrun(dev->dev, &cq->mcq); + return -ENOSYS; } int 
mlx4_ib_destroy_cq(struct ib_cq *cq) @@ -862,6 +852,8 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, } if (timestamp_en) { + const struct mlx4_ts_cqe *ts_cqe = + (const struct mlx4_ts_cqe *)cqe; /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't * supported */ @@ -869,9 +861,9 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, wc->ts.timestamp = 0; } else { wc->ts.timestamp = - ((u64)(be32_to_cpu(cqe->timestamp_16_47) - + !cqe->timestamp_0_15) << 16) - | be16_to_cpu(cqe->timestamp_0_15); + ((u64)(be32_to_cpu(ts_cqe->timestamp_hi) + + !ts_cqe->timestamp_lo) << 16) + | be16_to_cpu(ts_cqe->timestamp_lo); wc->wc_flags |= IB_WC_WITH_TIMESTAMP; } } else { @@ -895,7 +887,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, wc->wc_flags |= IB_WC_WITH_SL; } if ((be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK) && !timestamp_en) { + MLX4_CQE_CVLAN_PRESENT_MASK) && !timestamp_en) { wc->vlan_id = be16_to_cpu(cqe->sl_vid) & MLX4_CQE_VID_MASK; wc->wc_flags |= IB_WC_WITH_VLAN; diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c index e86764b68f40..e0ff3d8013ee 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_mad.c @@ -788,204 +788,51 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } -static void edit_counter_ext(struct mlx4_if_stat_extended *cnt, void *counters, - __be16 attr_id) +static void edit_counter(struct mlx4_counter *cnt, void *counters, + __be16 attr_id) { switch (attr_id) { case IB_PMA_PORT_COUNTERS: { struct ib_pma_portcounters *pma_cnt = - (struct ib_pma_portcounters *)counters; - pma_cnt->port_xmit_data = - cpu_to_be32((be64_to_cpu(cnt->counters[0]. - IfTxUnicastOctets) + - be64_to_cpu(cnt->counters[0]. - IfTxMulticastOctets) + - be64_to_cpu(cnt->counters[0]. - IfTxBroadcastOctets) + - be64_to_cpu(cnt->counters[0]. - IfTxDroppedOctets)) >> 2); - pma_cnt->port_rcv_data = - cpu_to_be32((be64_to_cpu(cnt->counters[0]. - IfRxUnicastOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxMulticastOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxBroadcastOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxNoBufferOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxErrorOctets)) >> 2); - pma_cnt->port_xmit_packets = - cpu_to_be32(be64_to_cpu(cnt->counters[0]. - IfTxUnicastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxMulticastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxBroadcastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxDroppedFrames)); - pma_cnt->port_rcv_packets = - cpu_to_be32(be64_to_cpu(cnt->counters[0]. - IfRxUnicastFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxMulticastFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxBroadcastFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxNoBufferFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxErrorFrames)); - pma_cnt->port_rcv_errors = cpu_to_be32(be64_to_cpu(cnt-> - counters[0]. - IfRxErrorFrames)); - break; - } + (struct ib_pma_portcounters *)counters; - case IB_PMA_PORT_COUNTERS_EXT: - { - struct ib_pma_portcounters_ext *pma_cnt_ext = - (struct ib_pma_portcounters_ext *)counters; - - pma_cnt_ext->port_xmit_data = - cpu_to_be64((be64_to_cpu(cnt->counters[0]. - IfTxUnicastOctets) + - be64_to_cpu(cnt->counters[0]. - IfTxMulticastOctets) + - be64_to_cpu(cnt->counters[0]. - IfTxBroadcastOctets) + - be64_to_cpu(cnt->counters[0]. - IfTxDroppedOctets)) >> 2); - pma_cnt_ext->port_rcv_data = - cpu_to_be64((be64_to_cpu(cnt->counters[0]. 
- IfRxUnicastOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxMulticastOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxBroadcastOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxNoBufferOctets) + - be64_to_cpu(cnt->counters[0]. - IfRxErrorOctets)) >> 2); - pma_cnt_ext->port_xmit_packets = - cpu_to_be64(be64_to_cpu(cnt->counters[0]. - IfTxUnicastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxMulticastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxBroadcastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxDroppedFrames)); - pma_cnt_ext->port_rcv_packets = - cpu_to_be64(be64_to_cpu(cnt->counters[0]. - IfRxUnicastFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxMulticastFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxBroadcastFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxNoBufferFrames) + - be64_to_cpu(cnt->counters[0]. - IfRxErrorFrames)); - pma_cnt_ext->port_unicast_xmit_packets = cnt->counters[0]. - IfTxUnicastFrames; - pma_cnt_ext->port_unicast_rcv_packets = cnt->counters[0]. - IfRxUnicastFrames; - pma_cnt_ext->port_multicast_xmit_packets = - cpu_to_be64(be64_to_cpu(cnt->counters[0]. - IfTxMulticastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxBroadcastFrames)); - pma_cnt_ext->port_multicast_rcv_packets = - cpu_to_be64(be64_to_cpu(cnt->counters[0]. - IfTxMulticastFrames) + - be64_to_cpu(cnt->counters[0]. - IfTxBroadcastFrames)); - - break; - } - - default: - pr_warn("Unsupported attr_id 0x%x\n", attr_id); - break; - } - -} - -static void edit_counter(struct mlx4_if_stat_basic *cnt, void *counters, - __be16 attr_id) -{ - switch (attr_id) { - case IB_PMA_PORT_COUNTERS: - { - struct ib_pma_portcounters *pma_cnt = - (struct ib_pma_portcounters *) counters; - pma_cnt->port_xmit_data = - cpu_to_be32(be64_to_cpu( - cnt->counters[0].IfTxOctets) >> 2); - pma_cnt->port_rcv_data = - cpu_to_be32(be64_to_cpu( - cnt->counters[0].IfRxOctets) >> 2); - pma_cnt->port_xmit_packets = - cpu_to_be32(be64_to_cpu(cnt->counters[0].IfTxFrames)); - pma_cnt->port_rcv_packets = - cpu_to_be32(be64_to_cpu(cnt->counters[0].IfRxFrames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (be64_to_cpu(cnt->tx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (be64_to_cpu(cnt->rx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + be64_to_cpu(cnt->tx_frames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + be64_to_cpu(cnt->rx_frames)); break; } case IB_PMA_PORT_COUNTERS_EXT: { struct ib_pma_portcounters_ext *pma_cnt_ext = - (struct ib_pma_portcounters_ext *) counters; + (struct ib_pma_portcounters_ext *)counters; pma_cnt_ext->port_xmit_data = - cpu_to_be64((be64_to_cpu(cnt->counters[0]. - IfTxOctets) >> 2)); - pma_cnt_ext->port_rcv_data = - cpu_to_be64((be64_to_cpu(cnt->counters[0]. 
- IfRxOctets) >> 2)); - pma_cnt_ext->port_xmit_packets = cnt->counters[0].IfTxFrames; - pma_cnt_ext->port_rcv_packets = cnt->counters[0].IfRxFrames; + cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2); + pma_cnt_ext->port_xmit_packets = cnt->tx_frames; + pma_cnt_ext->port_rcv_packets = cnt->rx_frames; break; } default: - pr_warn("Unsupported attr_id 0x%x\n", attr_id); break; } } -int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index, - union mlx4_counter *counter, u8 clear) -{ - struct mlx4_cmd_mailbox *mailbox; - int err; - u32 inmod = counter_index | ((clear & 1) << 31); - - mailbox = mlx4_alloc_cmd_mailbox(dev->dev); - if (IS_ERR(mailbox)) - return IB_MAD_RESULT_FAILURE; - - err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, - MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, - MLX4_CMD_NATIVE); - if (!err) - memcpy(counter, mailbox->buf, MLX4_IF_STAT_SZ(1)); - - mlx4_free_cmd_mailbox(dev->dev, mailbox); - - return err; -} - static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad) + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) { + struct mlx4_counter counter_stats; struct mlx4_ib_dev *dev = to_mdev(ibdev); int err; u32 counter_index = dev->counters[port_num - 1].counter_index & 0xffff; - u8 mode; - char counter_buf[MLX4_IF_STAT_SZ(1)] __aligned(8); - union mlx4_counter *counter = (union mlx4_counter *) - counter_buf; if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) return -EINVAL; @@ -996,23 +843,21 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, memset(out_mad->data, 0, sizeof out_mad->data); err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } else { - if (mlx4_ib_query_if_stat(dev, counter_index, counter, 0)) + memset(&counter_stats, 0, sizeof(counter_stats)); + err = mlx4_get_counter_stats(dev->dev, + counter_index, + &counter_stats, 0); + if (err) return IB_MAD_RESULT_FAILURE; memset(out_mad->data, 0, sizeof(out_mad->data)); - mode = counter->control.cnt_mode & 0xFF; err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; - switch (mode & 0xf) { + switch (counter_stats.counter_mode & 0xf) { case 0: - edit_counter((void *)counter, + edit_counter(&counter_stats, (void *)(out_mad->data + 40), in_mad->mad_hdr.attr_id); break; - case 1: - edit_counter_ext((void *)counter, - (void *)(out_mad->data + 40), - in_mad->mad_hdr.attr_id); - break; default: err = IB_MAD_RESULT_FAILURE; } @@ -1106,7 +951,7 @@ static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, - MLX4_EQ_PORT_INFO_LID_CHANGE_MASK, 0, 0); + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); } static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) @@ -1118,7 +963,7 @@ static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) if (!dev->sriov.is_going_down) { mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, - MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK, 0, 0); + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); } } mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); @@ -1204,11 +1049,6 @@ void handle_port_mgmt_change_event(struct work_struct *work) u16 lid = 
be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; update_sm_ah(dev, port, lid, sl); - mlx4_ib_dispatch_event(dev, port, IB_EVENT_SM_CHANGE); - if (mlx4_is_master(dev->dev)) - mlx4_gen_slaves_port_mgt_ev(dev->dev, port, - changed_attr & MSTR_SM_CHANGE_MASK, - lid, sl); } /* Check if it is a lid change event */ @@ -1221,7 +1061,7 @@ void handle_port_mgmt_change_event(struct work_struct *work) /*if master, notify all slaves*/ if (mlx4_is_master(dev->dev)) mlx4_gen_slaves_port_mgt_ev(dev->dev, port, - MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK, 0, 0); + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); } if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) @@ -1436,7 +1276,7 @@ static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) return slave; gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; - vfs = dev->dev->num_vfs; + vfs = dev->dev->persist->num_vfs; if (slave == 0) return 0; diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c index 298e7e698937..0f089ea0a51c 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c @@ -77,28 +77,6 @@ int mlx4_ib_sm_guid_assign = 1; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); -enum { - MAX_NUM_STR_BITMAP = 1 << 15, - DEFAULT_TBL_VAL = -1 -}; - -static struct mlx4_dbdf2val_lst dev_assign_str = { - .name = "dev_assign_str param", - .num_vals = 1, - .def_val = {DEFAULT_TBL_VAL}, - .range = {0, MAX_NUM_STR_BITMAP - 1} -}; -module_param_string(dev_assign_str, dev_assign_str.str, - sizeof(dev_assign_str.str), 0444); -MODULE_PARM_DESC(dev_assign_str, - "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n" - "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for IB device numbers (e.g. 
1).\n" - "\t\tMax supported devices - 32"); - - -static unsigned long *dev_num_str_bitmap; -static spinlock_t dev_num_str_lock; - static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION "\n"; @@ -117,8 +95,6 @@ struct dev_rec { int nr; }; -static int dr_active; - static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*, @@ -169,6 +145,7 @@ int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; + struct mlx4_clock_params clock_params; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -213,8 +190,6 @@ int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; - if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL) - props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (check_flow_steering_support(dev->dev)) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; @@ -235,7 +210,7 @@ int mlx4_ib_query_device(struct ib_device *ibdev, } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = dev->dev->pdev->device; + props->vendor_part_id = dev->dev->persist->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); @@ -267,9 +242,12 @@ int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock; + + if (!mlx4_is_slave(dev->dev)) + err = mlx4_get_internal_clock_params(dev->dev, &clock_params); if (dev->dev->caps.hca_core_clock > 0) props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; - if (dev->dev->caps.cq_timestamp) { + if (!err && !mlx4_is_slave(dev->dev)) { props->timestamp_mask = 0xFFFFFFFFFFFF; props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } @@ -831,8 +809,9 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, - (pci_resource_start(dev->dev->pdev, - params.bar) + params.offset) + (pci_resource_start(dev->dev->persist->pdev, + params.bar) + + params.offset) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; @@ -1088,12 +1067,12 @@ static int parse_flow_attr(struct mlx4_dev *dev, default: return -EINVAL; } - if (map_sw_to_hw_steering_id(dev, type) < 0 || - hw_rule_sz(dev, type) < 0) + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) return -EINVAL; - mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type)); - mlx4_spec->size = hw_rule_sz(dev, type) >> 2; - return hw_rule_sz(dev, type); + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); } static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, @@ -1125,7 +1104,8 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att pr_err("Invalid domain value.\n"); return -EINVAL; } - if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) return -EINVAL; mailbox = 
mlx4_alloc_cmd_mailbox(mdev->dev); @@ -1136,7 +1116,7 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | flow_attr->priority); - ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); ctrl->port = flow_attr->port; ctrl->qpn = cpu_to_be32(qp->qp_num); @@ -1511,7 +1491,7 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); - return sprintf(buf, "MT%d\n", dev->dev->pdev->device); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, @@ -1541,33 +1521,16 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr, dev->dev->board_id); } -static ssize_t show_vsd(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); - ssize_t len = MLX4_VSD_LEN; - - if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX) - len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd); - else - memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN); - - return len; -} - static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); -static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, - &dev_attr_board_id, - &dev_attr_vsd + &dev_attr_board_id }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port) @@ -1951,7 +1914,8 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) int i; if (mlx4_is_master(ibdev->dev)) { - for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; @@ -1978,82 +1942,52 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { - char name[32]; - int eq_per_port = 0; - int added_eqs = 0; - int total_eqs = 0; - int i, j, eq; + int i, j, eq = 0, total_eqs = 0; - /* Legacy mode or comp_pool is not large enough */ - if (dev->caps.comp_pool == 0 || - dev->caps.num_ports > dev->caps.comp_pool) - return; - - eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ - dev->caps.num_ports); - - /* Init eq table */ - added_eqs = 0; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) - added_eqs += eq_per_port; - - total_eqs = dev->caps.num_comp_vectors + added_eqs; - - ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); + ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors, + sizeof(ibdev->eq_table[0]), GFP_KERNEL); if (!ibdev->eq_table) return; - ibdev->eq_added = added_eqs; - - eq = 0; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { - for (j = 0; j < eq_per_port; j++) { - sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, - pci_get_domain(dev->pdev->dev.bsddev), - pci_get_bus(dev->pdev->dev.bsddev), - PCI_SLOT(dev->pdev->devfn), - PCI_FUNC(dev->pdev->devfn)); - - /* Set IRQ for specific name (per ring) */ - if (mlx4_assign_eq(dev, name, - &ibdev->eq_table[eq])) { - /* 
Use legacy (same as mlx4_en driver) */ - pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); - ibdev->eq_table[eq] = - (eq % dev->caps.num_comp_vectors); - } - eq++; + for (i = 1; i <= dev->caps.num_ports; i++) { + for (j = 0; j < mlx4_get_eqs_per_port(dev, i); + j++, total_eqs++) { + if (i > 1 && mlx4_is_eq_shared(dev, total_eqs)) + continue; + ibdev->eq_table[eq] = total_eqs; + if (!mlx4_assign_eq(dev, i, + &ibdev->eq_table[eq])) + eq++; + else + ibdev->eq_table[eq] = -1; } } - /* Fill the reset of the vector with legacy EQ */ - for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) - ibdev->eq_table[eq++] = i; + for (i = eq; i < dev->caps.num_comp_vectors; + ibdev->eq_table[i++] = -1) + ; /* Advertise the new number of EQs to clients */ - ibdev->ib_dev.num_comp_vectors = total_eqs; + ibdev->ib_dev.num_comp_vectors = eq; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; + int total_eqs = ibdev->ib_dev.num_comp_vectors; - /* no additional eqs were added */ + /* no eqs were allocated */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ - ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + ibdev->ib_dev.num_comp_vectors = 0; - /* Free only the added eqs */ - for (i = 0; i < ibdev->eq_added; i++) { - /* Don't free legacy eqs if used */ - if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) - continue; + for (i = 0; i < total_eqs; i++) mlx4_release_eq(dev, ibdev->eq_table[i]); - } kfree(ibdev->eq_table); + ibdev->eq_table = NULL; } /* @@ -2079,8 +2013,8 @@ static size_t show_diag_rprt(struct device *device, char *buf, struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); - ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, - &counter_offset, &diag_counter); + ret = mlx4_query_diag_counters(dev->dev, op_modifier, + &counter_offset, &diag_counter, 1, 0); if (ret) return ret; @@ -2095,8 +2029,8 @@ static ssize_t clear_diag_counters(struct device *device, struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); - ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, - NULL, NULL); + ret = mlx4_query_diag_counters(dev->dev, MLX4_DIAG_RPRT_CLEAR_DIAGS, + NULL, NULL, 0, 0); if (ret) return ret; @@ -2172,63 +2106,6 @@ static struct attribute_group diag_counters_group = { .attrs = diag_rprt_attrs }; -static void init_dev_assign(void) -{ - int i = 1; - - spin_lock_init(&dev_num_str_lock); - if (mlx4_fill_dbdf2val_tbl(&dev_assign_str)) - return; - dev_num_str_bitmap = - kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long), - GFP_KERNEL); - if (!dev_num_str_bitmap) { - pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n"); - return; - } - bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP); - while ((i < MLX4_DEVS_TBL_SIZE) && (dev_assign_str.tbl[i].dbdf != - MLX4_ENDOF_TBL)) { - if (bitmap_allocate_region(dev_num_str_bitmap, - dev_assign_str.tbl[i].val[0], 0)) - goto err; - i++; - } - dr_active = 1; - return; - -err: - kfree(dev_num_str_bitmap); - dev_num_str_bitmap = NULL; - pr_warn("mlx4_ib: The value of 'dev_assign_str' parameter " - "is incorrect. 
The parameter value is discarded!"); -} - -static int mlx4_ib_dev_idx(struct mlx4_dev *dev) -{ - int i, val; - - if (!dr_active) - return -1; - if (!dev) - return -1; - if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val)) - return -1; - - if (val != DEFAULT_TBL_VAL) { - dev->flags |= MLX4_FLAG_DEV_NUM_STR; - return val; - } - - spin_lock(&dev_num_str_lock); - i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0); - spin_unlock(&dev_num_str_lock); - if (i >= 0) - return i; - - return -1; -} - static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { @@ -2242,7 +2119,7 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, } else { if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; - if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCEV2) + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->core_cap_flags |= RDMA_CORE_PORT_RAW_PACKET; @@ -2268,7 +2145,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) int i, j; int err; struct mlx4_ib_iboe *iboe; - int dev_idx; pr_info_once("%s", mlx4_ib_version); @@ -2281,7 +2157,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { - dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); return NULL; } @@ -2303,11 +2180,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; - dev_idx = mlx4_ib_dev_idx(dev); - if (dev_idx >= 0) - sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); - else - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; @@ -2315,7 +2188,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; - ibdev->ib_dev.dma_device = &dev->pdev->dev; + ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; @@ -2457,18 +2330,20 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) IB_LINK_LAYER_ETHERNET) { if (mlx4_is_slave(dev)) { ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev, - i + 1, &ibdev->counters[i].counter_index); + if (ibdev->counters[i].status) + ibdev->counters[i].counter_index = mlx4_get_default_counter_index(dev, + i + 1); } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1; ibdev->counters[i].status = 0; } - dev_info(&dev->pdev->dev, + dev_info(&dev->persist->pdev->dev, "%s: allocated counter index %d for port %d\n", __func__, ibdev->counters[i].counter_index, i+1); } else { - ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX; + ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX(dev); ibdev->counters[i].status = -ENOSPC; } } @@ -2489,7 +2364,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { - dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "bit map alloc failed\n"); goto err_steer_qp_release; } @@ -2591,7 +2467,6 @@ static void 
*mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, - i, ibdev->counters[i - 1].counter_index); } } @@ -2678,7 +2553,6 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p, j; - int dev_idx, ret; if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) @@ -2695,19 +2569,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) mlx4_class_attributes[j]); } - - dev_idx = -1; - if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) { - ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx); - if (ret != 1) - dev_idx = -1; - } ib_unregister_device(&ibdev->ib_dev); - if (dev_idx >= 0) { - spin_lock(&dev_num_str_lock); - bitmap_release_region(dev_num_str_bitmap, dev_idx, 0); - spin_unlock(&dev_num_str_lock); - } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, @@ -2726,7 +2588,6 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, - p + 1, ibdev->counters[p].counter_index); } } @@ -2883,8 +2744,6 @@ static int __init mlx4_ib_init(void) if (err) goto clean_proc; - init_dev_assign(); - err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; @@ -2904,8 +2763,6 @@ static void __exit mlx4_ib_cleanup(void) mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); - - kfree(dev_num_str_bitmap); } module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_mr.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_mr.c index f531a03a3681..9a2889d01ab5 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_mr.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_mr.c @@ -315,7 +315,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int *num_of_mtts) { - u64 block_shift = MLX4_MAX_MTT_SHIFT; + u64 block_shift = 31; u64 current_block_len = 0; u64 current_block_start = 0; u64 misalignment_bits; @@ -763,7 +763,7 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device if (!mfrpl->ibfrpl.page_list) goto err_free; - mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev, + mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->persist->pdev->dev, size, &mfrpl->map, GFP_KERNEL); if (!mfrpl->mapped_page_list) @@ -785,7 +785,7 @@ void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); int size = page_list->max_page_list_len * sizeof (u64); - dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list, + dma_free_coherent(&dev->dev->persist->pdev->dev, size, mfrpl->mapped_page_list, mfrpl->map); kfree(mfrpl->ibfrpl.page_list); kfree(mfrpl); diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c index a1887a9c263e..625f4b791ae8 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_qp.c @@ -641,7 +641,7 @@ static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp, err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base); else err = mlx4_qp_reserve_range(dev->dev, tss_align_num, - tss_align_num, &tss_base, MLX4_RESERVE_BF_QP); + tss_align_num, &tss_base, MLX4_RESERVE_ETH_BF_QP); if (err) goto err1; @@ -801,7 +801,7 @@ static int 
alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, * VLAN insertion. */ if (attr->qp_type == IB_QPT_RAW_PACKET) { err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, - MLX4_RESERVE_BF_QP); + MLX4_RESERVE_ETH_BF_QP); } else { if(qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, qpn); @@ -1016,7 +1016,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; if (mlx4_ib_qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0); + err = mlx4_db_alloc(dev->dev, &qp->db, 0, GFP_KERNEL); if (err) goto err; @@ -1033,7 +1033,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } else qp->bf.uar = &dev->priv_uar; - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -1043,7 +1043,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, GFP_KERNEL); if (err) goto err_mtt; @@ -1070,7 +1070,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_proxy; } - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, GFP_KERNEL); if (err) goto err_qpn; @@ -1408,12 +1408,10 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } - if ((mlx4_qp_flags & + if (mlx4_qp_flags & (MLX4_IB_QP_CAP_CROSS_CHANNEL | MLX4_IB_QP_CAP_MANAGED_SEND | - MLX4_IB_QP_CAP_MANAGED_RECV)) && - !(to_mdev(device)->dev->caps.flags & - MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)) { + MLX4_IB_QP_CAP_MANAGED_RECV)) { pr_debug("%s Does not support cross-channel operations\n", to_mdev(device)->ib_dev.name); return ERR_PTR(-EINVAL); @@ -1956,27 +1954,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } - if (attr_mask & IB_M_EXT_CLASS_1) - context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER); - - /* for now we enable also sqe on send */ - if (attr_mask & IB_M_EXT_CLASS_2) { - context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_SQ); - context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER); - } - - if (attr_mask & IB_M_EXT_CLASS_3) - context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ); - - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - context->params2 |= (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL ? - cpu_to_be32(MLX4_QP_BIT_COLL_MASTER) : 0); - context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_SEND ? - cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_SQ) : 0); - context->params2 |= (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV ? 
- cpu_to_be32(MLX4_QP_BIT_COLL_MASTER | MLX4_QP_BIT_COLL_SYNC_RQ) : 0); - } - if (ibqp->srq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); @@ -2067,7 +2044,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, sqd_event = 0; if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->rlkey |= (1 << 4); + context->rlkey_roce_mode |= (1 << 4); if ((attr_mask & IB_QP_GROUP_RSS) && (qp->qpg_data->rss_child_count > 1)) { @@ -2153,29 +2130,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (is_sqp(dev, qp)) store_sqp_attrs(to_msqp(qp), attr, attr_mask); - /* Set 'ignore_cq_overrun' bits for collectives offload */ - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - if (attr_mask & (IB_M_EXT_CLASS_2 | IB_M_EXT_CLASS_3)) { - err = mlx4_ib_ignore_overrun_cq(ibqp->send_cq); - if (err) { - pr_err("Failed to set ignore CQ " - "overrun for QP 0x%x's send CQ\n", - ibqp->qp_num); - goto out; - } - - if (ibqp->recv_cq != ibqp->send_cq) { - err = mlx4_ib_ignore_overrun_cq(ibqp->recv_cq); - if (err) { - pr_err("Failed to set ignore " - "CQ overrun for QP 0x%x's recv " - "CQ\n", ibqp->qp_num); - goto out; - } - } - } - } - /* * If we moved QP0 to RTR, bring the IB link up; if we moved * QP0 to RESET or ERROR, bring the link back down. @@ -2333,8 +2287,7 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ll = rdma_port_get_link_layer(&dev->ib_dev, port); } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask & ~IB_M_QP_MOD_VEND_MASK, ll)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, ll)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", @@ -2343,12 +2296,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - if ((attr_mask & IB_M_QP_MOD_VEND_MASK) && !dev->dev->caps.sync_qp) { - pr_err("extended verbs are not supported by %s\n", - dev->ib_dev.name); - goto out; - } - if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->num_ports)) { pr_debug("qpn 0x%x: invalid port number (%d) specified " diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c index 971f91933a02..f672c0cc10bb 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_srq.c @@ -134,13 +134,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0); + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { err = -ENOMEM; goto err_db; } @@ -165,7 +166,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); if (err) goto err_mtt; diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c index 2a7d4f03f4dc..eac5d1c1717c 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_sysfs.c @@ -375,7 +375,7 @@ static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) char base_name[9]; /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ - strlcpy(name, pci_name(dev->dev->pdev), max); + strlcpy(name, pci_name(dev->dev->persist->pdev), max); strncpy(base_name, name, 
8); /*till xxxx:yy:*/ base_name[8] = '\0'; /* with no ARI only 3 last bits are used so when the fn is higher than 8 @@ -688,7 +688,7 @@ static int register_pkey_tree(struct mlx4_ib_dev *device) if (!mlx4_is_master(device->dev)) return 0; - for (i = 0; i <= device->dev->num_vfs; ++i) + for (i = 0; i <= device->dev->persist->num_vfs; ++i) register_one_pkey_tree(device, i); return 0; @@ -703,7 +703,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev *device) if (!mlx4_is_master(device->dev)) return; - for (slave = device->dev->num_vfs; slave >= 0; --slave) { + for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) { list_for_each_entry_safe(p, t, &device->pkeys.pkey_port_list[slave], entry) { diff --git a/sys/dev/mlx4/qp.h b/sys/dev/mlx4/qp.h index b2479ae87390..fff16de040da 100644 --- a/sys/dev/mlx4/qp.h +++ b/sys/dev/mlx4/qp.h @@ -38,22 +38,12 @@ #include #define MLX4_INVALID_LKEY 0x100 - #define DS_SIZE_ALIGNMENT 16 #define SET_BYTE_COUNT(byte_count) cpu_to_be32(byte_count) #define SET_LSO_MSS(mss_hdr_size) cpu_to_be32(mss_hdr_size) #define DS_BYTE_COUNT_MASK cpu_to_be32(0x7fffffff) -enum ib_m_qp_attr_mask { - IB_M_EXT_CLASS_1 = 1 << 28, - IB_M_EXT_CLASS_2 = 1 << 29, - IB_M_EXT_CLASS_3 = 1 << 30, - - IB_M_QP_MOD_VEND_MASK = (IB_M_EXT_CLASS_1 | IB_M_EXT_CLASS_2 | - IB_M_EXT_CLASS_3) -}; - enum mlx4_qp_optpar { MLX4_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, MLX4_QP_OPTPAR_RRE = 1 << 1, @@ -70,7 +60,8 @@ enum mlx4_qp_optpar { MLX4_QP_OPTPAR_RNR_RETRY = 1 << 13, MLX4_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16, - MLX4_QP_OPTPAR_COUNTER_INDEX = 1 << 20 + MLX4_QP_OPTPAR_COUNTER_INDEX = 1 << 20, + MLX4_QP_OPTPAR_VLAN_STRIPPING = 1 << 21, }; enum mlx4_qp_state { @@ -109,10 +100,8 @@ enum { MLX4_QP_BIT_RRE = 1 << 15, MLX4_QP_BIT_RWE = 1 << 14, MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_FPP = 1 << 3, MLX4_QP_BIT_RIC = 1 << 4, - MLX4_QP_BIT_COLL_SYNC_RQ = 1 << 2, - MLX4_QP_BIT_COLL_SYNC_SQ = 1 << 1, - MLX4_QP_BIT_COLL_MASTER = 1 << 0 }; enum { @@ -126,25 +115,34 @@ enum { MLX4_RSS_TCP_IPV4 = 1 << 4, MLX4_RSS_IPV4 = 1 << 5, + MLX4_RSS_BY_OUTER_HEADERS = 0 << 6, + MLX4_RSS_BY_INNER_HEADERS = 2 << 6, + MLX4_RSS_BY_INNER_HEADERS_IPONLY = 3 << 6, + /* offset of mlx4_rss_context within mlx4_qp_context.pri_path */ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH = 0x24, /* offset of being RSS indirection QP within mlx4_qp_context.flags */ MLX4_RSS_QPC_FLAG_OFFSET = 13, }; +#define MLX4_EN_RSS_KEY_SIZE 40 + struct mlx4_rss_context { __be32 base_qpn; __be32 default_qpn; u16 reserved; u8 hash_fn; u8 flags; - __be32 rss_key[10]; + __be32 rss_key[MLX4_EN_RSS_KEY_SIZE / sizeof(__be32)]; __be32 base_qpn_udp; }; struct mlx4_qp_path { u8 fl; - u8 vlan_control; + union { + u8 vlan_control; + u8 control; + }; u8 disable_pkey_check; u8 pkey_index; u8 counter_index; @@ -161,31 +159,38 @@ struct mlx4_qp_path { u8 feup; u8 fvl_rx; u8 reserved4[2]; - u8 dmac[6]; + u8 dmac[ETH_ALEN]; }; enum { /* fl */ MLX4_FL_CV = 1 << 6, + MLX4_FL_SV = 1 << 5, MLX4_FL_ETH_HIDE_CQE_VLAN = 1 << 2, MLX4_FL_ETH_SRC_CHECK_MC_LB = 1 << 1, MLX4_FL_ETH_SRC_CHECK_UC_LB = 1 << 0, }; + +enum { /* control */ + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER = 1 << 7, +}; + enum { /* vlan_control */ - MLX4_VLAN_CTRL_ETH_SRC_CHECK_IF_COUNTER = 1 << 7, MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED = 1 << 6, + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED = 1 << 5, /* 802.1p priority tag */ + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED = 1 << 4, MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED = 1 << 2, - MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED = 1 << 1,/* 802.1p priorty tag*/ + 
MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED = 1 << 1, /* 802.1p priority tag */ MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED = 1 << 0 }; enum { /* feup */ - MLX4_FEUP_FORCE_ETH_UP = 1 << 6, /* force Eth UP */ - MLX4_FSM_FORCE_ETH_SRC_MAC = 1 << 5, /* force Source MAC */ - MLX4_FVL_FORCE_ETH_VLAN = 1 << 3 /* force Eth vlan */ + MLX4_FEUP_FORCE_ETH_UP = 1 << 6, /* force Eth UP */ + MLX4_FSM_FORCE_ETH_SRC_MAC = 1 << 5, /* force Source MAC */ + MLX4_FVL_FORCE_ETH_VLAN = 1 << 3 /* force Eth vlan */ }; enum { /* fvl_rx */ - MLX4_FVL_RX_FORCE_ETH_VLAN = 1 << 0 /* enforce Eth rx vlan */ + MLX4_FVL_RX_FORCE_ETH_VLAN = 1 << 0 /* enforce Eth rx vlan */ }; struct mlx4_qp_context { @@ -194,7 +199,7 @@ struct mlx4_qp_context { u8 mtu_msgmax; u8 rq_size_stride; u8 sq_size_stride; - u8 rlkey; + u8 rlkey_roce_mode; __be32 usr_page; __be32 local_qpn; __be32 remote_qpn; @@ -204,7 +209,8 @@ struct mlx4_qp_context { u32 reserved1; __be32 next_send_psn; __be32 cqn_send; - u32 reserved2[2]; + __be16 roce_entropy; + __be16 reserved2[3]; __be32 last_acked_psn; __be32 ssn; __be32 params2; @@ -217,14 +223,17 @@ struct mlx4_qp_context { __be32 msn; __be16 rq_wqe_counter; __be16 sq_wqe_counter; - u32 reserved3[2]; + u32 reserved3; + __be16 rate_limit_params; + u8 reserved4; + u8 qos_vport; __be32 param3; __be32 nummmcpeers_basemkey; u8 log_page_size; - u8 reserved4[2]; + u8 reserved5[2]; u8 mtt_base_addr_h; __be32 mtt_base_addr_l; - u32 reserved5[10]; + u32 reserved6[10]; }; struct mlx4_update_qp_context { @@ -239,6 +248,8 @@ struct mlx4_update_qp_context { enum { MLX4_UPD_QP_MASK_PM_STATE = 32, MLX4_UPD_QP_MASK_VSD = 33, + MLX4_UPD_QP_MASK_QOS_VPP = 34, + MLX4_UPD_QP_MASK_RATE_LIMIT = 35, }; enum { @@ -261,13 +272,13 @@ enum { MLX4_UPD_QP_PATH_MASK_FVL_RX = 16 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_UC_LB = 18 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB = 19 + 32, + MLX4_UPD_QP_PATH_MASK_SV = 22 + 32, }; enum { /* param3 */ - MLX4_STRIP_VLAN = 1 << 30 + MLX4_STRIP_VLAN = 1 << 30 }; - /* Which firmware version adds support for NEC (NoErrorCompletion) bit */ #define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232) @@ -275,21 +286,29 @@ enum { MLX4_WQE_CTRL_OWN = 1 << 31, MLX4_WQE_CTRL_NEC = 1 << 29, MLX4_WQE_CTRL_RR = 1 << 6, + MLX4_WQE_CTRL_IIP = 1 << 28, + MLX4_WQE_CTRL_ILP = 1 << 27, MLX4_WQE_CTRL_FENCE = 1 << 6, MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, MLX4_WQE_CTRL_SOLICITED = 1 << 1, MLX4_WQE_CTRL_IP_CSUM = 1 << 4, MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, - MLX4_WQE_CTRL_INS_VLAN = 1 << 6, + MLX4_WQE_CTRL_INS_CVLAN = 1 << 6, + MLX4_WQE_CTRL_INS_SVLAN = 1 << 7, MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, }; struct mlx4_wqe_ctrl_seg { __be32 owner_opcode; - __be16 vlan_tag; - u8 ins_vlan; - u8 fence_size; + union { + struct { + __be16 vlan_tag; + u8 ins_vlan; + u8 fence_size; + }; + __be32 bf_qpn; + }; /* * High 24 bits are SRC remote buffer; low 8 bits are flags: * [7] SO (strong ordering) @@ -342,7 +361,7 @@ struct mlx4_wqe_datagram_seg { __be32 dqpn; __be32 qkey; __be16 vlan; - u8 mac[6]; + u8 mac[ETH_ALEN]; }; struct mlx4_wqe_lso_seg { @@ -351,8 +370,8 @@ struct mlx4_wqe_lso_seg { }; enum mlx4_wqe_bind_seg_flags2 { - MLX4_WQE_BIND_TYPE_2 = (1<<31), - MLX4_WQE_BIND_ZERO_BASED = (1<<30), + MLX4_WQE_BIND_ZERO_BASED = (1 << 30), + MLX4_WQE_BIND_TYPE_2 = (1 << 31), }; struct mlx4_wqe_bind_seg { @@ -433,6 +452,31 @@ struct mlx4_wqe_inline_seg { __be32 byte_count; }; +enum mlx4_update_qp_attr { + MLX4_UPDATE_QP_SMAC = 1 << 0, + MLX4_UPDATE_QP_VSD = 1 << 1, + MLX4_UPDATE_QP_RATE_LIMIT = 1 << 2, + 
MLX4_UPDATE_QP_QOS_VPORT = 1 << 3, + MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB = 1 << 4, + MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 5) - 1 +}; + +enum mlx4_update_qp_params_flags { + MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB = 1 << 0, + MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE = 1 << 1, +}; + +struct mlx4_update_qp_params { + u8 smac_index; + u8 qos_vport; + u32 flags; + u16 rate_unit; + u16 rate_val; +}; + +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params); int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, @@ -452,4 +496,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); +static inline u16 folded_qp(u32 q) +{ + u16 res; + + res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00); + return res; +} + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn); + #endif /* MLX4_QP_H */ diff --git a/sys/dev/mlx4/stats.h b/sys/dev/mlx4/stats.h index 3b86ea18cbd0..fd03c39c11f3 100644 --- a/sys/dev/mlx4/stats.h +++ b/sys/dev/mlx4/stats.h @@ -33,100 +33,75 @@ #ifndef _MLX4_STATS_ #define _MLX4_STATS_ - -#ifdef MLX4_EN_PERF_STAT -#define NUM_PERF_STATS NUM_PERF_COUNTERS -#else -#define NUM_PERF_STATS 0 -#endif - #define NUM_PRIORITIES 9 #define NUM_PRIORITY_STATS 2 struct mlx4_en_pkt_stats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long rx_multicast_packets; - unsigned long rx_broadcast_packets; - unsigned long rx_errors; - unsigned long rx_dropped; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_jabbers; - unsigned long rx_in_range_length_error; - unsigned long rx_out_range_length_error; - unsigned long rx_lt_64_bytes_packets; - unsigned long rx_127_bytes_packets; - unsigned long rx_255_bytes_packets; - unsigned long rx_511_bytes_packets; - unsigned long rx_1023_bytes_packets; - unsigned long rx_1518_bytes_packets; - unsigned long rx_1522_bytes_packets; - unsigned long rx_1548_bytes_packets; - unsigned long rx_gt_1548_bytes_packets; - unsigned long tx_packets; - unsigned long tx_bytes; - unsigned long tx_multicast_packets; - unsigned long tx_broadcast_packets; - unsigned long tx_errors; - unsigned long tx_dropped; - unsigned long tx_lt_64_bytes_packets; - unsigned long tx_127_bytes_packets; - unsigned long tx_255_bytes_packets; - unsigned long tx_511_bytes_packets; - unsigned long tx_1023_bytes_packets; - unsigned long tx_1518_bytes_packets; - unsigned long tx_1522_bytes_packets; - unsigned long tx_1548_bytes_packets; - unsigned long tx_gt_1548_bytes_packets; - unsigned long rx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS]; - unsigned long tx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS]; + u64 rx_packets; + u64 rx_bytes; + u64 rx_multicast_packets; + u64 rx_broadcast_packets; + u64 rx_errors; + u64 rx_dropped; + u64 rx_length_errors; + u64 rx_over_errors; + u64 rx_crc_errors; + u64 rx_jabbers; + u64 rx_in_range_length_error; + u64 rx_out_range_length_error; + u64 rx_lt_64_bytes_packets; + u64 rx_127_bytes_packets; + u64 rx_255_bytes_packets; + u64 rx_511_bytes_packets; + u64 rx_1023_bytes_packets; + u64 rx_1518_bytes_packets; + u64 rx_1522_bytes_packets; + u64 rx_1548_bytes_packets; + u64 rx_gt_1548_bytes_packets; + u64 tx_packets; + u64 tx_bytes; + u64 tx_multicast_packets; + u64 tx_broadcast_packets; + u64 tx_errors; + u64 
diff --git a/sys/dev/mlx4/stats.h b/sys/dev/mlx4/stats.h
index 3b86ea18cbd0..fd03c39c11f3 100644
--- a/sys/dev/mlx4/stats.h
+++ b/sys/dev/mlx4/stats.h
@@ -33,100 +33,75 @@
 #ifndef _MLX4_STATS_
 #define _MLX4_STATS_
 
-
-#ifdef MLX4_EN_PERF_STAT
-#define NUM_PERF_STATS		NUM_PERF_COUNTERS
-#else
-#define NUM_PERF_STATS		0
-#endif
-
 #define NUM_PRIORITIES	9
 #define NUM_PRIORITY_STATS 2
 
 struct mlx4_en_pkt_stats {
-	unsigned long	rx_packets;
-	unsigned long	rx_bytes;
-	unsigned long	rx_multicast_packets;
-	unsigned long	rx_broadcast_packets;
-	unsigned long	rx_errors;
-	unsigned long	rx_dropped;
-	unsigned long	rx_length_errors;
-	unsigned long	rx_over_errors;
-	unsigned long	rx_crc_errors;
-	unsigned long	rx_jabbers;
-	unsigned long	rx_in_range_length_error;
-	unsigned long	rx_out_range_length_error;
-	unsigned long	rx_lt_64_bytes_packets;
-	unsigned long	rx_127_bytes_packets;
-	unsigned long	rx_255_bytes_packets;
-	unsigned long	rx_511_bytes_packets;
-	unsigned long	rx_1023_bytes_packets;
-	unsigned long	rx_1518_bytes_packets;
-	unsigned long	rx_1522_bytes_packets;
-	unsigned long	rx_1548_bytes_packets;
-	unsigned long	rx_gt_1548_bytes_packets;
-	unsigned long	tx_packets;
-	unsigned long	tx_bytes;
-	unsigned long	tx_multicast_packets;
-	unsigned long	tx_broadcast_packets;
-	unsigned long	tx_errors;
-	unsigned long	tx_dropped;
-	unsigned long	tx_lt_64_bytes_packets;
-	unsigned long	tx_127_bytes_packets;
-	unsigned long	tx_255_bytes_packets;
-	unsigned long	tx_511_bytes_packets;
-	unsigned long	tx_1023_bytes_packets;
-	unsigned long	tx_1518_bytes_packets;
-	unsigned long	tx_1522_bytes_packets;
-	unsigned long	tx_1548_bytes_packets;
-	unsigned long	tx_gt_1548_bytes_packets;
-	unsigned long	rx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
-	unsigned long	tx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
+	u64	rx_packets;
+	u64	rx_bytes;
+	u64	rx_multicast_packets;
+	u64	rx_broadcast_packets;
+	u64	rx_errors;
+	u64	rx_dropped;
+	u64	rx_length_errors;
+	u64	rx_over_errors;
+	u64	rx_crc_errors;
+	u64	rx_jabbers;
+	u64	rx_in_range_length_error;
+	u64	rx_out_range_length_error;
+	u64	rx_lt_64_bytes_packets;
+	u64	rx_127_bytes_packets;
+	u64	rx_255_bytes_packets;
+	u64	rx_511_bytes_packets;
+	u64	rx_1023_bytes_packets;
+	u64	rx_1518_bytes_packets;
+	u64	rx_1522_bytes_packets;
+	u64	rx_1548_bytes_packets;
+	u64	rx_gt_1548_bytes_packets;
+	u64	tx_packets;
+	u64	tx_bytes;
+	u64	tx_multicast_packets;
+	u64	tx_broadcast_packets;
+	u64	tx_errors;
+	u64	tx_dropped;
+	u64	tx_lt_64_bytes_packets;
+	u64	tx_127_bytes_packets;
+	u64	tx_255_bytes_packets;
+	u64	tx_511_bytes_packets;
+	u64	tx_1023_bytes_packets;
+	u64	tx_1518_bytes_packets;
+	u64	tx_1522_bytes_packets;
+	u64	tx_1548_bytes_packets;
+	u64	tx_gt_1548_bytes_packets;
+	u64	rx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
+	u64	tx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
 };
 
 struct mlx4_en_vf_stats {
-	unsigned long	rx_packets;
-	unsigned long	rx_bytes;
-	unsigned long	rx_multicast_packets;
-	unsigned long	rx_broadcast_packets;
-	unsigned long	rx_errors;
-	unsigned long	rx_dropped;
-	unsigned long	tx_packets;
-	unsigned long	tx_bytes;
-	unsigned long	tx_multicast_packets;
-	unsigned long	tx_broadcast_packets;
-	unsigned long	tx_errors;
+	u64	rx_frames;
+	u64	rx_bytes;
+	u64	tx_frames;
+	u64	tx_bytes;
 };
 
 struct mlx4_en_vport_stats {
-	unsigned long	rx_unicast_packets;
-	unsigned long	rx_unicast_bytes;
-	unsigned long	rx_multicast_packets;
-	unsigned long	rx_multicast_bytes;
-	unsigned long	rx_broadcast_packets;
-	unsigned long	rx_broadcast_bytes;
-	unsigned long	rx_dropped;
-	unsigned long	rx_errors;
-	unsigned long	tx_unicast_packets;
-	unsigned long	tx_unicast_bytes;
-	unsigned long	tx_multicast_packets;
-	unsigned long	tx_multicast_bytes;
-	unsigned long	tx_broadcast_packets;
-	unsigned long	tx_broadcast_bytes;
-	unsigned long	tx_errors;
+	u64	rx_frames;
+	u64	rx_bytes;
+	u64	tx_frames;
+	u64	tx_bytes;
 };
 
 struct mlx4_en_port_stats {
-	unsigned long	tso_packets;
-	unsigned long	queue_stopped;
-	unsigned long	wake_queue;
-	unsigned long	tx_timeout;
-	unsigned long	oversized_packets;
-	unsigned long	rx_alloc_failed;
-	unsigned long	rx_chksum_good;
-	unsigned long	rx_chksum_none;
-	unsigned long	tx_chksum_offload;
-	unsigned long	defrag_attempts;
+	u64	tso_packets;
+	u64	queue_stopped;
+	u64	wake_queue;
+	u64	tx_timeout;
+	u64	oversized_packets;
+	u64	rx_alloc_failed;
+	u64	rx_chksum_good;
+	u64	rx_chksum_none;
+	u64	tx_chksum_offload;
+	u64	defrag_attempts;
 };
 
 struct mlx4_en_perf_stats {
@@ -138,16 +113,19 @@ struct mlx4_en_perf_stats {
 	u32 napi_quota;
 };
 
-struct mlx4_en_flow_stats {
+#define MLX4_NUM_PRIORITIES	8
+
+struct mlx4_en_flow_stats_rx {
 	u64 rx_pause;
 	u64 rx_pause_duration;
 	u64 rx_pause_transition;
+};
+
+struct mlx4_en_flow_stats_tx {
 	u64 tx_pause;
 	u64 tx_pause_duration;
 	u64 tx_pause_transition;
 };
 
-#define MLX4_NUM_PRIORITIES	8
-
 struct mlx4_en_stat_out_flow_control_mbox {
 	/* Total number of PAUSE frames received from the far-end port */
@@ -170,8 +148,12 @@ struct mlx4_en_stat_out_flow_control_mbox {
 	__be64 reserved[2];
 };
 
+enum {
+	MLX4_DUMP_ETH_STATS_FLOW_CONTROL = 1 << 12
+};
+
 int mlx4_get_vport_ethtool_stats(struct mlx4_dev *dev, int port,
-    struct mlx4_en_vport_stats *vport_stats,
-    int reset);
+				 struct mlx4_en_vport_stats *vport_stats,
+				 int reset, int *read_counters);
 
 #endif
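With the counters above widened to u64, mlx4_get_vport_ethtool_stats() also grows a read_counters output argument so the caller can tell how many counters the firmware actually returned. A minimal calling sketch; the function name fetch_vport_stats(), the printf reporting, and the assumption that 0 means success are illustrative only:

/* Sketch only, assuming dev/mlx4/stats.h and device.h are included. */
static void
fetch_vport_stats(struct mlx4_dev *dev, int port)
{
	struct mlx4_en_vport_stats stats;
	int read = 0;

	memset(&stats, 0, sizeof(stats));
	/* reset == 0: read the counters without clearing them. */
	if (mlx4_get_vport_ethtool_stats(dev, port, &stats, 0, &read) == 0)
		printf("vport %d: %ju rx / %ju tx bytes (%d counters)\n",
		    port, (uintmax_t)stats.rx_bytes,
		    (uintmax_t)stats.tx_bytes, read);
}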
diff --git a/sys/modules/mlx4/Makefile b/sys/modules/mlx4/Makefile
index 44ec725ee9ce..d71f79bde0e2 100644
--- a/sys/modules/mlx4/Makefile
+++ b/sys/modules/mlx4/Makefile
@@ -10,6 +10,7 @@ SRCS=	device_if.h bus_if.h vnode_if.h pci_if.h \
 	mlx4_cq.c \
 	mlx4_eq.c \
 	mlx4_fw.c \
+	mlx4_fw_qos.c \
 	mlx4_icm.c \
 	mlx4_intf.c \
 	mlx4_main.c \
@@ -22,8 +23,7 @@ SRCS=	device_if.h bus_if.h vnode_if.h pci_if.h \
 	mlx4_reset.c \
 	mlx4_sense.c \
 	mlx4_srq.c \
-	mlx4_resource_tracker.c \
-	mlx4_sys_tune.c
+	mlx4_resource_tracker.c
 
 CFLAGS+= -I${SRCTOP}/sys/ofed/include
 CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
@@ -31,5 +31,3 @@ CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
 .include <bsd.kmod.mk>
 
 CFLAGS+= -Wno-cast-qual -Wno-pointer-arith
-
-CWARNFLAGS.mlx4_mcg.c= -Wno-unused