Flexible and asymmetric allocation of EQs and MSI-X vectors for PF/VFs.

Previously, the mlx4 driver queried the firmware in order to get the
number of supported EQs. Under SRIOV, since this was done before the
driver notified the firmware how many VFs it actually needs, the
firmware had to take into account a worst case scenario and always
allocated four EQs per VF, where one was used for events while the
others were used for completions. Now, when the firmware supports the
asymmetric allocation scheme, denoted by exposing num_sys_eqs > 0 (-->
MLX4_DEV_CAP_FLAG2_SYS_EQS), we use the QUERY_FUNC command to query
the firmware before enabling SRIOV. Thus we can get more EQs and MSI-X
vectors per function. Moreover, when running in the new
firmware/driver mode, the limitation that the number of EQs should be
a power of two is lifted.

Obtained from:		Linux (dual BSD/GPLv2 licensed)
Submitted by:		Dexuan Cui @ microsoft . com
Differential Revision:	https://reviews.freebsd.org/D8867
MFC after:		2 weeks
Sponsored by:		Mellanox Technologies
This commit is contained in:
hselasky 2017-02-10 15:22:21 +00:00
parent 348dce9666
commit 3f842f881d
6 changed files with 167 additions and 24 deletions

View File

@ -199,6 +199,7 @@ enum {
MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 23,
MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1LL << 24,
MLX4_DEV_CAP_FLAG2_RX_CSUM_MODE = 1LL << 25,
MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 26,
};
/* bit enums for an 8-bit flags field indicating special use
@ -473,6 +474,7 @@ struct mlx4_caps {
int num_cqs;
int max_cqes;
int reserved_cqs;
int num_sys_eqs;
int num_eqs;
int reserved_eqs;
int num_comp_vectors;

View File

@ -56,6 +56,7 @@ struct mlx4_dev_cap {
int max_mpts;
int reserved_eqs;
int max_eqs;
int num_sys_eqs;
int reserved_mtts;
int max_mrw_sz;
int reserved_mrws;
@ -146,6 +147,16 @@ struct mlx4_func_cap {
u8 def_counter_index;
};
struct mlx4_func {
int bus;
int device;
int function;
int physical_function;
int rsvd_eqs;
int max_eq;
int rsvd_uars;
};
struct mlx4_adapter {
u16 vsd_vendor_id;
char board_id[MLX4_BOARD_ID_LEN];
@ -173,6 +184,7 @@ struct mlx4_init_hca_param {
u8 log_num_srqs;
u8 log_num_cqs;
u8 log_num_eqs;
u16 num_sys_eqs;
u8 log_rd_per_qp;
u8 log_mc_table_sz;
u8 log_mpt_sz;
@ -225,6 +237,7 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
int mlx4_NOP(struct mlx4_dev *dev);
int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave);
void mlx4_opreq_action(struct work_struct *work);
#endif /* MLX4_FW_H */

View File

@ -1136,8 +1136,12 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
goto err_out_free;
}
err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
err = mlx4_bitmap_init(&priv->eq_table.bitmap,
roundup_pow_of_two(dev->caps.num_eqs),
dev->caps.num_eqs - 1,
dev->caps.reserved_eqs,
roundup_pow_of_two(dev->caps.num_eqs) -
dev->caps.num_eqs);
if (err)
goto err_out_free;

View File

@ -179,6 +179,60 @@ int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
return err;
}
int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave)
{
struct mlx4_cmd_mailbox *mailbox;
u32 *outbox;
u8 in_modifier;
u8 field;
u16 field16;
int err;
#define QUERY_FUNC_BUS_OFFSET 0x00
#define QUERY_FUNC_DEVICE_OFFSET 0x01
#define QUERY_FUNC_FUNCTION_OFFSET 0x01
#define QUERY_FUNC_PHYSICAL_FUNCTION_OFFSET 0x03
#define QUERY_FUNC_RSVD_EQS_OFFSET 0x04
#define QUERY_FUNC_MAX_EQ_OFFSET 0x06
#define QUERY_FUNC_RSVD_UARS_OFFSET 0x0b
mailbox = mlx4_alloc_cmd_mailbox(dev);
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
outbox = mailbox->buf;
in_modifier = slave;
err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, 0,
MLX4_CMD_QUERY_FUNC,
MLX4_CMD_TIME_CLASS_A,
MLX4_CMD_NATIVE);
if (err)
goto out;
MLX4_GET(field, outbox, QUERY_FUNC_BUS_OFFSET);
func->bus = field & 0xf;
MLX4_GET(field, outbox, QUERY_FUNC_DEVICE_OFFSET);
func->device = field & 0xf1;
MLX4_GET(field, outbox, QUERY_FUNC_FUNCTION_OFFSET);
func->function = field & 0x7;
MLX4_GET(field, outbox, QUERY_FUNC_PHYSICAL_FUNCTION_OFFSET);
func->physical_function = field & 0xf;
MLX4_GET(field16, outbox, QUERY_FUNC_RSVD_EQS_OFFSET);
func->rsvd_eqs = field16 & 0xffff;
MLX4_GET(field16, outbox, QUERY_FUNC_MAX_EQ_OFFSET);
func->max_eq = field16 & 0xffff;
MLX4_GET(field, outbox, QUERY_FUNC_RSVD_UARS_OFFSET);
func->rsvd_uars = field & 0x0f;
mlx4_dbg(dev, "Bus: %d, Device: %d, Function: %d, Physical function: %d, Max EQs: %d, Reserved EQs: %d, Reserved UARs: %d\n",
func->bus, func->device, func->function, func->physical_function,
func->max_eq, func->rsvd_eqs, func->rsvd_uars);
out:
mlx4_free_cmd_mailbox(dev, mailbox);
return err;
}
int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
struct mlx4_vhcr *vhcr,
struct mlx4_cmd_mailbox *inbox,
@ -189,6 +243,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
u8 field, port;
u32 size;
int err = 0;
struct mlx4_func func;
#define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0
#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET 0x1
@ -231,6 +286,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
#define QUERY_FUNC_CAP_PROPS_DEF_COUNTER 0x20
#define QUERY_FUNC_CAP_RDMA_PROPS_FORCE_PHY_WQE_GID 0x80
#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31)
if (vhcr->op_modifier == 1) {
port = vhcr->in_modifier; /* phys-port = logical-port */
@ -293,11 +349,24 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
size = dev->caps.num_cqs;
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
size = dev->caps.num_eqs;
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
size = dev->caps.reserved_eqs;
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) ||
mlx4_QUERY_FUNC(dev, &func, slave)) {
size = vhcr->in_modifier &
QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ?
dev->caps.num_eqs :
rounddown_pow_of_two(dev->caps.num_eqs);
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
size = dev->caps.reserved_eqs;
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
} else {
size = vhcr->in_modifier &
QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ?
func.max_eq :
rounddown_pow_of_two(func.max_eq);
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
size = func.rsvd_eqs;
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
}
size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave];
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
@ -327,14 +396,17 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port,
u8 field, op_modifier;
u32 size;
int err = 0, quotas = 0;
u32 in_modifier;
op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */
in_modifier = op_modifier ? gen_or_port :
QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS;
mailbox = mlx4_alloc_cmd_mailbox(dev);
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier,
err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, op_modifier,
MLX4_CMD_QUERY_FUNC_CAP,
MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
if (err)
@ -504,6 +576,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET 0x21
#define QUERY_DEV_CAP_RSVD_MRW_OFFSET 0x22
#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET 0x23
#define QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET 0x26
#define QUERY_DEV_CAP_MAX_AV_OFFSET 0x27
#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET 0x29
#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b
@ -601,6 +674,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
dev_cap->reserved_mrws = 1 << (field & 0xf);
MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
dev_cap->max_mtt_seg = 1 << (field & 0x3f);
MLX4_GET(size, outbox, QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET);
dev_cap->num_sys_eqs = size & 0xfff;
MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
@ -827,8 +902,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
* we can't use any EQs whose doorbell falls on that page,
* even if the EQ itself isn't reserved.
*/
dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
dev_cap->reserved_eqs);
if (dev_cap->num_sys_eqs == 0)
dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
dev_cap->reserved_eqs);
else
dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SYS_EQS;
mlx4_dbg(dev, "Max ICM size %lld MB\n",
(unsigned long long) dev_cap->max_icm_sz >> 20);
@ -838,8 +916,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
mlx4_dbg(dev, "Num sys EQs: %d, max EQs: %d, reserved EQs: %d, entry size: %d\n",
dev_cap->num_sys_eqs, dev_cap->max_eqs, dev_cap->reserved_eqs,
dev_cap->eqc_entry_sz);
mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
dev_cap->reserved_mrws, dev_cap->reserved_mtts);
mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
@ -1341,6 +1420,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
#define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50)
#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60)
#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67)
#define INIT_HCA_NUM_SYS_EQS_OFFSET (INIT_HCA_QPC_OFFSET + 0x6a)
#define INIT_HCA_RDMARC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70)
#define INIT_HCA_LOG_RD_OFFSET (INIT_HCA_QPC_OFFSET + 0x77)
#define INIT_HCA_MCAST_OFFSET 0x0c0
@ -1449,6 +1529,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
MLX4_PUT(inbox, param->auxc_base, INIT_HCA_AUXC_BASE_OFFSET);
MLX4_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET);
MLX4_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET);
MLX4_PUT(inbox, param->num_sys_eqs, INIT_HCA_NUM_SYS_EQS_OFFSET);
MLX4_PUT(inbox, param->rdmarc_base, INIT_HCA_RDMARC_BASE_OFFSET);
MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
@ -1555,6 +1636,7 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
MLX4_GET(param->auxc_base, outbox, INIT_HCA_AUXC_BASE_OFFSET);
MLX4_GET(param->eqc_base, outbox, INIT_HCA_EQC_BASE_OFFSET);
MLX4_GET(param->log_num_eqs, outbox, INIT_HCA_LOG_EQ_OFFSET);
MLX4_GET(param->num_sys_eqs, outbox, INIT_HCA_NUM_SYS_EQS_OFFSET);
MLX4_GET(param->rdmarc_base, outbox, INIT_HCA_RDMARC_BASE_OFFSET);
MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET);

View File

@ -590,6 +590,29 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev)
dev->caps.port_mask[i] = dev->caps.port_type[i];
}
enum {
MLX4_QUERY_FUNC_NUM_SYS_EQS = 1 << 0,
};
static int mlx4_query_func(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
{
int err = 0;
struct mlx4_func func;
if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
err = mlx4_QUERY_FUNC(dev, &func, 0);
if (err) {
mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
return err;
}
dev_cap->max_eqs = func.max_eq;
dev_cap->reserved_eqs = func.rsvd_eqs;
dev_cap->reserved_uars = func.rsvd_uars;
err |= MLX4_QUERY_FUNC_NUM_SYS_EQS;
}
return err;
}
static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
{
int err;
@ -623,7 +646,10 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
}
dev->caps.num_ports = dev_cap->num_ports;
dev->phys_caps.num_phys_eqs = MLX4_MAX_EQ_NUM;
dev->caps.num_sys_eqs = dev_cap->num_sys_eqs;
dev->phys_caps.num_phys_eqs = dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS ?
dev->caps.num_sys_eqs :
MLX4_MAX_EQ_NUM;
for (i = 1; i <= dev->caps.num_ports; ++i) {
dev->caps.vl_cap[i] = dev_cap->max_vl[i];
dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i];
@ -1465,8 +1491,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
if (err)
goto err_srq;
num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
dev->caps.num_eqs;
num_eqs = dev->phys_caps.num_phys_eqs;
err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
cmpt_base +
((u64) (MLX4_CMPT_TYPE_EQ *
@ -1528,8 +1553,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
}
num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
dev->caps.num_eqs;
num_eqs = dev->phys_caps.num_phys_eqs;
err = mlx4_init_icm_table(dev, &priv->eq_table.table,
init_hca->eqc_base, dev_cap->eqc_entry_sz,
num_eqs, num_eqs, 0, 0);
@ -2065,6 +2089,18 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
goto err_free_icm;
}
if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
err = mlx4_query_func(dev, dev_cap);
if (err < 0) {
mlx4_err(dev, "QUERY_FUNC command failed, aborting.\n");
goto err_stop_fw;
} else if (err & MLX4_QUERY_FUNC_NUM_SYS_EQS) {
dev->caps.num_eqs = dev_cap->max_eqs;
dev->caps.reserved_eqs = dev_cap->reserved_eqs;
dev->caps.reserved_uars = dev_cap->reserved_uars;
}
}
/*
* Read HCA frequency by QUERY_HCA command
*/
@ -2905,13 +2941,12 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
{
struct mlx4_priv *priv = mlx4_priv(dev);
struct msix_entry *entries;
int nreq = min_t(int, dev->caps.num_ports *
min_t(int, num_possible_cpus() + 1, MAX_MSIX_P_PORT)
+ MSIX_LEGACY_SZ, MAX_MSIX);
int err;
int i;
if (msi_x) {
int nreq = dev->caps.num_ports * num_online_cpus() + MSIX_LEGACY_SZ;
nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
nreq);

View File

@ -199,10 +199,17 @@ u64 mlx4_make_profile(struct mlx4_dev *dev,
init_hca->log_num_cqs = profile[i].log_num;
break;
case MLX4_RES_EQ:
dev->caps.num_eqs = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs,
MAX_MSIX));
init_hca->eqc_base = profile[i].start;
init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
init_hca->log_num_eqs = 0x1f;
init_hca->eqc_base = profile[i].start;
init_hca->num_sys_eqs = dev_cap->num_sys_eqs;
} else {
dev->caps.num_eqs = roundup_pow_of_two(
min_t(unsigned,
dev_cap->max_eqs, MAX_MSIX));
init_hca->eqc_base = profile[i].start;
init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
}
break;
case MLX4_RES_DMPT:
dev->caps.num_mpts = profile[i].num;