net/mlx4: drop RSS support

The Verbs RSS API used in this PMD is now obsolete. It is superseded by an
enhanced API with fewer constraints already used in the mlx5 PMD.

Drop RSS support in preparation for a major refactoring. The ability to
configure several Rx queues is retained; these queues can be targeted
directly by creating specific flow rules.

There is no need for "ignored" Rx queues anymore since their number is no
longer limited to powers of two.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
This commit is contained in:
Adrien Mazarguil 2017-09-01 10:06:31 +02:00 committed by Ferruh Yigit
parent 41f8001be6
commit 4bd2aa1198
4 changed files with 14 additions and 218 deletions

View File

@ -13,7 +13,6 @@ Queue start/stop = Y
MTU update = Y
Jumbo frame = Y
Scattered Rx = Y
RSS hash = Y
SR-IOV = Y
L3 checksum offload = Y
L4 checksum offload = Y

View File

@ -78,22 +78,12 @@ Features
--------
- Multi arch support: x86_64 and POWER8.
- RSS, also known as RCA, is supported. In this mode the number of
configured RX queues must be a power of two.
- Link state information is provided.
- Scattered packets are supported for TX and RX.
- Inner L3/L4 (IP, TCP and UDP) TX/RX checksum offloading and validation.
- Outer L3 (IP) TX/RX checksum offloading and validation for VXLAN frames.
- RX interrupts.
Limitations
-----------
- RSS hash key cannot be modified.
- RSS RETA cannot be configured.
- RSS always includes L3 (IPv4/IPv6) and L4 (UDP/TCP). They cannot be
dissociated.
Configuration
-------------
@ -145,9 +135,6 @@ Environment variables
Run-time configuration
~~~~~~~~~~~~~~~~~~~~~~
- The only constraint when RSS mode is requested is to make sure the number
of RX queues is a power of two. This is a hardware requirement.
- librte_pmd_mlx4 brings kernel network interfaces up during initialization
because it is affected by their state. Forcing them down prevents packets
reception.

View File

@ -31,11 +31,6 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Known limitations:
* - RSS hash key and options cannot be modified.
*/
/* System headers. */
#include <stddef.h>
#include <stdio.h>
@ -507,7 +502,7 @@ txq_cleanup(struct txq *txq);
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
unsigned int socket, const struct rte_eth_rxconf *conf,
struct rte_mempool *mp);
static void
@ -520,7 +515,6 @@ priv_mac_addr_del(struct priv *priv);
* Ethernet device configuration.
*
* Prepare the driver for a given number of TX and RX queues.
* Allocate parent RSS queue when several RX queues are requested.
*
* @param dev
* Pointer to Ethernet device structure.
@ -534,8 +528,6 @@ dev_configure(struct rte_eth_dev *dev)
struct priv *priv = dev->data->dev_private;
unsigned int rxqs_n = dev->data->nb_rx_queues;
unsigned int txqs_n = dev->data->nb_tx_queues;
unsigned int tmp;
int ret;
priv->rxqs = (void *)dev->data->rx_queues;
priv->txqs = (void *)dev->data->tx_queues;
@ -544,61 +536,12 @@ dev_configure(struct rte_eth_dev *dev)
(void *)dev, priv->txqs_n, txqs_n);
priv->txqs_n = txqs_n;
}
if (rxqs_n == priv->rxqs_n)
return 0;
if (!rte_is_power_of_2(rxqs_n) && !priv->isolated) {
unsigned n_active;
n_active = rte_align32pow2(rxqs_n + 1) >> 1;
WARN("%p: number of RX queues must be a power"
" of 2: %u queues among %u will be active",
(void *)dev, n_active, rxqs_n);
}
INFO("%p: RX queues number update: %u -> %u",
(void *)dev, priv->rxqs_n, rxqs_n);
/* If RSS is enabled, disable it first. */
if (priv->rss) {
unsigned int i;
/* Only if there are no remaining child RX queues. */
for (i = 0; (i != priv->rxqs_n); ++i)
if ((*priv->rxqs)[i] != NULL)
return EINVAL;
priv_mac_addr_del(priv);
rxq_cleanup(&priv->rxq_parent);
priv->rss = 0;
priv->rxqs_n = 0;
}
if (rxqs_n <= 1) {
/* Nothing else to do. */
if (rxqs_n != priv->rxqs_n) {
INFO("%p: Rx queues number update: %u -> %u",
(void *)dev, priv->rxqs_n, rxqs_n);
priv->rxqs_n = rxqs_n;
return 0;
}
/* Allocate a new RSS parent queue if supported by hardware. */
if (!priv->hw_rss) {
ERROR("%p: only a single RX queue can be configured when"
" hardware doesn't support RSS",
(void *)dev);
return EINVAL;
}
/* Fail if hardware doesn't support that many RSS queues. */
if (rxqs_n >= priv->max_rss_tbl_sz) {
ERROR("%p: only %u RX queues can be configured for RSS",
(void *)dev, priv->max_rss_tbl_sz);
return EINVAL;
}
priv->rss = 1;
tmp = priv->rxqs_n;
priv->rxqs_n = rxqs_n;
ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, 0, NULL, NULL);
if (!ret)
return 0;
/* Failure, rollback. */
priv->rss = 0;
priv->rxqs_n = tmp;
assert(ret > 0);
return ret;
return 0;
}
/**
@ -2014,8 +1957,7 @@ priv_mac_addr_del(struct priv *priv)
/**
* Register a MAC address.
*
* In RSS mode, the MAC address is registered in the parent queue,
* otherwise it is registered in queue 0.
* The MAC address is registered in queue 0.
*
* @param priv
* Pointer to private structure.
@ -2035,9 +1977,7 @@ priv_mac_addr_add(struct priv *priv)
return 0;
if (priv->isolated)
return 0;
if (priv->rss)
rxq = &priv->rxq_parent;
else if (*priv->rxqs && (*priv->rxqs)[0])
if (*priv->rxqs && (*priv->rxqs)[0])
rxq = (*priv->rxqs)[0];
else
return 0;
@ -2647,69 +2587,8 @@ rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
.res_domain = rd,
};
attr.max_inl_recv = priv->inl_recv_size;
attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
return ibv_exp_create_qp(priv->ctx, &attr);
}
/**
 * Allocate a RSS Queue Pair.
 *
 * The QP is created as a member of a QP group, either as the group parent
 * or as an Rx child attached to priv->rxq_parent.qp. The inline receive
 * size is taken from priv->inl_recv_size.
 *
 * @param priv
 *   Pointer to private structure.
 * @param cq
 *   Completion queue to associate with QP.
 * @param desc
 *   Number of descriptors in QP (hint only).
 * @param parent
 *   If nonzero, create a parent QP, otherwise a child.
 * @param rd
 *   Resource domain to associate with QP.
 *
 * @return
 *   QP pointer or NULL in case of error.
 */
static struct ibv_qp *
rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
		 int parent, struct ibv_exp_res_domain *rd)
{
	struct ibv_exp_qp_init_attr attr = {
		/* CQ to be associated with the send queue. */
		.send_cq = cq,
		/* CQ to be associated with the receive queue. */
		.recv_cq = cq,
		.cap = {
			/* Max number of outstanding WRs. */
			.max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
					priv->device_attr.max_qp_wr :
					desc),
			/* Max number of scatter/gather elements in a WR. */
			.max_recv_sge = ((priv->device_attr.max_sge <
					  MLX4_PMD_SGE_WR_N) ?
					 priv->device_attr.max_sge :
					 MLX4_PMD_SGE_WR_N),
		},
		.qp_type = IBV_QPT_RAW_PACKET,
		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
			      IBV_EXP_QP_INIT_ATTR_QPG),
		.pd = priv->pd,
		.res_domain = rd,
	};

	/*
	 * Two separate statements: the original joined them with a comma
	 * operator, which behaved the same but read as a typo.
	 */
	attr.max_inl_recv = priv->inl_recv_size;
	attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
	if (parent) {
		attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
		/* TSS isn't necessary. */
		attr.qpg.parent_attrib.tss_child_count = 0;
		/*
		 * Child count is derived from the configured Rx queue count;
		 * presumably rounded down to a power of two -- confirm
		 * against rte_align32pow2().
		 */
		attr.qpg.parent_attrib.rss_child_count =
			rte_align32pow2(priv->rxqs_n + 1) >> 1;
		DEBUG("initializing parent RSS queue");
	} else {
		attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
		attr.qpg.qpg_parent = priv->rxq_parent.qp;
		DEBUG("initializing child RSS queue");
	}
	return ibv_exp_create_qp(priv->ctx, &attr);
}
@ -2741,13 +2620,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
struct ibv_recv_wr *bad_wr;
unsigned int mb_len;
int err;
int parent = (rxq == &priv->rxq_parent);
if (parent) {
ERROR("%p: cannot rehash parent queue %p",
(void *)dev, (void *)rxq);
return EINVAL;
}
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
/* Number of descriptors and mbufs currently allocated. */
@ -2800,9 +2673,8 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
.port_num = priv->port
};
err = ibv_exp_modify_qp(tmpl.qp, &mod,
(IBV_EXP_QP_STATE |
(parent ? IBV_EXP_QP_GROUP_RSS : 0) |
IBV_EXP_QP_PORT));
IBV_EXP_QP_STATE |
IBV_EXP_QP_PORT);
if (err) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
(void *)dev, strerror(err));
@ -2899,9 +2771,6 @@ skip_rtr:
* Number of descriptors to configure in queue.
* @param socket
* NUMA socket on which memory must be allocated.
* @param inactive
* If true, the queue is disabled because its index is higher or
* equal to the real number of queues, which must be a power of 2.
* @param[in] conf
* Thresholds parameters.
* @param mp
@ -2912,7 +2781,7 @@ skip_rtr:
*/
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
unsigned int socket, const struct rte_eth_rxconf *conf,
struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
@ -2931,20 +2800,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
struct ibv_recv_wr *bad_wr;
unsigned int mb_len;
int ret = 0;
int parent = (rxq == &priv->rxq_parent);
(void)conf; /* Thresholds configuration (ignored). */
/*
* If this is a parent queue, hardware must support RSS and
* RSS must be enabled.
*/
assert((!parent) || ((priv->hw_rss) && (priv->rss)));
if (parent) {
/* Even if unused, ibv_create_cq() requires at least one
* descriptor. */
desc = 1;
goto skip_mr;
}
mb_len = rte_pktmbuf_data_room_size(mp);
if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
ERROR("%p: invalid number of RX descriptors (must be a"
@ -2982,7 +2839,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
skip_mr:
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
IBV_EXP_RES_DOMAIN_MSG_MODEL),
@ -3022,11 +2878,7 @@ skip_mr:
priv->device_attr.max_qp_wr);
DEBUG("priv->device_attr.max_sge is %d",
priv->device_attr.max_sge);
if (priv->rss && !inactive)
tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
tmpl.rd);
else
tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd);
tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd);
if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
@ -3040,17 +2892,13 @@ skip_mr:
.port_num = priv->port
};
ret = ibv_exp_modify_qp(tmpl.qp, &mod,
(IBV_EXP_QP_STATE |
(parent ? IBV_EXP_QP_GROUP_RSS : 0) |
IBV_EXP_QP_PORT));
IBV_EXP_QP_STATE |
IBV_EXP_QP_PORT);
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
(void *)dev, strerror(ret));
goto error;
}
/* Allocate descriptors for RX queues, except for the RSS parent. */
if (parent)
goto skip_alloc;
if (tmpl.sp)
ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
else
@ -3072,7 +2920,6 @@ skip_mr:
strerror(ret));
goto error;
}
skip_alloc:
mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
@ -3146,7 +2993,6 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
int inactive = 0;
int ret;
priv_lock(priv);
@ -3178,9 +3024,7 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1)
inactive = 1;
ret = rxq_setup(dev, rxq, desc, socket, inactive, conf, mp);
ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
if (ret)
rte_free(rxq);
else {
@ -3215,7 +3059,6 @@ mlx4_rx_queue_release(void *dpdk_rxq)
return;
priv = rxq->priv;
priv_lock(priv);
assert(rxq != &priv->rxq_parent);
for (i = 0; (i != priv->rxqs_n); ++i)
if ((*priv->rxqs)[i] == rxq) {
DEBUG("%p: removing RX queue %p from list",
@ -3440,8 +3283,6 @@ mlx4_dev_close(struct rte_eth_dev *dev)
priv->txqs_n = 0;
priv->txqs = NULL;
}
if (priv->rss)
rxq_cleanup(&priv->rxq_parent);
if (priv->pd != NULL) {
assert(priv->ctx != NULL);
claim_zero(ibv_dealloc_pd(priv->pd));
@ -4756,7 +4597,6 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
if (!(conf.ports.enabled & (1 << i)))
continue;
exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS;
exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ;
DEBUG("using port %u", port);
@ -4814,30 +4654,6 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
err = ENODEV;
goto port_error;
}
if ((exp_device_attr.exp_device_cap_flags &
IBV_EXP_DEVICE_QPG) &&
(exp_device_attr.exp_device_cap_flags &
IBV_EXP_DEVICE_UD_RSS) &&
(exp_device_attr.comp_mask &
IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) &&
(exp_device_attr.max_rss_tbl_sz > 0)) {
priv->hw_qpg = 1;
priv->hw_rss = 1;
priv->max_rss_tbl_sz = exp_device_attr.max_rss_tbl_sz;
} else {
priv->hw_qpg = 0;
priv->hw_rss = 0;
priv->max_rss_tbl_sz = 0;
}
priv->hw_tss = !!(exp_device_attr.exp_device_cap_flags &
IBV_EXP_DEVICE_UD_TSS);
DEBUG("device flags: %s%s%s",
(priv->hw_qpg ? "IBV_DEVICE_QPG " : ""),
(priv->hw_tss ? "IBV_DEVICE_TSS " : ""),
(priv->hw_rss ? "IBV_DEVICE_RSS " : ""));
if (priv->hw_rss)
DEBUG("maximum RSS indirection table size: %u",
exp_device_attr.max_rss_tbl_sz);
priv->hw_csum =
((exp_device_attr.exp_device_cap_flags &

View File

@ -248,19 +248,13 @@ struct priv {
uint16_t mtu; /* Configured MTU. */
uint8_t port; /* Physical port number. */
unsigned int started:1; /* Device started, flows enabled. */
unsigned int hw_qpg:1; /* QP groups are supported. */
unsigned int hw_tss:1; /* TSS is supported. */
unsigned int hw_rss:1; /* RSS is supported. */
unsigned int hw_csum:1; /* Checksum offload is supported. */
unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int rss:1; /* RSS is enabled. */
unsigned int vf:1; /* This is a VF device. */
unsigned int pending_alarm:1; /* An alarm is pending. */
unsigned int isolated:1; /* Toggle isolated mode. */
unsigned int inl_recv_size; /* Inline recv size */
unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */
/* RX/TX queues. */
struct rxq rxq_parent; /* Parent queue when RSS is enabled. */
unsigned int rxqs_n; /* RX queues array size. */
unsigned int txqs_n; /* TX queues array size. */
struct rxq *(*rxqs)[]; /* RX queues. */