net/mlx5: support multi-packet send

This feature enables the TX burst function to emit up to 5 packets using
only two work queue entries (WQEs) on devices that support it. This saves
PCI bandwidth and improves performance.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Olga Shern <olgas@mellanox.com>
Author:    Nélio Laranjeiro, 2016-06-24 15:17:57 +02:00
Committer: Bruce Richardson
Parent:    2a66cf3789
Commit:    230189d9ff
6 changed files with 446 additions and 4 deletions

File: doc/guides/nics/mlx5.rst

@@ -171,6 +171,16 @@ Run-time configuration
This option should be used in combination with ``txq_inline`` above.
- ``txq_mpw_en`` parameter [int]
A nonzero value enables multi-packet send. This feature allows the TX
burst function to pack up to five packets in two descriptors in order to
save PCI bandwidth and improve performance at the cost of a slightly
higher CPU usage.
It is currently only supported on the ConnectX-4 Lx family of adapters.
Enabled by default.
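For example (core mask and PCI address below are illustrative, not taken
from this commit), the parameter can be passed as a device argument to
disable the feature at start-up:

   testpmd -c 0xff -n 4 -w 0000:05:00.0,txq_mpw_en=0 -- -i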
Prerequisites
-------------

File: drivers/net/mlx5/mlx5.c

@@ -81,6 +81,9 @@
*/
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"
/**
* Retrieve integer value from environment variable.
*
@@ -282,6 +285,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
priv->txq_inline = tmp;
} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
priv->txqs_inline = tmp;
} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
priv->mps = !!tmp;
} else {
WARN("%s: unknown parameter", key);
return -EINVAL;
@@ -307,6 +312,7 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
MLX5_RXQ_CQE_COMP_EN,
MLX5_TXQ_INLINE,
MLX5_TXQS_MIN_INLINE,
MLX5_TXQ_MPW_EN,
NULL,
};
struct rte_kvargs *kvlist;
@@ -503,6 +509,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
priv->mps = mps; /* Enable MPW by default if supported. */
priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
@@ -551,7 +558,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
priv->mps = mps;
if (priv->mps && !mps) {
ERROR("multi-packet send not supported on this device"
" (" MLX5_TXQ_MPW_EN ")");
err = ENOTSUP;
goto port_error;
}
/* Allocate and register default RSS hash keys. */
priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
sizeof((*priv->rss_conf)[0]), 0);
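For readers following the devargs plumbing, here is a minimal standalone
model of the txq_mpw_en handling (simplified names, no DPDK dependencies,
not the driver code itself): the value is parsed as an integer and
collapsed to a boolean with !!, and since mlx5_args() runs after the
hardware-probed default is stored in priv->mps, a user-supplied value
overrides it.

        #include <errno.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        struct priv_model {
                int mps; /* stand-in for priv->mps */
        };

        static int
        args_check(const char *key, const char *val, void *opaque)
        {
                struct priv_model *priv = opaque;
                unsigned long tmp;

                errno = 0;
                tmp = strtoul(val, NULL, 0);
                if (errno)
                        return -errno;
                if (strcmp("txq_mpw_en", key) == 0) {
                        priv->mps = !!tmp; /* any nonzero value enables MPW */
                        return 0;
                }
                return -EINVAL;
        }

        int main(void)
        {
                struct priv_model priv = { .mps = 1 }; /* HW-probed default */

                args_check("txq_mpw_en", "0", &priv);
                printf("mps=%d\n", priv.mps); /* 0: the user override wins */
                return 0;
        }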

File: drivers/net/mlx5/mlx5_ethdev.c

@@ -584,7 +584,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
DEV_RX_OFFLOAD_UDP_CKSUM |
DEV_RX_OFFLOAD_TCP_CKSUM) :
0);
info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
if (!priv->mps)
info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
if (priv->hw_csum)
info->tx_offload_capa |=
(DEV_TX_OFFLOAD_IPV4_CKSUM |
@@ -1318,7 +1319,17 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
/* Display warning for unsupported configurations. */
if (priv->sriov && priv->mps)
WARN("multi-packet send WQE cannot be used on a SR-IOV setup");
/* Select appropriate TX function. */
if ((priv->sriov == 0) && priv->mps && priv->txq_inline) {
priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
DEBUG("selected MPW inline TX function");
} else if ((priv->sriov == 0) && priv->mps) {
priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
DEBUG("selected MPW TX function");
} else if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
DEBUG("selected inline TX function (%u >= %u queues)",
priv->txqs_n, priv->txqs_inline);
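Taken together, the chain above gives MPW inline the highest priority,
then plain MPW, then the pre-existing inline and default paths, with any
MPW variant ruled out under SR-IOV. Restated as a standalone helper
(illustrative sketch, not driver code):

        #include <stdio.h>

        enum tx_fn { TX_DEFAULT, TX_INLINE, TX_MPW, TX_MPW_INLINE };

        static enum tx_fn
        select_tx_fn(int sriov, int mps, unsigned int txq_inline,
                     unsigned int txqs_n, unsigned int txqs_inline)
        {
                if (!sriov && mps && txq_inline)
                        return TX_MPW_INLINE; /* mlx5_tx_burst_mpw_inline */
                if (!sriov && mps)
                        return TX_MPW;        /* mlx5_tx_burst_mpw */
                if (txq_inline && txqs_n >= txqs_inline)
                        return TX_INLINE;     /* mlx5_tx_burst_inline */
                return TX_DEFAULT;            /* mlx5_tx_burst */
        }

        int main(void)
        {
                /* SR-IOV forces the non-MPW paths even when mps is set. */
                printf("%d\n", select_tx_fn(1, 1, 0, 4, 8)); /* 0: default */
                printf("%d\n", select_tx_fn(0, 1, 0, 4, 8)); /* 2: MPW */
                return 0;
        }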

File: drivers/net/mlx5/mlx5_rxtx.c

@@ -801,6 +801,413 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
return i;
}
/**
* Open an MPW session.
*
* @param txq
* Pointer to TX queue structure.
* @param mpw
* Pointer to MPW session structure.
* @param length
* Packet length.
*/
static inline void
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
(volatile struct mlx5_wqe_data_seg (*)[])
(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
mpw->state = MLX5_MPW_STATE_OPENED;
mpw->pkts_n = 0;
mpw->len = length;
mpw->total_len = 0;
mpw->wqe = &(*txq->wqes)[idx];
mpw->wqe->mpw.eseg.mss = htons(length);
mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
mpw->wqe->mpw.eseg.rsvd0 = 0;
mpw->wqe->mpw.eseg.rsvd1 = 0;
mpw->wqe->mpw.eseg.rsvd2 = 0;
mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
(txq->wqe_ci << 8) |
MLX5_OPCODE_LSO_MPW);
mpw->wqe->mpw.ctrl.data[2] = 0;
mpw->wqe->mpw.ctrl.data[3] = 0;
mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
mpw->data.dseg[2] = &(*dseg)[0];
mpw->data.dseg[3] = &(*dseg)[1];
mpw->data.dseg[4] = &(*dseg)[2];
}
/**
* Close an MPW session.
*
* @param txq
* Pointer to TX queue structure.
* @param mpw
* Pointer to MPW session structure.
*/
static inline void
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
unsigned int num = mpw->pkts_n;
/*
* Store size in multiple of 16 bytes. Control and Ethernet segments
* count as 2.
*/
mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
mpw->state = MLX5_MPW_STATE_CLOSED;
if (num < 3)
++txq->wqe_ci;
else
txq->wqe_ci += 2;
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
}
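The size accounting is the subtle part here: the DS field written into
ctrl.data[1] counts 16-byte units, the control and Ethernet segments
together contribute two of them, and wqe_ci advances by one 64-byte WQE
for sessions of up to two packets and by two WQEs otherwise. A standalone
sketch of that arithmetic (segment sizes restated for illustration only):

        #include <assert.h>

        #define WQE_SIZE 64 /* illustrative: one work queue entry */
        #define SEG_SIZE 16 /* illustrative: granularity of the DS count */

        int main(void)
        {
                /* Layout implied by mlx5_mpw_new():
                 *   WQE[ci]  : ctrl seg | eth seg | dseg[0] | dseg[1]
                 *   WQE[ci+1]: dseg[2]  | dseg[3] | dseg[4] | (unused)
                 */
                unsigned int num;

                for (num = 1; num <= 5; ++num) {
                        unsigned int ds = 2 + num; /* "ctrl + eth count as 2" */
                        unsigned int bytes = ds * SEG_SIZE;
                        /* wqe_ci += 1 below 3 packets, += 2 otherwise. */
                        unsigned int wqes = (num < 3) ? 1 : 2;

                        assert(bytes <= wqes * WQE_SIZE);
                }
                return 0;
        }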
/**
* DPDK callback for TX with MPW support.
*
* @param dpdk_txq
* Generic pointer to TX queue structure.
* @param[in] pkts
* Packets to transmit.
* @param pkts_n
* Number of packets in array.
*
* @return
* Number of packets successfully transmitted (<= pkts_n).
*/
uint16_t
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
.state = MLX5_MPW_STATE_CLOSED,
};
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
assert(max >= 1);
assert(max <= elts_n);
/* Always leave one free entry in the ring. */
--max;
if (max == 0)
return 0;
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
struct rte_mbuf *buf = pkts[i];
volatile struct mlx5_wqe_data_seg *dseg;
unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Start new session if packet differs. */
if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
((mpw.len != length) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
mlx5_mpw_close(txq, &mpw);
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
}
dseg = mpw.data.dseg[mpw.pkts_n];
*dseg = (struct mlx5_wqe_data_seg){
.byte_count = htonl(length),
.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
.addr = htonll(addr),
};
++mpw.pkts_n;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
}
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
comp = txq->elts_comp + i;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;
/* Request completion on last WQE. */
wqe->mpw.ctrl.data[2] = htonl(8);
/* Save elts_head in unused "immediate" field of WQE. */
wqe->mpw.ctrl.data[3] = elts_head;
txq->elts_comp = 0;
} else {
txq->elts_comp = comp;
}
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent packets counter. */
txq->stats.opackets += i;
#endif
/* Ring QP doorbell. */
if (mpw.state == MLX5_MPW_STATE_OPENED)
mlx5_mpw_close(txq, &mpw);
mlx5_tx_dbrec(txq);
txq->elts_head = elts_head;
return i;
}
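One detail worth calling out: completions are not requested per packet.
The burst accumulates sent packets in elts_comp and only sets the
CQ-update bit (the htonl(8) above) once MLX5_TX_COMP_THRESH is reached,
stashing elts_head in the WQE's otherwise unused immediate field so the
completion handler knows how far mbufs may be freed. A minimal standalone
model of that batching (the threshold value here is illustrative):

        #include <stdio.h>

        #define TX_COMP_THRESH 32 /* stand-in for MLX5_TX_COMP_THRESH */

        struct txq_model {
                unsigned int elts_comp; /* packets since last request */
        };

        /* Returns nonzero when the burst should request a completion. */
        static int
        maybe_request_completion(struct txq_model *txq, unsigned int sent)
        {
                unsigned int comp = txq->elts_comp + sent;

                if (comp >= TX_COMP_THRESH) {
                        txq->elts_comp = 0;
                        return 1;
                }
                txq->elts_comp = comp;
                return 0;
        }

        int main(void)
        {
                struct txq_model txq = { 0 };
                unsigned int burst;

                for (burst = 0; burst < 4; ++burst)
                        printf("burst of 12 -> completion=%d\n",
                               maybe_request_completion(&txq, 12));
                return 0; /* prints 0, 0, 1, 0 */
        }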
/**
* Open an MPW inline session.
*
* @param txq
* Pointer to TX queue structure.
* @param mpw
* Pointer to MPW session structure.
* @param length
* Packet length.
*/
static inline void
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
mpw->state = MLX5_MPW_INL_STATE_OPENED;
mpw->pkts_n = 0;
mpw->len = length;
mpw->total_len = 0;
mpw->wqe = &(*txq->wqes)[idx];
mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
(txq->wqe_ci << 8) |
MLX5_OPCODE_LSO_MPW);
mpw->wqe->mpw_inl.ctrl.data[2] = 0;
mpw->wqe->mpw_inl.ctrl.data[3] = 0;
mpw->wqe->mpw_inl.eseg.mss = htons(length);
mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
mpw->wqe->mpw_inl.eseg.cs_flags = 0;
mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
}
/**
* Close an MPW inline session.
*
* @param txq
* Pointer to TX queue structure.
* @param mpw
* Pointer to MPW session structure.
*/
static inline void
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
{
unsigned int size;
size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
/*
* Store size in multiple of 16 bytes. Control and Ethernet segments
* count as 2.
*/
mpw->wqe->mpw_inl.ctrl.data[1] =
htonl(txq->qp_num_8s | ((size + 15) / 16));
mpw->state = MLX5_MPW_STATE_CLOSED;
mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
}
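The closing arithmetic generalizes the non-inline case: the session
occupies a fixed header (sizeof(*wqe) - MLX5_MWQE64_INL_DATA) plus the
inlined bytes, the DS field rounds that up to 16-byte units, and wqe_ci
advances by whole 64-byte WQEs. A sketch with illustrative stand-ins for
those constants (not values asserted by this patch):

        #include <stdio.h>

        #define WQE_SIZE 64 /* illustrative sizeof(*mpw->wqe) */
        #define INL_DATA 28 /* illustrative MLX5_MWQE64_INL_DATA */

        static void
        inline_close_math(unsigned int total_len)
        {
                unsigned int size = WQE_SIZE - INL_DATA + total_len;
                unsigned int ds = (size + 15) / 16;       /* DS field */
                unsigned int wqes =
                        (size + WQE_SIZE - 1) / WQE_SIZE; /* wqe_ci += */

                printf("total_len=%3u -> size=%3u, ds=%u, wqes=%u\n",
                       total_len, size, ds, wqes);
        }

        int main(void)
        {
                inline_close_math(28); /* fits exactly in one WQE */
                inline_close_math(64); /* spills into a second WQE */
                return 0;
        }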
/**
* DPDK callback for TX with MPW inline support.
*
* @param dpdk_txq
* Generic pointer to TX queue structure.
* @param[in] pkts
* Packets to transmit.
* @param pkts_n
* Number of packets in array.
*
* @return
* Number of packets successfully transmitted (<= pkts_n).
*/
uint16_t
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
struct mlx5_mpw mpw = {
.state = MLX5_MPW_STATE_CLOSED,
};
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
assert(max >= 1);
assert(max <= elts_n);
/* Always leave one free entry in the ring. */
--max;
if (max == 0)
return 0;
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
struct rte_mbuf *buf = pkts[i];
unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Start new session if packet differs. */
if (mpw.state == MLX5_MPW_STATE_OPENED) {
if ((mpw.len != length) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags))
mlx5_mpw_close(txq, &mpw);
} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
if ((mpw.len != length) ||
(length > inline_room) ||
(mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
mlx5_mpw_inline_close(txq, &mpw);
inline_room = txq->max_inline;
}
}
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
if (length > inline_room) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
} else {
mlx5_mpw_inline_new(txq, &mpw, length);
mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
}
}
if (mpw.state == MLX5_MPW_STATE_OPENED) {
volatile struct mlx5_wqe_data_seg *dseg;
assert(inline_room == txq->max_inline);
dseg = mpw.data.dseg[mpw.pkts_n];
*dseg = (struct mlx5_wqe_data_seg){
.byte_count = htonl(length),
.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
.addr = htonll(addr),
};
++mpw.pkts_n;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
} else {
unsigned int max;
assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
assert(length <= inline_room);
/* Maximum number of bytes before wrapping. */
max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
(uintptr_t)mpw.data.raw);
if (length > max) {
rte_memcpy((void *)(uintptr_t)mpw.data.raw,
(void *)addr,
max);
mpw.data.raw =
(volatile void *)&(*txq->wqes)[0];
rte_memcpy((void *)(uintptr_t)mpw.data.raw,
(void *)(addr + max),
length - max);
mpw.data.raw += length - max;
} else {
rte_memcpy((void *)(uintptr_t)mpw.data.raw,
(void *)addr,
length);
mpw.data.raw += length;
}
if ((uintptr_t)mpw.data.raw ==
(uintptr_t)&(*txq->wqes)[txq->wqe_n])
mpw.data.raw =
(volatile void *)&(*txq->wqes)[0];
++mpw.pkts_n;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
mlx5_mpw_inline_close(txq, &mpw);
inline_room = txq->max_inline;
} else {
inline_room -= length;
}
}
mpw.total_len += length;
elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
}
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
comp = txq->elts_comp + i;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;
/* Request completion on last WQE. */
wqe->mpw_inl.ctrl.data[2] = htonl(8);
/* Save elts_head in unused "immediate" field of WQE. */
wqe->mpw_inl.ctrl.data[3] = elts_head;
txq->elts_comp = 0;
} else {
txq->elts_comp = comp;
}
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent packets counter. */
txq->stats.opackets += i;
#endif
/* Ring QP doorbell. */
if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
mlx5_mpw_inline_close(txq, &mpw);
else if (mpw.state == MLX5_MPW_STATE_OPENED)
mlx5_mpw_close(txq, &mpw);
mlx5_tx_dbrec(txq);
txq->elts_head = elts_head;
return i;
}
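The inline path treats the WQE array as a circular byte buffer, so a
packet that would run past the end is copied in two pieces and the write
pointer wraps back to the start. The same wrap-around copy, extracted as
a standalone sketch (rte_memcpy replaced by plain memcpy):

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Copy len bytes from src into a circular buffer of ring_size
         * bytes starting at dst; returns the new write position, wrapped
         * the same way as mpw.data.raw above. */
        static uint8_t *
        ring_copy(uint8_t *ring, size_t ring_size, uint8_t *dst,
                  const uint8_t *src, size_t len)
        {
                size_t tail = (size_t)(ring + ring_size - dst);

                if (len > tail) {
                        memcpy(dst, src, tail);
                        memcpy(ring, src + tail, len - tail);
                        dst = ring + (len - tail);
                } else {
                        memcpy(dst, src, len);
                        dst += len;
                }
                if (dst == ring + ring_size) /* landed exactly on the end */
                        dst = ring;
                return dst;
        }

        int main(void)
        {
                uint8_t ring[8] = { 0 };
                const uint8_t src[6] = { 1, 2, 3, 4, 5, 6 };
                uint8_t *pos =
                        ring_copy(ring, sizeof(ring), ring + 5, src, 6);

                /* ring is now 4 5 6 0 0 1 2 3; position wrapped to 3. */
                printf("%td\n", pos - ring);
                return 0;
        }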
/**
* Translate RX completion flags to packet type.
*

File: drivers/net/mlx5/mlx5_rxtx.h

@@ -312,6 +312,8 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
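Nothing changes for applications: whichever burst function is selected is
installed behind dev->tx_pkt_burst, so the usual rte_eth_tx_burst() call
picks up MPW transparently. A minimal caller sketch (send_all() is a
hypothetical helper; port_id is uint8_t in this era of the API):

        #include <rte_ethdev.h>
        #include <rte_mbuf.h>

        /* Keep calling rte_eth_tx_burst() until the whole burst has been
         * accepted by the PMD-selected TX function. */
        static void
        send_all(uint8_t port, uint16_t queue,
                 struct rte_mbuf **pkts, uint16_t n)
        {
                uint16_t sent = 0;

                while (sent < n)
                        sent += rte_eth_tx_burst(port, queue,
                                                 pkts + sent, n - sent);
        }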

File: drivers/net/mlx5/mlx5_txq.c

@@ -398,7 +398,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
((priv->mps && !priv->sriov) ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
};