net/mlx5: implement vectorized MPRQ burst

The existing MPRQ (Multi-Packet Rx Queue) Rx burst routine processes one
packet at a time using simple scalar instructions. MPRQ works by posting a
single large buffer (consisting of multiple fixed-size strides) in order to
receive multiple packets at once into this buffer. An Rx packet is then
either copied into a user-provided mbuf, or the PMD attaches the Rx packet
to the mbuf via a pointer to an external buffer.
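
As a reading aid, here is a minimal, hypothetical sketch of that
copy-or-attach decision. The helper name, parameters and memcpy_threshold
are placeholders; the driver's actual logic is mprq_buf_to_pkt(),
introduced later in this commit, which also handles scattering and
headroom overlap.

#include <stdint.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>

/*
 * Hypothetical helper (names and the threshold parameter are placeholders)
 * showing the copy-vs-attach decision. Assumes the mbuf has at least 'len'
 * bytes of room in the copy case; the real driver also handles scattering
 * and headroom overlap.
 */
static inline void
deliver_stride(struct rte_mbuf *pkt, void *stride_addr, rte_iova_t stride_iova,
	       uint16_t stride_len, uint32_t len, uint32_t memcpy_threshold,
	       struct rte_mbuf_ext_shared_info *shinfo)
{
	if (len <= memcpy_threshold) {
		/* Small packet: copy the payload into the mbuf data room. */
		rte_memcpy(rte_pktmbuf_mtod(pkt, void *), stride_addr, len);
	} else {
		/* Large packet: attach the stride as an external buffer and
		 * skip the copy; the stride is released with the mbuf. */
		rte_mbuf_ext_refcnt_set(shinfo, 1);
		rte_pktmbuf_attach_extbuf(pkt, stride_addr, stride_iova,
					  stride_len, shinfo);
	}
	pkt->data_len = (uint16_t)len;
	pkt->pkt_len = len;
}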

There is an opportunity to speed up packet reception by processing
4 packets simultaneously using SIMD (single instruction, multiple data)
extensions. Allocate mbufs in batches for every MPRQ buffer and process
the packets in groups of 4 until all the strides are exhausted. Then
switch to another MPRQ buffer and repeat the process.
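
A rough outline of this flow, for illustration only: the function name is
hypothetical, BURST_STEP stands in for MLX5_VPMD_DESCS_PER_LOOP, and the
SIMD CQE parsing itself is elided.

#include <stdint.h>

#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

#define BURST_STEP 4 /* stands in for MLX5_VPMD_DESCS_PER_LOOP */

/*
 * Illustrative outline only: bulk-allocate mbufs for the SW ring, then walk
 * them in groups of BURST_STEP, replacing the MPRQ buffer once its strides
 * are used up.
 */
static uint16_t
mprq_vec_outline(struct rte_mempool *mp, struct rte_mbuf **elts,
		 uint16_t n_elts, uint16_t strd_n)
{
	uint16_t done = 0;

	/* One bulk allocation covers a whole MPRQ buffer worth of strides. */
	if (rte_mempool_get_bulk(mp, (void **)elts, n_elts) < 0)
		return 0; /* accounted as rx_nombuf by the PMD */
	while (done < n_elts) {
		uint16_t batch = RTE_MIN((uint16_t)BURST_STEP,
					 (uint16_t)(n_elts - done));

		/* ... parse up to 4 CQEs with SIMD and fill 4 mbufs here,
		 * copying or attaching strides as in the sketch above ... */
		done += batch;
		if (done % strd_n == 0) {
			/* All strides of this MPRQ buffer are consumed:
			 * replace the WQE and move to the next buffer. */
		}
	}
	return done;
}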

The vectorized MPRQ burst routine is engaged automatically when the
mprq_en=1 devarg is specified and vectorization is not disabled
explicitly with the rx_vec_en=0 devarg. One limitation applies: LRO is
not supported, and the scalar MPRQ burst is selected if LRO is enabled.
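
For reference, a minimal EAL bootstrap showing how these devargs could be
passed. The PCI address is a placeholder, error handling is omitted, and
-a is the EAL allow-list option (-w on older DPDK releases).

#include <rte_eal.h>

int
main(void)
{
	char *argv[] = {
		"app",
		/* mprq_en=1 enables MPRQ; rx_vec_en defaults to 1 and is shown
		 * explicitly here - setting it to 0 falls back to scalar MPRQ. */
		"-a", "0000:03:00.0,mprq_en=1,rx_vec_en=1",
	};
	int argc = sizeof(argv) / sizeof(argv[0]);

	if (rte_eal_init(argc, argv) < 0)
		return -1;
	/* ... set up Rx queues; mlx5_select_rx_function() then picks
	 * mlx5_rx_burst_mprq_vec as long as LRO is off ... */
	return 0;
}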

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
Alexander Kozyrev 2020-10-21 20:30:30 +00:00 committed by Ferruh Yigit
parent 1ded26239a
commit 0f20acbf5e
7 changed files with 644 additions and 292 deletions


@ -437,10 +437,17 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)
if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
!rxq_data->lro) {
cq_attr.cqe_comp_en = 1u;
cq_attr.mini_cqe_res_format =
mlx5_rxq_mprq_enabled(rxq_data) ?
MLX5_CQE_RESP_FORMAT_CSUM_STRIDX :
MLX5_CQE_RESP_FORMAT_HASH;
/*
* Select CSUM miniCQE format only for non-vectorized MPRQ
* Rx burst, use HASH miniCQE format for everything else.
*/
if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
mlx5_rxq_mprq_enabled(rxq_data))
cq_attr.mini_cqe_res_format =
MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
else
cq_attr.mini_cqe_res_format =
MLX5_CQE_RESP_FORMAT_HASH;
/*
* For vectorized Rx, it must not be doubled in order to
* make cq_ci and rq_ci aligned.


@ -427,7 +427,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
if (dev->rx_pkt_burst == mlx5_rx_burst ||
dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
dev->rx_pkt_burst == mlx5_rx_burst_vec)
dev->rx_pkt_burst == mlx5_rx_burst_vec ||
dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)
return ptypes;
return NULL;
}
@ -486,11 +487,22 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
MLX5_ASSERT(dev != NULL);
if (mlx5_check_vec_rx_support(dev) > 0) {
rx_pkt_burst = mlx5_rx_burst_vec;
DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
dev->data->port_id);
if (mlx5_mprq_enabled(dev)) {
rx_pkt_burst = mlx5_rx_burst_mprq_vec;
DRV_LOG(DEBUG, "port %u selected vectorized"
" MPRQ Rx function", dev->data->port_id);
} else {
rx_pkt_burst = mlx5_rx_burst_vec;
DRV_LOG(DEBUG, "port %u selected vectorized"
" SPRQ Rx function", dev->data->port_id);
}
} else if (mlx5_mprq_enabled(dev)) {
rx_pkt_burst = mlx5_rx_burst_mprq;
DRV_LOG(DEBUG, "port %u selected MPRQ Rx function",
dev->data->port_id);
} else {
DRV_LOG(DEBUG, "port %u selected SPRQ Rx function",
dev->data->port_id);
}
return rx_pkt_burst;
}


@ -173,7 +173,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
rxq->mprq_repl = buf;
}
DRV_LOG(DEBUG,
"port %u Rx queue %u allocated and configured %u segments",
"port %u MPRQ queue %u allocated and configured %u segments",
rxq->port_id, rxq->idx, wqe_n);
return 0;
error:
@ -185,7 +185,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
(*rxq->mprq_bufs)[i]);
(*rxq->mprq_bufs)[i] = NULL;
}
DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
DRV_LOG(DEBUG, "port %u MPRQ queue %u failed, freed everything",
rxq->port_id, rxq->idx);
rte_errno = err; /* Restore rte_errno. */
return -rte_errno;
@ -204,7 +204,9 @@ static int
rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
{
const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
unsigned int elts_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
(1 << rxq_ctrl->rxq.elts_n) * (1 << rxq_ctrl->rxq.strd_num_n) :
(1 << rxq_ctrl->rxq.elts_n);
unsigned int i;
int err;
@ -262,7 +264,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
}
DRV_LOG(DEBUG,
"port %u Rx queue %u allocated and configured %u segments"
"port %u SPRQ queue %u allocated and configured %u segments"
" (max %u packets)",
PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n,
elts_n / (1 << rxq_ctrl->rxq.sges_n));
@ -275,7 +277,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
(*rxq_ctrl->rxq.elts)[i] = NULL;
}
DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
DRV_LOG(DEBUG, "port %u SPRQ queue %u failed, freed everything",
PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx);
rte_errno = err; /* Restore rte_errno. */
return -rte_errno;
@ -293,8 +295,15 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
int
rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
{
return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
int ret = 0;
/**
* For MPRQ we need to allocate both MPRQ buffers
* for WQEs and simple mbufs for vector processing.
*/
if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
ret = rxq_alloc_elts_mprq(rxq_ctrl);
return (ret || rxq_alloc_elts_sprq(rxq_ctrl));
}
/**
@ -309,11 +318,10 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
uint16_t i;
DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
rxq->port_id, rxq->idx);
DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing %d WRs",
rxq->port_id, rxq->idx, (1u << rxq->elts_n));
if (rxq->mprq_bufs == NULL)
return;
MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);
for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
if ((*rxq->mprq_bufs)[i] != NULL)
mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
@ -335,25 +343,27 @@ static void
rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
{
struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
const uint16_t q_n = (1 << rxq->elts_n);
const uint16_t q_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
(1 << rxq->elts_n) * (1 << rxq->strd_num_n) :
(1 << rxq->elts_n);
const uint16_t q_mask = q_n - 1;
uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
uint16_t i;
DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
PORT_ID(rxq_ctrl->priv), rxq->idx);
DRV_LOG(DEBUG, "port %u Rx queue %u freeing %d WRs",
PORT_ID(rxq_ctrl->priv), rxq->idx, q_n);
if (rxq->elts == NULL)
return;
/**
* Some mbuf in the Ring belongs to the application. They cannot be
* freed.
* Some mbuf in the Ring belongs to the application.
* They cannot be freed.
*/
if (mlx5_rxq_check_vec_support(rxq) > 0) {
for (i = 0; i < used; ++i)
(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
rxq->rq_pi = rxq->rq_ci;
}
for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
for (i = 0; i != q_n; ++i) {
if ((*rxq->elts)[i] != NULL)
rte_pktmbuf_free_seg((*rxq->elts)[i]);
(*rxq->elts)[i] = NULL;
@ -369,10 +379,13 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
static void
rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
{
/*
* For MPRQ we need to allocate both MPRQ buffers
* for WQEs and simple mbufs for vector processing.
*/
if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
rxq_free_elts_mprq(rxq_ctrl);
else
rxq_free_elts_sprq(rxq_ctrl);
rxq_free_elts_sprq(rxq_ctrl);
}
/**
@ -1334,20 +1347,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_rxq_ctrl *tmpl;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
unsigned int mprq_stride_nums;
unsigned int mprq_stride_size;
unsigned int mprq_stride_cap;
struct mlx5_dev_config *config = &priv->config;
/*
* Always allocate extra slots, even if eventually
* the vector Rx will not be used.
*/
uint16_t desc_n =
desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
uint64_t offloads = conf->offloads |
dev->data->dev_conf.rxmode.offloads;
unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
const int mprq_en = mlx5_check_mprq_support(dev) > 0;
unsigned int max_rx_pkt_len = lro_on_queue ?
dev->data->dev_conf.rxmode.max_lro_pkt_size :
dev->data->dev_conf.rxmode.max_rx_pkt_len;
@ -1355,6 +1358,21 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
RTE_PKTMBUF_HEADROOM;
unsigned int max_lro_size = 0;
unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
const int mprq_en = mlx5_check_mprq_support(dev) > 0;
unsigned int mprq_stride_nums = config->mprq.stride_num_n ?
config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
unsigned int mprq_stride_size = non_scatter_min_mbuf_size <=
(1U << config->mprq.max_stride_size_n) ?
log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
unsigned int mprq_stride_cap = (config->mprq.stride_num_n ?
(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
(config->mprq.stride_size_n ?
(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
/*
* Always allocate extra slots, even if eventually
* the vector Rx will not be used.
*/
uint16_t desc_n = desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
if (non_scatter_min_mbuf_size > mb_len && !(offloads &
DEV_RX_OFFLOAD_SCATTER)) {
@ -1366,8 +1384,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
rte_errno = ENOSPC;
return NULL;
}
tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
desc_n * sizeof(struct rte_mbuf *), 0, socket);
tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
sizeof(*tmpl) + desc_n * sizeof(struct rte_mbuf *) +
(desc >> mprq_stride_nums) * sizeof(struct mlx5_mprq_buf *),
0, socket);
if (!tmpl) {
rte_errno = ENOMEM;
return NULL;
@ -1381,15 +1402,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->socket = socket;
if (dev->data->dev_conf.intr_conf.rxq)
tmpl->irq = 1;
mprq_stride_nums = config->mprq.stride_num_n ?
config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
mprq_stride_size = non_scatter_min_mbuf_size <=
(1U << config->mprq.max_stride_size_n) ?
log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
mprq_stride_cap = (config->mprq.stride_num_n ?
(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
(config->mprq.stride_size_n ?
(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
/*
* This Rx queue can be configured as a Multi-Packet RQ if all of the
* following conditions are met:
@ -1543,9 +1555,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->rxq.mp = mp;
tmpl->rxq.elts_n = log2above(desc);
tmpl->rxq.rq_repl_thresh =
MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
MLX5_VPMD_RXQ_RPLNSH_THRESH(desc_n);
tmpl->rxq.elts =
(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
(struct rte_mbuf *(*)[desc_n])(tmpl + 1);
tmpl->rxq.mprq_bufs =
(struct mlx5_mprq_buf *(*)[desc])(*tmpl->rxq.elts + desc_n);
#ifndef RTE_ARCH_64
tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;
#endif


@ -19,12 +19,12 @@
#include <mlx5_prm.h>
#include <mlx5_common.h>
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_mr.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
/* TX burst subroutines return codes. */
enum mlx5_txcmp_code {
@ -93,10 +93,6 @@ static __rte_always_inline void
rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
static __rte_always_inline void
mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
const unsigned int strd_n);
static int
mlx5_queue_state_modify(struct rte_eth_dev *dev,
struct mlx5_mp_arg_queue_state_modify *sm);
@ -584,7 +580,14 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
struct rte_eth_burst_mode *mode)
{
eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_rxq_data *rxq;
rxq = (*priv->rxqs)[rx_queue_id];
if (!rxq) {
rte_errno = EINVAL;
return -rte_errno;
}
if (pkt_burst == mlx5_rx_burst) {
snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
} else if (pkt_burst == mlx5_rx_burst_mprq) {
@ -598,6 +601,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
#else
return -EINVAL;
#endif
} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
#if defined RTE_ARCH_X86_64
snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
#elif defined RTE_ARCH_ARM64
snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
#elif defined RTE_ARCH_PPC_64
snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
#else
return -EINVAL;
#endif
} else {
return -EINVAL;
@ -866,6 +879,8 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
rxq->zip = (struct rxq_zip){
.ai = 0,
};
rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
(wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0;
/* Update doorbell counter. */
rxq->rq_ci = wqe_n >> rxq->sges_n;
rte_io_wmb();
@ -969,7 +984,8 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
{
const uint16_t cqe_n = 1 << rxq->cqe_n;
const uint16_t cqe_mask = cqe_n - 1;
const unsigned int wqe_n = 1 << rxq->elts_n;
const uint16_t wqe_n = 1 << rxq->elts_n;
const uint16_t strd_n = 1 << rxq->strd_num_n;
struct mlx5_rxq_ctrl *rxq_ctrl =
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
union {
@ -1033,21 +1049,27 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
&sm))
return -1;
if (vec) {
const uint16_t q_mask = wqe_n - 1;
uint16_t elt_idx;
const uint32_t elts_n =
mlx5_rxq_mprq_enabled(rxq) ?
wqe_n * strd_n : wqe_n;
const uint32_t e_mask = elts_n - 1;
uint32_t elts_ci =
mlx5_rxq_mprq_enabled(rxq) ?
rxq->elts_ci : rxq->rq_ci;
uint32_t elt_idx;
struct rte_mbuf **elt;
int i;
unsigned int n = wqe_n - (rxq->rq_ci -
unsigned int n = elts_n - (elts_ci -
rxq->rq_pi);
for (i = 0; i < (int)n; ++i) {
elt_idx = (rxq->rq_ci + i) & q_mask;
elt_idx = (elts_ci + i) & e_mask;
elt = &(*rxq->elts)[elt_idx];
*elt = rte_mbuf_raw_alloc(rxq->mp);
if (!*elt) {
for (i--; i >= 0; --i) {
elt_idx = (rxq->rq_ci +
i) & q_mask;
elt_idx = (elts_ci +
i) & elts_n;
elt = &(*rxq->elts)
[elt_idx];
rte_pktmbuf_free_seg
@ -1056,7 +1078,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
return -1;
}
}
for (i = 0; i < (int)wqe_n; ++i) {
for (i = 0; i < (int)elts_n; ++i) {
elt = &(*rxq->elts)[i];
DATA_LEN(*elt) =
(uint16_t)((*elt)->buf_len -
@ -1064,7 +1086,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
}
/* Padding with a fake mbuf for vec Rx. */
for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
(*rxq->elts)[wqe_n + i] =
(*rxq->elts)[elts_n + i] =
&rxq->fake_mbuf;
}
mlx5_rxq_initialize(rxq);
@ -1545,31 +1567,6 @@ mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
mlx5_mprq_buf_free_cb(NULL, buf);
}
static inline void
mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
const unsigned int strd_n)
{
struct mlx5_mprq_buf *rep = rxq->mprq_repl;
volatile struct mlx5_wqe_data_seg *wqe =
&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
void *addr;
MLX5_ASSERT(rep != NULL);
/* Replace MPRQ buf. */
(*rxq->mprq_bufs)[rq_idx] = rep;
/* Replace WQE. */
addr = mlx5_mprq_buf_addr(rep, strd_n);
wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
/* If there's only one MR, no need to replace LKey in WQE. */
if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
/* Stash a mbuf for next replacement. */
if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
rxq->mprq_repl = rep;
else
rxq->mprq_repl = NULL;
}
/**
* DPDK callback for RX with Multi-Packet RQ support.
*
@ -1587,12 +1584,9 @@ uint16_t
mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct mlx5_rxq_data *rxq = dpdk_rxq;
const unsigned int strd_n = 1 << rxq->strd_num_n;
const unsigned int strd_sz = 1 << rxq->strd_sz_n;
const unsigned int strd_shift =
MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
const uint32_t strd_n = 1 << rxq->strd_num_n;
const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
unsigned int i = 0;
uint32_t rq_ci = rxq->rq_ci;
@ -1601,37 +1595,18 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
while (i < pkts_n) {
struct rte_mbuf *pkt;
void *addr;
int ret;
uint32_t len;
uint16_t strd_cnt;
uint16_t strd_idx;
uint32_t offset;
uint32_t byte_cnt;
int32_t hdrm_overlap;
volatile struct mlx5_mini_cqe8 *mcqe = NULL;
uint32_t rss_hash_res = 0;
enum mlx5_rqx_code rxq_code;
if (consumed_strd == strd_n) {
/* Replace WQE only if the buffer is still in use. */
if (__atomic_load_n(&buf->refcnt,
__ATOMIC_RELAXED) > 1) {
mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
/* Release the old buffer. */
mlx5_mprq_buf_free(buf);
} else if (unlikely(rxq->mprq_repl == NULL)) {
struct mlx5_mprq_buf *rep;
/*
* Currently, the MPRQ mempool is out of buffer
* and doing memcpy regardless of the size of Rx
* packet. Retry allocation to get back to
* normal.
*/
if (!rte_mempool_get(rxq->mprq_mp,
(void **)&rep))
rxq->mprq_repl = rep;
}
/* Replace WQE if the buffer is still in use. */
mprq_buf_replace(rxq, rq_ci & wq_mask);
/* Advance to the next WQE. */
consumed_strd = 0;
++rq_ci;
@ -1667,122 +1642,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
if (rxq->crc_present)
len -= RTE_ETHER_CRC_LEN;
offset = strd_idx * strd_sz + strd_shift;
addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;
/*
* Memcpy packets to the target mbuf if:
* - The size of packet is smaller than mprq_max_memcpy_len.
* - Out of buffer in the Mempool for Multi-Packet RQ.
* - The packet's stride overlaps a headroom and scatter is off.
*/
if (len <= rxq->mprq_max_memcpy_len ||
rxq->mprq_repl == NULL ||
(hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
addr, len);
DATA_LEN(pkt) = len;
} else if (rxq->strd_scatter_en) {
struct rte_mbuf *prev = pkt;
uint32_t seg_len =
RTE_MIN(rte_pktmbuf_tailroom(pkt), len);
uint32_t rem_len = len - seg_len;
rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
addr, seg_len);
DATA_LEN(pkt) = seg_len;
while (rem_len) {
struct rte_mbuf *next =
rte_pktmbuf_alloc(rxq->mp);
if (unlikely(next == NULL)) {
rte_pktmbuf_free(pkt);
++rxq->stats.rx_nombuf;
goto out;
}
NEXT(prev) = next;
SET_DATA_OFF(next, 0);
addr = RTE_PTR_ADD(addr, seg_len);
seg_len = RTE_MIN
(rte_pktmbuf_tailroom(next),
rem_len);
rte_memcpy
(rte_pktmbuf_mtod(next, void *),
addr, seg_len);
DATA_LEN(next) = seg_len;
rem_len -= seg_len;
prev = next;
++NB_SEGS(pkt);
}
} else {
rte_pktmbuf_free_seg(pkt);
rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
strd_idx, strd_cnt);
if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
rte_pktmbuf_free_seg(pkt);
if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
++rxq->stats.idropped;
continue;
}
} else {
rte_iova_t buf_iova;
struct rte_mbuf_ext_shared_info *shinfo;
uint16_t buf_len = strd_cnt * strd_sz;
void *buf_addr;
/* Increment the refcnt of the whole chunk. */
__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
__ATOMIC_RELAXED) <= strd_n + 1);
buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
/*
* MLX5 device doesn't use iova but it is necessary in a
* case where the Rx packet is transmitted via a
* different PMD.
*/
buf_iova = rte_mempool_virt2iova(buf) +
RTE_PTR_DIFF(buf_addr, buf);
shinfo = &buf->shinfos[strd_idx];
rte_mbuf_ext_refcnt_set(shinfo, 1);
/*
* EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
* attaching the stride to mbuf and more offload flags
* will be added below by calling rxq_cq_to_mbuf().
* Other fields will be overwritten.
*/
rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
buf_len, shinfo);
/* Set mbuf head-room. */
SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);
MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);
MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=
len - (hdrm_overlap > 0 ? hdrm_overlap : 0));
DATA_LEN(pkt) = len;
/*
* Copy the last fragment of a packet (up to headroom
* size bytes) in case there is a stride overlap with
* a next packet's headroom. Allocate a separate mbuf
* to store this fragment and link it. Scatter is on.
*/
if (hdrm_overlap > 0) {
MLX5_ASSERT(rxq->strd_scatter_en);
struct rte_mbuf *seg =
rte_pktmbuf_alloc(rxq->mp);
if (unlikely(seg == NULL)) {
rte_pktmbuf_free_seg(pkt);
++rxq->stats.rx_nombuf;
break;
}
SET_DATA_OFF(seg, 0);
rte_memcpy(rte_pktmbuf_mtod(seg, void *),
RTE_PTR_ADD(addr, len - hdrm_overlap),
hdrm_overlap);
DATA_LEN(seg) = hdrm_overlap;
DATA_LEN(pkt) = len - hdrm_overlap;
NEXT(pkt) = seg;
NB_SEGS(pkt) = 2;
if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
++rxq->stats.rx_nombuf;
break;
}
}
rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
if (cqe->lro_num_seg > 1) {
mlx5_lro_update_hdr(addr, cqe, len);
mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
cqe, len);
pkt->ol_flags |= PKT_RX_LRO;
pkt->tso_segsz = len / cqe->lro_num_seg;
}
@ -1796,7 +1672,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
*(pkts++) = pkt;
++i;
}
out:
/* Update the consumer indexes. */
rxq->consumed_strd = consumed_strd;
rte_io_wmb();
@ -1878,6 +1753,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
return 0;
}
__rte_weak uint16_t
mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,
struct rte_mbuf **pkts __rte_unused,
uint16_t pkts_n __rte_unused)
{
return 0;
}
__rte_weak int
mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
{


@ -30,6 +30,7 @@
#include "mlx5_utils.h"
#include "mlx5.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
/* Support tunnel matching. */
#define MLX5_FLOW_TUNNEL 10
@ -94,6 +95,12 @@ enum mlx5_rxq_err_state {
MLX5_RXQ_ERR_STATE_NEED_READY,
};
enum mlx5_rqx_code {
MLX5_RXQ_CODE_EXIT = 0,
MLX5_RXQ_CODE_NOMBUF,
MLX5_RXQ_CODE_DROPPED,
};
/* RX queue descriptor. */
struct mlx5_rxq_data {
unsigned int csum:1; /* Enable checksum offloading. */
@ -116,6 +123,7 @@ struct mlx5_rxq_data {
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
uint16_t port_id;
uint32_t elts_ci;
uint32_t rq_ci;
uint16_t consumed_strd; /* Number of consumed strides in WQE. */
uint32_t rq_pi;
@ -130,11 +138,8 @@ struct mlx5_rxq_data {
uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
volatile void *wqes;
volatile struct mlx5_cqe(*cqes)[];
RTE_STD_C11
union {
struct rte_mbuf *(*elts)[];
struct mlx5_mprq_buf *(*mprq_bufs)[];
};
struct rte_mbuf *(*elts)[];
struct mlx5_mprq_buf *(*mprq_bufs)[];
struct rte_mempool *mp;
struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
@ -423,6 +428,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
/* mlx5_mr.c */
@ -700,4 +707,187 @@ mlx5_timestamp_set(struct rte_mbuf *mbuf, int offset,
*RTE_MBUF_DYNFIELD(mbuf, offset, rte_mbuf_timestamp_t *) = timestamp;
}
/**
* Replace MPRQ buffer.
*
* @param rxq
* Pointer to Rx queue structure.
* @param rq_idx
* RQ index to replace.
*/
static __rte_always_inline void
mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx)
{
const uint32_t strd_n = 1 << rxq->strd_num_n;
struct mlx5_mprq_buf *rep = rxq->mprq_repl;
volatile struct mlx5_wqe_data_seg *wqe =
&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
void *addr;
if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) > 1) {
MLX5_ASSERT(rep != NULL);
/* Replace MPRQ buf. */
(*rxq->mprq_bufs)[rq_idx] = rep;
/* Replace WQE. */
addr = mlx5_mprq_buf_addr(rep, strd_n);
wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
/* If there's only one MR, no need to replace LKey in WQE. */
if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
/* Stash a mbuf for next replacement. */
if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
rxq->mprq_repl = rep;
else
rxq->mprq_repl = NULL;
/* Release the old buffer. */
mlx5_mprq_buf_free(buf);
} else if (unlikely(rxq->mprq_repl == NULL)) {
struct mlx5_mprq_buf *rep;
/*
* Currently, the MPRQ mempool is out of buffer
* and doing memcpy regardless of the size of Rx
* packet. Retry allocation to get back to
* normal.
*/
if (!rte_mempool_get(rxq->mprq_mp, (void **)&rep))
rxq->mprq_repl = rep;
}
}
/**
* Attach or copy MPRQ buffer content to a packet.
*
* @param rxq
* Pointer to Rx queue structure.
* @param pkt
* Pointer to a packet to fill.
* @param len
* Packet length.
* @param buf
* Pointer to a MPRQ buffer to take the data from.
* @param strd_idx
* Stride index to start from.
* @param strd_cnt
* Number of strides to consume.
*/
static __rte_always_inline enum mlx5_rqx_code
mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
struct mlx5_mprq_buf *buf, uint16_t strd_idx, uint16_t strd_cnt)
{
const uint32_t strd_n = 1 << rxq->strd_num_n;
const uint16_t strd_sz = 1 << rxq->strd_sz_n;
const uint16_t strd_shift =
MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
const int32_t hdrm_overlap =
len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;
const uint32_t offset = strd_idx * strd_sz + strd_shift;
void *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
/*
* Memcpy packets to the target mbuf if:
* - The size of packet is smaller than mprq_max_memcpy_len.
* - Out of buffer in the Mempool for Multi-Packet RQ.
* - The packet's stride overlaps a headroom and scatter is off.
*/
if (len <= rxq->mprq_max_memcpy_len ||
rxq->mprq_repl == NULL ||
(hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
if (likely(len <=
(uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) {
rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
addr, len);
DATA_LEN(pkt) = len;
} else if (rxq->strd_scatter_en) {
struct rte_mbuf *prev = pkt;
uint32_t seg_len = RTE_MIN(len, (uint32_t)
(pkt->buf_len - RTE_PKTMBUF_HEADROOM));
uint32_t rem_len = len - seg_len;
rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
addr, seg_len);
DATA_LEN(pkt) = seg_len;
while (rem_len) {
struct rte_mbuf *next =
rte_pktmbuf_alloc(rxq->mp);
if (unlikely(next == NULL))
return MLX5_RXQ_CODE_NOMBUF;
NEXT(prev) = next;
SET_DATA_OFF(next, 0);
addr = RTE_PTR_ADD(addr, seg_len);
seg_len = RTE_MIN(rem_len, (uint32_t)
(next->buf_len - RTE_PKTMBUF_HEADROOM));
rte_memcpy
(rte_pktmbuf_mtod(next, void *),
addr, seg_len);
DATA_LEN(next) = seg_len;
rem_len -= seg_len;
prev = next;
++NB_SEGS(pkt);
}
} else {
return MLX5_RXQ_CODE_DROPPED;
}
} else {
rte_iova_t buf_iova;
struct rte_mbuf_ext_shared_info *shinfo;
uint16_t buf_len = strd_cnt * strd_sz;
void *buf_addr;
/* Increment the refcnt of the whole chunk. */
__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
__ATOMIC_RELAXED) <= strd_n + 1);
buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
/*
* MLX5 device doesn't use iova but it is necessary in a
* case where the Rx packet is transmitted via a
* different PMD.
*/
buf_iova = rte_mempool_virt2iova(buf) +
RTE_PTR_DIFF(buf_addr, buf);
shinfo = &buf->shinfos[strd_idx];
rte_mbuf_ext_refcnt_set(shinfo, 1);
/*
* EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
* attaching the stride to mbuf and more offload flags
* will be added below by calling rxq_cq_to_mbuf().
* Other fields will be overwritten.
*/
rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
buf_len, shinfo);
/* Set mbuf head-room. */
SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);
MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);
MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=
len - (hdrm_overlap > 0 ? hdrm_overlap : 0));
DATA_LEN(pkt) = len;
/*
* Copy the last fragment of a packet (up to headroom
* size bytes) in case there is a stride overlap with
* a next packet's headroom. Allocate a separate mbuf
* to store this fragment and link it. Scatter is on.
*/
if (hdrm_overlap > 0) {
MLX5_ASSERT(rxq->strd_scatter_en);
struct rte_mbuf *seg =
rte_pktmbuf_alloc(rxq->mp);
if (unlikely(seg == NULL))
return MLX5_RXQ_CODE_NOMBUF;
SET_DATA_OFF(seg, 0);
rte_memcpy(rte_pktmbuf_mtod(seg, void *),
RTE_PTR_ADD(addr, len - hdrm_overlap),
hdrm_overlap);
DATA_LEN(seg) = hdrm_overlap;
DATA_LEN(pkt) = len - hdrm_overlap;
NEXT(pkt) = seg;
NB_SEGS(pkt) = 2;
}
}
return MLX5_RXQ_CODE_EXIT;
}
#endif /* RTE_PMD_MLX5_RXTX_H_ */


@ -77,6 +77,177 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
return n;
}
/**
* Replenish buffers for RX in bulk.
*
* @param rxq
* Pointer to RX queue structure.
*/
static inline void
mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
{
const uint16_t q_n = 1 << rxq->elts_n;
const uint16_t q_mask = q_n - 1;
uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);
uint16_t elts_idx = rxq->rq_ci & q_mask;
struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
volatile struct mlx5_wqe_data_seg *wq =
&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
unsigned int i;
if (n >= rxq->rq_repl_thresh) {
MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
MLX5_VPMD_DESCS_PER_LOOP);
/* Not to cross queue end. */
n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
rxq->stats.rx_nombuf += n;
return;
}
for (i = 0; i < n; ++i) {
void *buf_addr;
/*
* In order to support the mbufs with external attached
* data buffer we should use the buf_addr pointer
* instead of rte_mbuf_buf_addr(). It touches the mbuf
* itself and may impact the performance.
*/
buf_addr = elts[i]->buf_addr;
wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
RTE_PKTMBUF_HEADROOM);
/* If there's a single MR, no need to replace LKey. */
if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)
> 1))
wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
}
rxq->rq_ci += n;
/* Prevent overflowing into consumed mbufs. */
elts_idx = rxq->rq_ci & q_mask;
for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
rte_io_wmb();
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
}
/**
* Replenish buffers for MPRQ RX in bulk.
*
* @param rxq
* Pointer to RX queue structure.
*/
static inline void
mlx5_rx_mprq_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
{
const uint16_t wqe_n = 1 << rxq->elts_n;
const uint32_t strd_n = 1 << rxq->strd_num_n;
const uint32_t elts_n = wqe_n * strd_n;
const uint32_t wqe_mask = elts_n - 1;
uint32_t n = elts_n - (rxq->elts_ci - rxq->rq_pi);
uint32_t elts_idx = rxq->elts_ci & wqe_mask;
struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
/* Not to cross queue end. */
if (n >= rxq->rq_repl_thresh) {
MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n));
MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n) >
MLX5_VPMD_DESCS_PER_LOOP);
n = RTE_MIN(n, elts_n - elts_idx);
if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
rxq->stats.rx_nombuf += n;
return;
}
rxq->elts_ci += n;
}
}
/**
* Copy or attach MPRQ buffers to RX SW ring.
*
* @param rxq
* Pointer to RX queue structure.
* @param pkts
* Pointer to array of packets to be stored.
* @param pkts_n
* Number of packets to be stored.
*
* @return
* Number of packets successfully copied/attached (<= pkts_n).
*/
static inline uint16_t
rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq,
struct rte_mbuf **pkts, uint16_t pkts_n)
{
const uint16_t wqe_n = 1 << rxq->elts_n;
const uint16_t wqe_mask = wqe_n - 1;
const uint16_t strd_sz = 1 << rxq->strd_sz_n;
const uint32_t strd_n = 1 << rxq->strd_num_n;
const uint32_t elts_n = wqe_n * strd_n;
const uint32_t elts_mask = elts_n - 1;
uint32_t elts_idx = rxq->rq_pi & elts_mask;
struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
uint32_t rq_ci = rxq->rq_ci;
struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask];
uint16_t copied = 0;
uint16_t i = 0;
for (i = 0; i < pkts_n; ++i) {
uint16_t strd_cnt;
enum mlx5_rqx_code rxq_code;
if (rxq->consumed_strd == strd_n) {
/* Replace WQE if the buffer is still in use. */
mprq_buf_replace(rxq, rq_ci & wqe_mask);
/* Advance to the next WQE. */
rxq->consumed_strd = 0;
rq_ci++;
buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask];
}
if (!elts[i]->pkt_len) {
rxq->consumed_strd = strd_n;
rte_pktmbuf_free_seg(elts[i]);
#ifdef MLX5_PMD_SOFT_COUNTERS
rxq->stats.ipackets -= 1;
#endif
continue;
}
strd_cnt = (elts[i]->pkt_len / strd_sz) +
((elts[i]->pkt_len % strd_sz) ? 1 : 0);
rxq_code = mprq_buf_to_pkt(rxq, elts[i], elts[i]->pkt_len,
buf, rxq->consumed_strd, strd_cnt);
rxq->consumed_strd += strd_cnt;
if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
rte_pktmbuf_free_seg(elts[i]);
#ifdef MLX5_PMD_SOFT_COUNTERS
rxq->stats.ipackets -= 1;
rxq->stats.ibytes -= elts[i]->pkt_len;
#endif
if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
++rxq->stats.rx_nombuf;
break;
}
if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
++rxq->stats.idropped;
continue;
}
}
pkts[copied++] = elts[i];
}
rxq->rq_pi += i;
rxq->cq_ci += i;
rte_io_wmb();
*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
if (rq_ci != rxq->rq_ci) {
rxq->rq_ci = rq_ci;
rte_io_wmb();
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
return copied;
}
/**
* Receive burst of packets. An errored completion also consumes a mbuf, but the
* packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
@ -204,7 +375,142 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
bool no_cq = false;
do {
nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err, &no_cq);
nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn,
&err, &no_cq);
if (unlikely(err | rxq->err_state))
nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
tn += nb_rx;
if (unlikely(no_cq))
break;
} while (tn != pkts_n);
return tn;
}
/**
* Receive burst of packets. An errored completion also consumes a mbuf, but the
* packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
* before returning to application.
*
* @param rxq
* Pointer to RX queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
* @param[out] err
* Pointer to a flag. Set non-zero value if pkts array has at least one error
* packet to handle.
* @param[out] no_cq
* Pointer to a boolean. Set true if no new CQE seen.
*
* @return
* Number of packets received including errors (<= pkts_n).
*/
static inline uint16_t
rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
uint16_t pkts_n, uint64_t *err, bool *no_cq)
{
const uint16_t q_n = 1 << rxq->cqe_n;
const uint16_t q_mask = q_n - 1;
const uint16_t wqe_n = 1 << rxq->elts_n;
const uint32_t strd_n = 1 << rxq->strd_num_n;
const uint32_t elts_n = wqe_n * strd_n;
const uint32_t elts_mask = elts_n - 1;
volatile struct mlx5_cqe *cq;
struct rte_mbuf **elts;
uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
uint16_t nocmp_n = 0;
uint16_t rcvd_pkt = 0;
uint16_t cp_pkt = 0;
unsigned int cq_idx = rxq->cq_ci & q_mask;
unsigned int elts_idx;
MLX5_ASSERT(rxq->sges_n == 0);
cq = &(*rxq->cqes)[cq_idx];
rte_prefetch0(cq);
rte_prefetch0(cq + 1);
rte_prefetch0(cq + 2);
rte_prefetch0(cq + 3);
pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
mlx5_rx_mprq_replenish_bulk_mbuf(rxq);
/* See if there're unreturned mbufs from compressed CQE. */
rcvd_pkt = rxq->decompressed;
if (rcvd_pkt > 0) {
rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt);
rxq->decompressed -= rcvd_pkt;
pkts += cp_pkt;
}
elts_idx = rxq->rq_pi & elts_mask;
elts = &(*rxq->elts)[elts_idx];
/* Not to overflow pkts array. */
pkts_n = RTE_ALIGN_FLOOR(pkts_n - cp_pkt, MLX5_VPMD_DESCS_PER_LOOP);
/* Not to cross queue end. */
pkts_n = RTE_MIN(pkts_n, elts_n - elts_idx);
pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
/* Not to move past the allocated mbufs. */
pkts_n = RTE_MIN(pkts_n, rxq->elts_ci - rxq->rq_pi);
if (!pkts_n) {
*no_cq = !cp_pkt;
return cp_pkt;
}
/* At this point, there shouldn't be any remaining packets. */
MLX5_ASSERT(rxq->decompressed == 0);
/* Process all the CQEs */
nocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, err, &comp_idx);
/* If no new CQE seen, return without updating cq_db. */
if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
*no_cq = true;
return cp_pkt;
}
/* Update the consumer indexes for non-compressed CQEs. */
MLX5_ASSERT(nocmp_n <= pkts_n);
cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n);
rcvd_pkt += cp_pkt;
/* Decompress the last CQE if compressed. */
if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
&elts[nocmp_n]);
/* Return more packets if needed. */
if (nocmp_n < pkts_n) {
uint16_t n = rxq->decompressed;
n = RTE_MIN(n, pkts_n - nocmp_n);
cp_pkt = rxq_copy_mprq_mbuf_v(rxq, &pkts[cp_pkt], n);
rcvd_pkt += cp_pkt;
rxq->decompressed -= n;
}
}
*no_cq = !rcvd_pkt;
return rcvd_pkt;
}
/**
* DPDK callback for vectorized MPRQ RX.
*
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
*
* @return
* Number of packets successfully received (<= pkts_n).
*/
uint16_t
mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct mlx5_rxq_data *rxq = dpdk_rxq;
uint16_t nb_rx = 0;
uint16_t tn = 0;
uint64_t err = 0;
bool no_cq = false;
do {
nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,
&err, &no_cq);
if (unlikely(err | rxq->err_state))
nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
tn += nb_rx;
@ -229,8 +535,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)
struct mlx5_rxq_ctrl *ctrl =
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))
return -ENOTSUP;
if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
return -ENOTSUP;
if (rxq->lro)
@ -257,8 +561,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)
return -ENOTSUP;
if (!priv->config.rx_vec_en)
return -ENOTSUP;
if (mlx5_mprq_enabled(dev))
return -ENOTSUP;
/* All the configured queues should support. */
for (i = 0; i < priv->rxqs_n; ++i) {
struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];


@ -12,7 +12,6 @@
#include <mlx5_prm.h>
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
/* HW checksum offload capabilities of vectorized Tx. */
@ -68,59 +67,4 @@ S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) ==
S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) ==
offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);
/**
* Replenish buffers for RX in bulk.
*
* @param rxq
* Pointer to RX queue structure.
*/
static inline void
mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
{
const uint16_t q_n = 1 << rxq->elts_n;
const uint16_t q_mask = q_n - 1;
uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);
uint16_t elts_idx = rxq->rq_ci & q_mask;
struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
volatile struct mlx5_wqe_data_seg *wq =
&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
unsigned int i;
if (n >= rxq->rq_repl_thresh) {
MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
MLX5_VPMD_DESCS_PER_LOOP);
/* Not to cross queue end. */
n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
rxq->stats.rx_nombuf += n;
return;
}
for (i = 0; i < n; ++i) {
void *buf_addr;
/*
* In order to support the mbufs with external attached
* data buffer we should use the buf_addr pointer
* instead of rte_mbuf_buf_addr(). It touches the mbuf
* itself and may impact the performance.
*/
buf_addr = elts[i]->buf_addr;
wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
RTE_PKTMBUF_HEADROOM);
/* If there's a single MR, no need to replace LKey. */
if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)
> 1))
wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
}
rxq->rq_ci += n;
/* Prevent overflowing into consumed mbufs. */
elts_idx = rxq->rq_ci & q_mask;
for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
rte_io_wmb();
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
}
#endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */