From 6cb559d67b83045e99c3c6d7de423960076ce207 Mon Sep 17 00:00:00 2001 From: Yongseok Koh Date: Thu, 6 Jul 2017 11:41:10 -0700 Subject: [PATCH] net/mlx5: add vectorized Rx/Tx burst for x86 The vectorized burst routines are only available on the x86_64 architecture. If all the conditions are met, the vectorized burst functions are enabled automatically. The decision is made independently for RX and TX. There's no PMD option to make a selection. Signed-off-by: Yongseok Koh Acked-by: Nelio Laranjeiro --- drivers/net/mlx5/Makefile | 3 + drivers/net/mlx5/mlx5_defs.h | 18 + drivers/net/mlx5/mlx5_ethdev.c | 24 +- drivers/net/mlx5/mlx5_rxq.c | 55 +- drivers/net/mlx5/mlx5_rxtx.c | 387 ++----- drivers/net/mlx5/mlx5_rxtx.h | 282 +++++- drivers/net/mlx5/mlx5_rxtx_vec_sse.c | 1394 ++++++++++++++++++++++++++ drivers/net/mlx5/mlx5_txq.c | 2 +- 8 files changed, 1868 insertions(+), 297 deletions(-) create mode 100644 drivers/net/mlx5/mlx5_rxtx_vec_sse.c diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 8b8f6eaac6..8736de5d30 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -39,6 +39,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c +ifeq ($(CONFIG_RTE_ARCH_X86_64),y) +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec_sse.c +endif SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_trigger.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mac.c diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index 201bb33627..a76bc6f655 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -89,4 +89,22 @@ /* Maximum Packet headers size (L2+L3+L4) for TSO. */ #define MLX5_MAX_TSO_HEADER 128 +/* Default minimum number of Tx queues for vectorized Tx. 
*/ +#define MLX5_VPMD_MIN_TXQS 4 + +/* Threshold of buffer replenishment for vectorized Rx. */ +#define MLX5_VPMD_RXQ_RPLNSH_THRESH 64U + +/* Maximum size of burst for vectorized Rx. */ +#define MLX5_VPMD_RX_MAX_BURST MLX5_VPMD_RXQ_RPLNSH_THRESH + +/* + * Maximum size of burst for vectorized Tx. This is related to the maximum size + * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW. + */ +#define MLX5_VPMD_TX_MAX_BURST 32U + +/* Number of packets vectorized Rx can simultaneously process in a loop. */ +#define MLX5_VPMD_DESCS_PER_LOOP 4 + #endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 54c4c6caea..b70b7b9a9d 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -723,7 +723,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) }; - if (dev->rx_pkt_burst == mlx5_rx_burst) + if (dev->rx_pkt_burst == mlx5_rx_burst || + dev->rx_pkt_burst == mlx5_rx_burst_vec) return ptypes; return NULL; } @@ -1585,9 +1586,16 @@ priv_select_tx_function(struct priv *priv) priv->dev->tx_pkt_burst = mlx5_tx_burst; /* Select appropriate TX function. 
*/ if (priv->mps == MLX5_MPW_ENHANCED) { - priv->dev->tx_pkt_burst = - mlx5_tx_burst_empw; - DEBUG("selected Enhanced MPW TX function"); + if (priv_check_vec_tx_support(priv) > 0) { + if (priv_check_raw_vec_tx_support(priv) > 0) + priv->dev->tx_pkt_burst = mlx5_tx_burst_raw_vec; + else + priv->dev->tx_pkt_burst = mlx5_tx_burst_vec; + DEBUG("selected Enhanced MPW TX vectorized function"); + } else { + priv->dev->tx_pkt_burst = mlx5_tx_burst_empw; + DEBUG("selected Enhanced MPW TX function"); + } } else if (priv->mps && priv->txq_inline) { priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline; DEBUG("selected MPW inline TX function"); @@ -1606,5 +1614,11 @@ priv_select_tx_function(struct priv *priv) void priv_select_rx_function(struct priv *priv) { - priv->dev->rx_pkt_burst = mlx5_rx_burst; + if (priv_check_vec_rx_support(priv) > 0) { + priv_prep_vec_rx_function(priv); + priv->dev->rx_pkt_burst = mlx5_rx_burst_vec; + DEBUG("selected RX vectorized function"); + } else { + priv->dev->rx_pkt_burst = mlx5_rx_burst; + } } diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 98b20eb92b..34ec95bc57 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -633,6 +633,41 @@ priv_rehash_flows(struct priv *priv) return 0; } +/** + * Unlike regular Rx function, vPMD Rx doesn't replace mbufs immediately when + * receiving packets. Instead it replaces later in bulk. In rxq->elts[], entries + * from rq_pi to rq_ci are owned by device but the rest is already delivered to + * application. In order not to reuse those mbufs by rxq_alloc_elts(), this + * function must be called to replace used mbufs. + * + * @param rxq + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. 
+ */ +static int +rxq_trim_elts(struct rxq *rxq) +{ + const uint16_t q_n = (1 << rxq->elts_n); + const uint16_t q_mask = q_n - 1; + uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); + uint16_t i; + + if (!rxq->trim_elts) + return 0; + for (i = 0; i < used; ++i) { + struct rte_mbuf *buf; + buf = rte_pktmbuf_alloc(rxq->mp); + if (!buf) + return ENOMEM; + (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = buf; + } + rxq->rq_pi = rxq->rq_ci; + rxq->trim_elts = 0; + return 0; +} + /** * Allocate RX queue elements. * @@ -800,6 +835,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl) return err; } /* Snatch mbufs from original queue. */ + claim_zero(rxq_trim_elts(&rxq_ctrl->rxq)); claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts)); for (i = 0; i != elts_n; ++i) { struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i]; @@ -860,6 +896,7 @@ rxq_setup(struct rxq_ctrl *tmpl) tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt); tmpl->rxq.cq_ci = 0; tmpl->rxq.rq_ci = 0; + tmpl->rxq.rq_pi = 0; tmpl->rxq.cq_db = cq_info.dbrec; tmpl->rxq.wqes = (volatile struct mlx5_wqe_data_seg (*)[]) @@ -993,7 +1030,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, if (priv->cqe_comp) { attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS; attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE; - cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */ + /* + * For vectorized Rx, it must not be doubled in order to + * make cq_ci and rq_ci aligned. + */ + if (rxq_check_vec_support(&tmpl.rxq) < 0) + cqe_n = (desc * 2) - 1; /* Double the number of CQEs. 
*/ } tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0, &attr.cq); @@ -1103,7 +1145,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, if (rxq_ctrl->rxq.elts_n) { assert(1 << rxq_ctrl->rxq.elts_n == desc); assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts); - ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts); + ret = rxq_trim_elts(&rxq_ctrl->rxq); + if (!ret) + ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts); } else ret = rxq_alloc_elts(&tmpl, desc, NULL); if (ret) { @@ -1165,6 +1209,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct priv *priv = dev->data->dev_private; struct rxq *rxq = (*priv->rxqs)[idx]; struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + const uint16_t desc_pad = MLX5_VPMD_DESCS_PER_LOOP; /* For vPMD. */ int ret; if (mlx5_is_secondary()) @@ -1198,7 +1243,8 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, if (rxq_ctrl->rxq.elts_n != log2above(desc)) { rxq_ctrl = rte_realloc(rxq_ctrl, sizeof(*rxq_ctrl) + - desc * sizeof(struct rte_mbuf *), + (desc + desc_pad) * + sizeof(struct rte_mbuf *), RTE_CACHE_LINE_SIZE); if (!rxq_ctrl) { ERROR("%p: unable to reallocate queue index %u", @@ -1209,7 +1255,8 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, } } else { rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) + - desc * sizeof(struct rte_mbuf *), + (desc + desc_pad) * + sizeof(struct rte_mbuf *), 0, socket); if (rxq_ctrl == NULL) { ERROR("%p: unable to allocate queue index %u", diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 688ee9028a..ab6df19eb0 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -69,19 +69,6 @@ #include "mlx5_defs.h" #include "mlx5_prm.h" -static __rte_always_inline int -check_cqe(volatile struct mlx5_cqe *cqe, - unsigned int cqes_n, const uint16_t ci); - -static __rte_always_inline void -txq_complete(struct txq *txq); 
- -static __rte_always_inline uint32_t -txq_mb2mr(struct txq *txq, struct rte_mbuf *mb); - -static __rte_always_inline void -mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe); - static __rte_always_inline uint32_t rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe); @@ -92,101 +79,29 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, static __rte_always_inline uint32_t rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe); -#ifndef NDEBUG - -/** - * Verify or set magic value in CQE. - * - * @param cqe - * Pointer to CQE. - * - * @return - * 0 the first time. +/* + * The index to the array should have: + * bit[1:0] = l3_hdr_type, bit[2] = tunneled, bit[3] = outer_l3_type */ -static inline int -check_cqe_seen(volatile struct mlx5_cqe *cqe) -{ - static const uint8_t magic[] = "seen"; - volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; - int ret = 1; - unsigned int i; - - for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) - if (!ret || (*buf)[i] != magic[i]) { - ret = 0; - (*buf)[i] = magic[i]; - } - return ret; -} - -#endif /* NDEBUG */ - -/** - * Check whether CQE is valid. - * - * @param cqe - * Pointer to CQE. - * @param cqes_n - * Size of completion queue. - * @param ci - * Consumer index. - * - * @return - * 0 on success, 1 on failure. - */ -static inline int -check_cqe(volatile struct mlx5_cqe *cqe, - unsigned int cqes_n, const uint16_t ci) -{ - uint16_t idx = ci & cqes_n; - uint8_t op_own = cqe->op_own; - uint8_t op_owner = MLX5_CQE_OWNER(op_own); - uint8_t op_code = MLX5_CQE_OPCODE(op_own); - - if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) - return 1; /* No CQE. 
*/ -#ifndef NDEBUG - if ((op_code == MLX5_CQE_RESP_ERR) || - (op_code == MLX5_CQE_REQ_ERR)) { - volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; - uint8_t syndrome = err_cqe->syndrome; - - if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || - (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) - return 0; - if (!check_cqe_seen(cqe)) - ERROR("unexpected CQE error %u (0x%02x)" - " syndrome 0x%02x", - op_code, op_code, syndrome); - return 1; - } else if ((op_code != MLX5_CQE_RESP_SEND) && - (op_code != MLX5_CQE_REQ)) { - if (!check_cqe_seen(cqe)) - ERROR("unexpected CQE opcode %u (0x%02x)", - op_code, op_code); - return 1; - } -#endif /* NDEBUG */ - return 0; -} - -/** - * Return the address of the WQE. - * - * @param txq - * Pointer to TX queue structure. - * @param wqe_ci - * WQE consumer index. - * - * @return - * WQE address. - */ -static inline uintptr_t * -tx_mlx5_wqe(struct txq *txq, uint16_t ci) -{ - ci &= ((1 << txq->wqe_n) - 1); - return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); -} +const uint32_t mlx5_ptype_table[] = { + RTE_PTYPE_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, /* b0001 */ + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, /* b0010 */ + RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, /* b0101 */ + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, /* b0110 */ + RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, /* b1001 */ + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, /* b1010 */ + RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, /* b1101 */ + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, /* b1110 */ + RTE_PTYPE_ALL_MASK /* b1111 */ +}; /** * Return the size of tailroom of WQ. @@ -244,174 +159,6 @@ mlx5_copy_to_wq(void *dst, const void *src, size_t n, return ret; } -/** - * Manage TX completions. 
- * - * When sending a burst, mlx5_tx_burst() posts several WRs. - * - * @param txq - * Pointer to TX queue structure. - */ -static inline void -txq_complete(struct txq *txq) -{ - const uint16_t elts_n = 1 << txq->elts_n; - const uint16_t elts_m = elts_n - 1; - const unsigned int cqe_n = 1 << txq->cqe_n; - const unsigned int cqe_cnt = cqe_n - 1; - uint16_t elts_free = txq->elts_tail; - uint16_t elts_tail; - uint16_t cq_ci = txq->cq_ci; - volatile struct mlx5_cqe *cqe = NULL; - volatile struct mlx5_wqe_ctrl *ctrl; - struct rte_mbuf *m, *free[elts_n]; - struct rte_mempool *pool = NULL; - unsigned int blk_n = 0; - - do { - volatile struct mlx5_cqe *tmp; - - tmp = &(*txq->cqes)[cq_ci & cqe_cnt]; - if (check_cqe(tmp, cqe_n, cq_ci)) - break; - cqe = tmp; -#ifndef NDEBUG - if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { - if (!check_cqe_seen(cqe)) - ERROR("unexpected compressed CQE, TX stopped"); - return; - } - if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || - (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { - if (!check_cqe_seen(cqe)) - ERROR("unexpected error CQE, TX stopped"); - return; - } -#endif /* NDEBUG */ - ++cq_ci; - } while (1); - if (unlikely(cqe == NULL)) - return; - txq->wqe_pi = ntohs(cqe->wqe_counter); - ctrl = (volatile struct mlx5_wqe_ctrl *) - tx_mlx5_wqe(txq, txq->wqe_pi); - elts_tail = ctrl->ctrl3; - assert((elts_tail & elts_m) < (1 << txq->wqe_n)); - /* Free buffers. */ - while (elts_free != elts_tail) { - m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]); - if (likely(m != NULL)) { - if (likely(m->pool == pool)) { - free[blk_n++] = m; - } else { - if (likely(pool != NULL)) - rte_mempool_put_bulk(pool, - (void *)free, - blk_n); - free[0] = m; - pool = m->pool; - blk_n = 1; - } - } - } - if (blk_n) - rte_mempool_put_bulk(pool, (void *)free, blk_n); -#ifndef NDEBUG - elts_free = txq->elts_tail; - /* Poisoning. 
*/ - while (elts_free != elts_tail) { - memset(&(*txq->elts)[elts_free & elts_m], - 0x66, - sizeof((*txq->elts)[elts_free & elts_m])); - ++elts_free; - } -#endif - txq->cq_ci = cq_ci; - txq->elts_tail = elts_tail; - /* Update the consumer index. */ - rte_wmb(); - *txq->cq_db = htonl(cq_ci); -} - -/** - * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. - * - * @param buf - * Pointer to mbuf. - * - * @return - * Memory pool where data is located for given mbuf. - */ -static struct rte_mempool * -txq_mb2mp(struct rte_mbuf *buf) -{ - if (unlikely(RTE_MBUF_INDIRECT(buf))) - return rte_mbuf_from_indirect(buf)->pool; - return buf->pool; -} - -/** - * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[]. - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, - * remove an entry first. - * - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * - * @return - * mr->lkey on success, (uint32_t)-1 on failure. - */ -static inline uint32_t -txq_mb2mr(struct txq *txq, struct rte_mbuf *mb) -{ - uint16_t i = txq->mr_cache_idx; - uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t); - - assert(i < RTE_DIM(txq->mp2mr)); - if (likely(txq->mp2mr[i].start <= addr && txq->mp2mr[i].end >= addr)) - return txq->mp2mr[i].lkey; - for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { - if (unlikely(txq->mp2mr[i].mr == NULL)) { - /* Unknown MP, add a new MR for it. */ - break; - } - if (txq->mp2mr[i].start <= addr && - txq->mp2mr[i].end >= addr) { - assert(txq->mp2mr[i].lkey != (uint32_t)-1); - assert(htonl(txq->mp2mr[i].mr->lkey) == - txq->mp2mr[i].lkey); - txq->mr_cache_idx = i; - return txq->mp2mr[i].lkey; - } - } - txq->mr_cache_idx = 0; - return txq_mp2mr_reg(txq, txq_mb2mp(mb), i); -} - -/** - * Ring TX queue doorbell. - * - * @param txq - * Pointer to TX queue structure. 
- * @param wqe - * Pointer to the last WQE posted in the NIC. - */ -static inline void -mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) -{ - uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); - volatile uint64_t *src = ((volatile uint64_t *)wqe); - - rte_wmb(); - *txq->qp_db = htonl(txq->wqe_ci); - /* Ensure ordering between DB record and BF copy. */ - rte_wmb(); - *dst = *src; -} - /** * DPDK callback to check the status of a tx descriptor. * @@ -429,7 +176,7 @@ mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) struct txq *txq = tx_queue; uint16_t used; - txq_complete(txq); + mlx5_tx_complete(txq); used = txq->elts_head - txq->elts_tail; if (offset < used) return RTE_ETH_TX_DESC_FULL; @@ -525,7 +272,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Prefetch first packet cacheline. */ rte_prefetch0(*pkts); /* Start processing. */ - txq_complete(txq); + mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) @@ -773,7 +520,7 @@ use_dseg: naddr = htonll(addr); *dseg = (rte_v128u32_t){ htonl(length), - txq_mb2mr(txq, buf), + mlx5_tx_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -812,7 +559,7 @@ next_seg: naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); *dseg = (rte_v128u32_t){ htonl(length), - txq_mb2mr(txq, buf), + mlx5_tx_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -991,7 +738,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); /* Start processing. 
*/ - txq_complete(txq); + mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) @@ -1054,7 +801,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mb2mr(txq, buf), + .lkey = mlx5_tx_mb2mr(txq, buf), .addr = htonll(addr), }; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) @@ -1214,7 +961,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); /* Start processing. */ - txq_complete(txq); + mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); do { struct rte_mbuf *buf = *(pkts++); @@ -1300,7 +1047,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mb2mr(txq, buf), + .lkey = mlx5_tx_mb2mr(txq, buf), .addr = htonll(addr), }; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) @@ -1495,7 +1242,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (unlikely(!pkts_n)) return 0; /* Start processing. */ - txq_complete(txq); + mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); /* A CQE slot must always be available. 
*/ assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); @@ -1607,7 +1354,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mb2mr(txq, buf), + .lkey = mlx5_tx_mb2mr(txq, buf), .addr = htonll(addr), }; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) @@ -1690,7 +1437,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) naddr = htonll(addr); *dseg = (rte_v128u32_t) { htonl(length), - txq_mb2mr(txq, buf), + mlx5_tx_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -2138,3 +1885,71 @@ removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) (void)pkts_n; return 0; } + +/* + * Vectorized Rx/Tx routines are not compiled in when required vector + * instructions are not supported on a target architecture. The following null + * stubs are needed for linkage when those are not included outside of this file + * (e.g. mlx5_rxtx_vec_sse.c for x86). 
+ */ + +uint16_t __attribute__((weak)) +mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_txq; + (void)pkts; + (void)pkts_n; + return 0; +} + +uint16_t __attribute__((weak)) +mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_txq; + (void)pkts; + (void)pkts_n; + return 0; +} + +uint16_t __attribute__((weak)) +mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_rxq; + (void)pkts; + (void)pkts_n; + return 0; +} + +int __attribute__((weak)) +priv_check_raw_vec_tx_support(struct priv *priv) +{ + (void)priv; + return -ENOTSUP; +} + +int __attribute__((weak)) +priv_check_vec_tx_support(struct priv *priv) +{ + (void)priv; + return -ENOTSUP; +} + +int __attribute__((weak)) +rxq_check_vec_support(struct rxq *rxq) +{ + (void)rxq; + return -ENOTSUP; +} + +int __attribute__((weak)) +priv_check_vec_rx_support(struct priv *priv) +{ + (void)priv; + return -ENOTSUP; +} + +void __attribute__((weak)) +priv_prep_vec_rx_function(struct priv *priv) +{ + (void)priv; +} diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index d0f508e903..534aaeb467 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -115,10 +115,13 @@ struct rxq { unsigned int port_id:8; unsigned int rss_hash:1; /* RSS hash result is enabled. */ unsigned int mark:1; /* Marked flow available on the queue. */ - unsigned int :8; /* Remaining bits. */ + unsigned int pending_err:1; /* CQE error needs to be handled. */ + unsigned int trim_elts:1; /* Whether elts needs clean-up. */ + unsigned int :6; /* Remaining bits. 
*/ volatile uint32_t *rq_db; volatile uint32_t *cq_db; uint16_t rq_ci; + uint16_t rq_pi; uint16_t cq_ci; volatile struct mlx5_wqe_data_seg(*wqes)[]; volatile struct mlx5_cqe(*cqes)[]; @@ -126,6 +129,8 @@ struct rxq { struct rte_mbuf *(*elts)[]; struct rte_mempool *mp; struct mlx5_rxq_stats stats; + uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */ + struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */ } __rte_cache_aligned; /* RX queue control descriptor. */ @@ -261,6 +266,7 @@ struct txq { uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ uint16_t inline_max_packet_sz; /* Max packet size for inlining. */ uint32_t qp_num_8s; /* QP number shifted by 8. */ + uint32_t flags; /* Flags for Tx Queue. */ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ volatile void *wqes; /* Work queue (use volatile to write into). */ volatile uint32_t *qp_db; /* Work queue doorbell. */ @@ -328,6 +334,8 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); /* mlx5_rxtx.c */ +extern const uint32_t mlx5_ptype_table[]; + uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); @@ -338,10 +346,282 @@ uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); int mlx5_rx_descriptor_status(void *, uint16_t); int mlx5_tx_descriptor_status(void *, uint16_t); +/* Vectorized version of mlx5_rxtx.c */ +int priv_check_raw_vec_tx_support(struct priv *); +int priv_check_vec_tx_support(struct priv *); +int rxq_check_vec_support(struct rxq *); +int priv_check_vec_rx_support(struct priv *); +void priv_prep_vec_rx_function(struct priv *); +uint16_t mlx5_tx_burst_raw_vec(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_tx_burst_vec(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, uint16_t); + /* mlx5_mr.c */ struct 
ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *); void txq_mp2mr_iter(struct rte_mempool *, void *); uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int); +#ifndef NDEBUG +/** + * Verify or set magic value in CQE. + * + * @param cqe + * Pointer to CQE. + * + * @return + * 0 the first time. + */ +static inline int +check_cqe_seen(volatile struct mlx5_cqe *cqe) +{ + static const uint8_t magic[] = "seen"; + volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; + int ret = 1; + unsigned int i; + + for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) + if (!ret || (*buf)[i] != magic[i]) { + ret = 0; + (*buf)[i] = magic[i]; + } + return ret; +} +#endif /* NDEBUG */ + +/** + * Check whether CQE is valid. + * + * @param cqe + * Pointer to CQE. + * @param cqes_n + * Size of completion queue. + * @param ci + * Consumer index. + * + * @return + * 0 on success, 1 on failure. + */ +static __rte_always_inline int +check_cqe(volatile struct mlx5_cqe *cqe, + unsigned int cqes_n, const uint16_t ci) +{ + uint16_t idx = ci & cqes_n; + uint8_t op_own = cqe->op_own; + uint8_t op_owner = MLX5_CQE_OWNER(op_own); + uint8_t op_code = MLX5_CQE_OPCODE(op_own); + + if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) + return 1; /* No CQE. 
*/ +#ifndef NDEBUG + if ((op_code == MLX5_CQE_RESP_ERR) || + (op_code == MLX5_CQE_REQ_ERR)) { + volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; + uint8_t syndrome = err_cqe->syndrome; + + if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || + (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) + return 0; + if (!check_cqe_seen(cqe)) + ERROR("unexpected CQE error %u (0x%02x)" + " syndrome 0x%02x", + op_code, op_code, syndrome); + return 1; + } else if ((op_code != MLX5_CQE_RESP_SEND) && + (op_code != MLX5_CQE_REQ)) { + if (!check_cqe_seen(cqe)) + ERROR("unexpected CQE opcode %u (0x%02x)", + op_code, op_code); + return 1; + } +#endif /* NDEBUG */ + return 0; +} + +/** + * Return the address of the WQE. + * + * @param txq + * Pointer to TX queue structure. + * @param ci + * WQE consumer index. + * + * @return + * WQE address. + */ +static inline uintptr_t * +tx_mlx5_wqe(struct txq *txq, uint16_t ci) +{ + ci &= ((1 << txq->wqe_n) - 1); + return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); +} + +/** + * Manage TX completions. + * + * When sending a burst, mlx5_tx_burst() posts several WRs. + * + * @param txq + * Pointer to TX queue structure. 
+ */ +static __rte_always_inline void +mlx5_tx_complete(struct txq *txq) +{ + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + const unsigned int cqe_n = 1 << txq->cqe_n; + const unsigned int cqe_cnt = cqe_n - 1; + uint16_t elts_free = txq->elts_tail; + uint16_t elts_tail; + uint16_t cq_ci = txq->cq_ci; + volatile struct mlx5_cqe *cqe = NULL; + volatile struct mlx5_wqe_ctrl *ctrl; + struct rte_mbuf *m, *free[elts_n]; + struct rte_mempool *pool = NULL; + unsigned int blk_n = 0; + + do { + volatile struct mlx5_cqe *tmp; + + tmp = &(*txq->cqes)[cq_ci & cqe_cnt]; + if (check_cqe(tmp, cqe_n, cq_ci)) + break; + cqe = tmp; +#ifndef NDEBUG + if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { + if (!check_cqe_seen(cqe)) + ERROR("unexpected compressed CQE, TX stopped"); + return; + } + if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || + (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { + if (!check_cqe_seen(cqe)) + ERROR("unexpected error CQE, TX stopped"); + return; + } +#endif /* NDEBUG */ + ++cq_ci; + } while (1); + if (unlikely(cqe == NULL)) + return; + txq->wqe_pi = ntohs(cqe->wqe_counter); + ctrl = (volatile struct mlx5_wqe_ctrl *) + tx_mlx5_wqe(txq, txq->wqe_pi); + elts_tail = ctrl->ctrl3; + assert((elts_tail & elts_m) < (1 << txq->wqe_n)); + /* Free buffers. */ + while (elts_free != elts_tail) { + m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]); + if (likely(m != NULL)) { + if (likely(m->pool == pool)) { + free[blk_n++] = m; + } else { + if (likely(pool != NULL)) + rte_mempool_put_bulk(pool, + (void *)free, + blk_n); + free[0] = m; + pool = m->pool; + blk_n = 1; + } + } + } + if (blk_n) + rte_mempool_put_bulk(pool, (void *)free, blk_n); +#ifndef NDEBUG + elts_free = txq->elts_tail; + /* Poisoning. 
*/ + while (elts_free != elts_tail) { + memset(&(*txq->elts)[elts_free & elts_m], + 0x66, + sizeof((*txq->elts)[elts_free & elts_m])); + ++elts_free; + } +#endif + txq->cq_ci = cq_ci; + txq->elts_tail = elts_tail; + /* Update the consumer index. */ + rte_wmb(); + *txq->cq_db = htonl(cq_ci); +} + +/** + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which + * the cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static struct rte_mempool * +mlx5_tx_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_INDIRECT(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +/** + * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[]. + * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, + * remove an entry first. + * + * @param txq + * Pointer to TX queue structure. + * @param[in] mb + * Mbuf for which a Memory Region lkey must be returned. + * + * @return + * mr->lkey on success, (uint32_t)-1 on failure. + */ +static __rte_always_inline uint32_t +mlx5_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb) +{ + uint16_t i = txq->mr_cache_idx; + uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t); + + assert(i < RTE_DIM(txq->mp2mr)); + if (likely(txq->mp2mr[i].start <= addr && txq->mp2mr[i].end >= addr)) + return txq->mp2mr[i].lkey; + for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { + if (unlikely(txq->mp2mr[i].mr == NULL)) { + /* Unknown MP, add a new MR for it. */ + break; + } + if (txq->mp2mr[i].start <= addr && + txq->mp2mr[i].end >= addr) { + assert(txq->mp2mr[i].lkey != (uint32_t)-1); + assert(htonl(txq->mp2mr[i].mr->lkey) == + txq->mp2mr[i].lkey); + txq->mr_cache_idx = i; + return txq->mp2mr[i].lkey; + } + } + txq->mr_cache_idx = 0; + return txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i); +} + +/** + * Ring TX queue doorbell. + * + * @param txq + * Pointer to TX queue structure. 
+ * @param wqe + * Pointer to the last WQE posted in the NIC. + */ +static __rte_always_inline void +mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) +{ + uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); + volatile uint64_t *src = ((volatile uint64_t *)wqe); + + rte_wmb(); + *txq->qp_db = htonl(txq->wqe_ci); + /* Ensure ordering between DB record and BF copy. */ + rte_wmb(); + *dst = *src; +} + #endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c new file mode 100644 index 0000000000..95e96baa23 --- /dev/null +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c @@ -0,0 +1,1394 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include +#include +#include +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +/* DPDK headers don't like -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include +#include +#include +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" +#include "mlx5_defs.h" +#include "mlx5_prm.h" + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + +/** + * Fill in buffer descriptors in a multi-packet send descriptor. + * + * @param txq + * Pointer to TX queue structure. + * @param dseg + * Pointer to buffer descriptor to be written. + * @param pkts + * Pointer to array of packets to be sent. + * @param n + * Number of packets to be filled. 
+ */ +static inline void +txq_wr_dseg_v(struct txq *txq, __m128i *dseg, + struct rte_mbuf **pkts, unsigned int n) +{ + unsigned int pos; + uintptr_t addr; + const __m128i shuf_mask_dseg = + _mm_set_epi8(8, 9, 10, 11, /* addr, bswap64 */ + 12, 13, 14, 15, + 7, 6, 5, 4, /* lkey */ + 0, 1, 2, 3 /* length, bswap32 */); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t tx_byte = 0; +#endif + + for (pos = 0; pos < n; ++pos, ++dseg) { + __m128i desc; + struct rte_mbuf *pkt = pkts[pos]; + + addr = rte_pktmbuf_mtod(pkt, uintptr_t); + desc = _mm_set_epi32(addr >> 32, + addr, + mlx5_tx_mb2mr(txq, pkt), + DATA_LEN(pkt)); + desc = _mm_shuffle_epi8(desc, shuf_mask_dseg); + _mm_store_si128(dseg, desc); +#ifdef MLX5_PMD_SOFT_COUNTERS + tx_byte += DATA_LEN(pkt); +#endif + } +#ifdef MLX5_PMD_SOFT_COUNTERS + txq->stats.obytes += tx_byte; +#endif +} + +/** + * Count the number of continuous single segment packets. The first packet must + * be a single segment packet. + * + * @param pkts + * Pointer to array of packets. + * @param pkts_n + * Number of packets. + * + * @return + * Number of continuous single segment packets. + */ +static inline unsigned int +txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n) +{ + unsigned int pos; + + if (!pkts_n) + return 0; + assert(NB_SEGS(pkts[0]) == 1); + /* Count the number of continuous single segment packets. */ + for (pos = 1; pos < pkts_n; ++pos) + if (NB_SEGS(pkts[pos]) > 1) + break; + return pos; +} + +/** + * Count the number of packets having same ol_flags and calculate cs_flags. + * + * @param txq + * Pointer to TX queue structure. + * @param pkts + * Pointer to array of packets. + * @param pkts_n + * Number of packets. + * @param cs_flags + * Pointer of flags to be returned. + * + * @return + * Number of packets having same ol_flags. 
+ */
+static inline unsigned int
+txq_calc_offload(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
+		 uint8_t *cs_flags)
+{
+	/* Flags asking for any checksum offload. */
+	const uint64_t csum_req =
+		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM;
+	/* Flags marking a tunneled packet. */
+	const uint64_t tunnel_req = PKT_TX_TUNNEL_GRE | PKT_TX_TUNNEL_VXLAN;
+	/* Packets sharing a session must agree on all of these bits. */
+	const uint64_t cmp_mask =
+		csum_req | tunnel_req | PKT_TX_OUTER_IP_CKSUM;
+	uint64_t head_flags;
+	unsigned int cnt = 1;
+
+	if (pkts_n == 0)
+		return 0;
+	head_flags = pkts[0]->ol_flags;
+	/* Extend the run while offload flags match the first packet. */
+	while (cnt < pkts_n &&
+	       !((pkts[cnt]->ol_flags ^ head_flags) & cmp_mask))
+		++cnt;
+	/* No checksum requested: leave *cs_flags as the caller set it. */
+	if (!(head_flags & csum_req))
+		return cnt;
+	if ((head_flags & tunnel_req) && txq->tunnel_en) {
+		uint8_t flags = MLX5_ETH_WQE_L3_INNER_CSUM |
+				MLX5_ETH_WQE_L4_INNER_CSUM;
+
+		if (head_flags & PKT_TX_OUTER_IP_CKSUM)
+			flags |= MLX5_ETH_WQE_L3_CSUM;
+		*cs_flags = flags;
+	} else {
+		*cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+	}
+	/* The rest of the array needs another MPW session. */
+	return cnt;
+}
+
+/**
+ * Send multi-segmented packets until it encounters a single segment packet in
+ * the pkts list.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be sent.
+ * @param pkts_n
+ *   Number of packets to be sent.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static uint16_t
+txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	uint16_t elts_head = txq->elts_head;
+	const uint16_t elts_n = 1 << txq->elts_n;
+	const uint16_t elts_m = elts_n - 1;
+	const uint16_t wq_n = 1 << txq->wqe_n;
+	const uint16_t wq_mask = wq_n - 1;
+	const unsigned int nb_dword_per_wqebb =
+		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
+	const unsigned int nb_dword_in_hdr =
+		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
+	unsigned int n;
+	volatile struct mlx5_wqe *wqe = NULL; /* Last WQE written, if any. */
+
+	assert(elts_n > pkts_n);
+	mlx5_tx_complete(txq);
+	if (unlikely(!pkts_n))
+		return 0;
+	for (n = 0; n < pkts_n; ++n) {
+		struct rte_mbuf *buf = pkts[n];
+		unsigned int segs_n = buf->nb_segs;
+		unsigned int ds = nb_dword_in_hdr; /* Running dword count. */
+		unsigned int len = PKT_LEN(buf);
+		uint16_t wqe_ci = txq->wqe_ci;
+		const __m128i shuf_mask_ctrl =
+			_mm_set_epi8(15, 14, 13, 12, /* left in host order */
+				     8, 9, 10, 11, /* bswap32 */
+				     4, 5, 6, 7, /* bswap32 */
+				     0, 1, 2, 3 /* bswap32 */);
+		uint8_t cs_flags = 0;
+		uint16_t max_elts;
+		uint16_t max_wqe;
+		__m128i *t_wqe, *dseg;
+		__m128i ctrl;
+
+		assert(segs_n);
+		max_elts = elts_n - (elts_head - txq->elts_tail);
+		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
+		/*
+		 * A MPW session consumes 2 WQEs at most to
+		 * include MLX5_MPW_DSEG_MAX pointers.
+		 *
+		 * Stop at the first single segment packet, or as soon as the
+		 * elts ring or the WQ lacks room for this packet. The packet
+		 * that caused the stop is left untouched for the caller.
+		 */
+		if (segs_n == 1 ||
+		    max_elts < segs_n || max_wqe < 2)
+			break;
+		wqe = &((volatile struct mlx5_wqe64 *)
+			txq->wqes)[wqe_ci & wq_mask].hdr;
+		if (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+			const uint64_t is_tunneled = buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN);
+
+			if (is_tunneled && txq->tunnel_en) {
+				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
+					   MLX5_ETH_WQE_L4_INNER_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
+			} else {
+				cs_flags = MLX5_ETH_WQE_L3_CSUM |
+					   MLX5_ETH_WQE_L4_CSUM;
+			}
+		}
+		/*
+		 * Title WQEBB pointer. The header (CTRL and ESEG) is written
+		 * there after all data segments, once ds is final.
+		 */
+		t_wqe = (__m128i *)wqe;
+		dseg = (__m128i *)(wqe + 1);
+		do {
+			/* Move to the next WQEBB once the current one is full. */
+			if (!(ds++ % nb_dword_per_wqebb)) {
+				dseg = (__m128i *)
+					&((volatile struct mlx5_wqe64 *)
+					  txq->wqes)[++wqe_ci & wq_mask];
+			}
+			txq_wr_dseg_v(txq, dseg++, &buf, 1);
+			/* Every segment mbuf takes its own elts entry. */
+			(*txq->elts)[elts_head++ & elts_m] = buf;
+			buf = buf->next;
+		} while (--segs_n);
+		if (ds % nb_dword_per_wqebb) /* Partially filled WQEBB. */
+			++wqe_ci;
+		/* Fill CTRL in the header. */
+		ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds,
+				     MLX5_OPC_MOD_MPW << 24 |
+				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
+		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
+		_mm_store_si128(t_wqe, ctrl);
+		/* Fill ESEG in the header. */
+		_mm_store_si128(t_wqe + 1,
+				_mm_set_epi16(0, 0, 0, 0,
+					      htons(len), cs_flags,
+					      0, 0));
+		txq->wqe_ci = wqe_ci;
+	}
+	if (!n) /* Nothing was written; wqe is still NULL here. */
+		return 0;
+	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
+	txq->elts_head = elts_head;
+	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
+		/*
+		 * NOTE(review): 8 is the same value txq_burst_v() stores as
+		 * comp_req under "Request a completion" -- confirm against
+		 * the WQE CTRL layout.
+		 */
+		wqe->ctrl[2] = htonl(8);
+		wqe->ctrl[3] = txq->elts_head;
+		txq->elts_comp = 0;
+		++txq->cq_pi;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	txq->stats.opackets += n;
+#endif
+	mlx5_tx_dbrec(txq, wqe);
+	return n;
+}
+
+/**
+ * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
+ * it returns to make it processed by txq_scatter_v(). All the packets in
+ * the pkts list should be single segment packets having same offload flags.
+ * This must be checked by txq_check_multiseg() and txq_calc_offload().
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be sent.
+ * @param pkts_n
+ *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
+ * @param cs_flags
+ *   Checksum offload flags to be written in the descriptor.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static inline uint16_t
+txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
+	    uint8_t cs_flags)
+{
+	struct rte_mbuf **elts;
+	uint16_t elts_head = txq->elts_head;
+	const uint16_t elts_n = 1 << txq->elts_n;
+	const uint16_t elts_m = elts_n - 1;
+	const unsigned int nb_dword_per_wqebb =
+		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
+	const unsigned int nb_dword_in_hdr =
+		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
+	unsigned int n = 0;
+	unsigned int pos;
+	uint16_t max_elts;
+	uint16_t max_wqe;
+	uint32_t comp_req = 0; /* Stays 0 unless a completion is requested. */
+	const uint16_t wq_n = 1 << txq->wqe_n;
+	const uint16_t wq_mask = wq_n - 1;
+	uint16_t wq_idx = txq->wqe_ci & wq_mask;
+	volatile struct mlx5_wqe64 *wq =
+		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
+	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
+	const __m128i shuf_mask_ctrl =
+		_mm_set_epi8(15, 14, 13, 12, /* left in host order */
+			     8, 9, 10, 11, /* bswap32 */
+			     4, 5, 6, 7, /* bswap32 */
+			     0, 1, 2, 3 /* bswap32 */);
+	__m128i *t_wqe, *dseg;
+	__m128i ctrl;
+
+	/* Make sure all packets can fit into a single WQE. */
+	assert(elts_n > pkts_n);
+	mlx5_tx_complete(txq);
+	max_elts = (elts_n - (elts_head - txq->elts_tail));
+	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+	/* Clamp the burst to the free room in both rings. */
+	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
+	if (unlikely(!pkts_n))
+		return 0;
+	elts = &(*txq->elts)[elts_head & elts_m];
+	/*
+	 * Loop for available tailroom first. Each 128-bit copy moves two
+	 * mbuf pointers; (n & -2) rounds n down to an even count and an odd
+	 * leftover is copied as a single pointer.
+	 */
+	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
+	for (pos = 0; pos < (n & -2); pos += 2)
+		_mm_storeu_si128((__m128i *)&elts[pos],
+				 _mm_loadu_si128((__m128i *)&pkts[pos]));
+	if (n & 1)
+		elts[pos] = pkts[pos];
+	/* Check if it crosses the end of the queue. */
+	if (unlikely(n < pkts_n)) {
+		elts = &(*txq->elts)[0];
+		for (pos = 0; pos < pkts_n - n; ++pos)
+			elts[pos] = pkts[n + pos];
+	}
+	txq->elts_head += pkts_n;
+	/* Save title WQEBB pointer. */
+	t_wqe = (__m128i *)wqe;
+	dseg = (__m128i *)(wqe + 1);
+	/* Calculate the number of entries to the end. */
+	n = RTE_MIN(
+		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
+		pkts_n);
+	/* Fill DSEGs. */
+	txq_wr_dseg_v(txq, dseg, pkts, n);
+	/* Check if it crosses the end of the queue. */
+	if (n < pkts_n) {
+		dseg = (__m128i *)txq->wqes;
+		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
+	}
+	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
+		txq->elts_comp += pkts_n;
+	} else {
+		/* Request a completion. */
+		txq->elts_comp = 0;
+		++txq->cq_pi;
+		comp_req = 8;
+	}
+	/*
+	 * Fill CTRL in the header. txq->wqe_ci is still the index of the
+	 * title WQEBB here; it is advanced only after the header is stored.
+	 * NOTE(review): the "+ 2" in the DS count presumably accounts for
+	 * the CTRL and ESEG dwords -- confirm.
+	 */
+	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
+			     txq->qp_num_8s | (pkts_n + 2),
+			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
+			     txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
+	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
+	_mm_store_si128(t_wqe, ctrl);
+	/* Fill ESEG in the header. Only the cs_flags byte is non-zero. */
+	_mm_store_si128(t_wqe + 1,
+			_mm_set_epi8(0, 0, 0, 0,
+				     0, 0, 0, 0,
+				     0, 0, 0, cs_flags,
+				     0, 0, 0, 0));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	txq->stats.opackets += pkts_n;
+#endif
+	/* Advance by the number of WQEBBs used, rounded up. */
+	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
+		       nb_dword_per_wqebb;
+	/* Ring QP doorbell. */
+	mlx5_tx_dbrec(txq, wqe);
+	return pkts_n;
+}
+
+/**
+ * DPDK callback for vectorized TX.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
+		      uint16_t pkts_n)
+{
+	struct txq *txq = dpdk_txq;
+	uint16_t sent = 0;
+
+	/* Feed the queue in chunks of at most MLX5_VPMD_TX_MAX_BURST. */
+	while (sent < pkts_n) {
+		const uint16_t chunk = RTE_MIN((uint16_t)(pkts_n - sent),
+					       MLX5_VPMD_TX_MAX_BURST);
+		const uint16_t done = txq_burst_v(txq, pkts + sent, chunk, 0);
+
+		/* Nothing accepted: give up for this call. */
+		if (done == 0)
+			break;
+		sent += done;
+	}
+	return sent;
+}
+
+/**
+ * DPDK callback for vectorized TX with multi-seg packets and offload.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = dpdk_txq;
+	uint16_t sent = 0;
+
+	while (sent < pkts_n) {
+		uint8_t csum = 0;
+		uint16_t chunk;
+		uint16_t done;
+
+		/* A multi-seg packet at the head goes through scatter. */
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
+		    NB_SEGS(pkts[sent]) > 1)
+			sent += txq_scatter_v(txq, pkts + sent, pkts_n - sent);
+		chunk = RTE_MIN((uint16_t)(pkts_n - sent),
+				MLX5_VPMD_TX_MAX_BURST);
+		/* Trim the chunk to a run of single segment packets. */
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
+			chunk = txq_check_multiseg(pkts + sent, chunk);
+		/* Trim it further to packets sharing offload flags. */
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
+			chunk = txq_calc_offload(txq, pkts + sent, chunk,
+						 &csum);
+		done = txq_burst_v(txq, pkts + sent, chunk, csum);
+		sent += done;
+		if (done == 0)
+			break;
+	}
+	return sent;
+}
+
+/**
+ * Copy mbuf pointers from the RX SW ring, starting at rq_pi, to the array
+ * of packets.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be filled.
+ * @param n
+ *   Number of mbuf pointers to be copied.
+ */ +static inline void +rxq_copy_mbuf_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + __m128i mbp; + + mbp = _mm_loadu_si128((__m128i *)&elts[pos]); + _mm_storeu_si128((__m128i *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Replenish buffers for RX in bulk. + * + * @param rxq + * Pointer to RX queue structure. + * @param n + * Number of buffers to be replenished. + */ +static inline void +rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n) +{ + const uint16_t q_n = 1 << rxq->elts_n; + const uint16_t q_mask = q_n - 1; + const uint16_t elts_idx = rxq->rq_ci & q_mask; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx]; + unsigned int i; + + assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH); + assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi))); + assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + for (i = 0; i < n; ++i) + wq[i].addr = htonll(rte_pktmbuf_mtod(elts[i], uintptr_t)); + rxq->rq_ci += n; + rte_wmb(); + *rxq->rq_db = htonl(rxq->rq_ci); +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. 
+ */
+static inline void
+rxq_cq_decompress_v(struct rxq *rxq,
+		    volatile struct mlx5_cqe *cq,
+		    struct rte_mbuf **elts)
+{
+	/* Mini CQEs are read from the CQE following the title one. */
+	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
+	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+	unsigned int pos;
+	unsigned int i;
+	unsigned int inv = 0; /* Number of CQEs invalidated so far. */
+	/*
+	 * Mask to shuffle from extracted mini CQE to mbuf. Each 128-bit load
+	 * holds two mini CQEs: shuf_mask1 picks bytes 0-7, shuf_mask2 picks
+	 * bytes 8-15. Lanes set to -1 come out as zero and are filled from
+	 * the title packet by the blend below.
+	 */
+	const __m128i shuf_mask1 =
+		_mm_set_epi8(0, 1, 2, 3, /* rss, bswap32 */
+			     -1, -1, /* skip vlan_tci */
+			     6, 7, /* data_len, bswap16 */
+			     -1, -1, 6, 7, /* pkt_len, bswap16 */
+			     -1, -1, -1, -1 /* skip packet_type */);
+	const __m128i shuf_mask2 =
+		_mm_set_epi8(8, 9, 10, 11, /* rss, bswap32 */
+			     -1, -1, /* skip vlan_tci */
+			     14, 15, /* data_len, bswap16 */
+			     -1, -1, 14, 15, /* pkt_len, bswap16 */
+			     -1, -1, -1, -1 /* skip packet_type */);
+	/*
+	 * Restore the compressed count. Must be 16 bits. The count is taken
+	 * from the title packet's data_len, with the CRC length added back
+	 * when crc_present is set.
+	 */
+	const uint16_t mcqe_n = t_pkt->data_len +
+				(rxq->crc_present * ETHER_CRC_LEN);
+	const __m128i rearm =
+		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
+	const __m128i rxdf =
+		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
+	/* CRC length sits in the pkt_len and data_len 16-bit lanes. */
+	const __m128i crc_adj =
+		_mm_set_epi16(0, 0, 0,
+			      rxq->crc_present * ETHER_CRC_LEN,
+			      0,
+			      rxq->crc_present * ETHER_CRC_LEN,
+			      0, 0);
+	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
+	uint32_t rcvd_byte = 0;
+	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+	const __m128i len_shuf_mask =
+		_mm_set_epi8(-1, -1, -1, -1,
+			     -1, -1, -1, -1,
+			     14, 15, 6, 7,
+			     10, 11, 2, 3);
+#endif
+
+	/* Compile time sanity check for this function. */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+	/*
+	 * A. load mCQEs into a 128bit register.
+	 * B. store rearm data to mbuf.
+	 * C. combine data from mCQEs with rx_descriptor_fields1.
+	 * D. store rx_descriptor_fields1.
+	 * E. store flow tag (rte_flow mark).
+	 *
+	 * Each iteration handles MLX5_VPMD_DESCS_PER_LOOP (4) mini CQEs and
+	 * always writes four mbufs, elts[pos] to elts[pos + 3], even when
+	 * fewer than four mini CQEs remain.
+	 */
+	for (pos = 0; pos < mcqe_n; ) {
+		__m128i mcqe1, mcqe2;
+		__m128i rxdf1, rxdf2;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		__m128i byte_cnt, invalid_mask;
+#endif
+
+		if (!(pos & 0x7) && pos + 8 < mcqe_n)
+			rte_prefetch0((void *)(cq + pos + 8));
+		/* A.1 load mCQEs into a 128bit register. */
+		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
+		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
+		/* B.1 store rearm data to mbuf (packets 0 and 1). */
+		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
+		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
+		/*
+		 * C.1 combine data from mCQEs with rx_descriptor_fields1.
+		 * Blend mask 0x23 takes 16-bit words 0, 1 and 5 from the
+		 * title packet, i.e. the lanes the shuffle masks skip
+		 * (packet_type and vlan_tci).
+		 */
+		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
+		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
+		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
+		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
+		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
+		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
+		/* D.1 store rx_descriptor_fields1. */
+		_mm_storeu_si128((__m128i *)
+				  &elts[pos]->rx_descriptor_fields1,
+				 rxdf1);
+		_mm_storeu_si128((__m128i *)
+				  &elts[pos + 1]->rx_descriptor_fields1,
+				 rxdf2);
+		/* B.2 store rearm data to mbuf (packets 2 and 3). */
+		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
+		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
+		/* C.2 combine data from mCQEs with rx_descriptor_fields1. */
+		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
+		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
+		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
+		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
+		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
+		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
+		/* D.2 store rx_descriptor_fields1. */
+		_mm_storeu_si128((__m128i *)
+				  &elts[pos + 2]->rx_descriptor_fields1,
+				 rxdf1);
+		_mm_storeu_si128((__m128i *)
+				  &elts[pos + 3]->rx_descriptor_fields1,
+				 rxdf2);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		invalid_mask = _mm_set_epi64x(0,
+					      (mcqe_n - pos) *
+					      sizeof(uint16_t) * 8);
+		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
+		mcqe1 = _mm_srli_si128(mcqe1, 4);
+		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
+		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
+		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+		if (rxq->mark) {
+			/* E.1 store flow tag (rte_flow mark). */
+			elts[pos]->hash.fdir.hi = flow_tag;
+			elts[pos + 1]->hash.fdir.hi = flow_tag;
+			elts[pos + 2]->hash.fdir.hi = flow_tag;
+			elts[pos + 3]->hash.fdir.hi = flow_tag;
+		}
+		pos += MLX5_VPMD_DESCS_PER_LOOP;
+		/*
+		 * Move to next CQE and invalidate consumed CQEs. Every 8
+		 * mini CQEs the source pointer is moved to cq + pos and 8
+		 * more CQEs are invalidated.
+		 */
+		if (!(pos & 0x7) && pos < mcqe_n) {
+			mcq = (void *)(cq + pos);
+			for (i = 0; i < 8; ++i)
+				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+		}
+	}
+	/* Invalidate the rest of CQEs. */
+	for (; inv < mcqe_n; ++inv)
+		cq[inv].op_own = MLX5_CQE_INVALIDATE;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += mcqe_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	rxq->cq_ci += mcqe_n;
+}
+
+/**
+ * Calculate packet type and offload flag for mbuf and store it.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param cqes[4]
+ *   Array of four 16bytes completions extracted from the original completion
+ *   descriptor.
+ * @param op_err
+ *   Opcode vector having responder error status. Each field is 4B.
+ * @param pkts + * Pointer to array of packets to be filled. + */ +static inline void +rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err, + struct rte_mbuf **pkts) +{ + __m128i pinfo0, pinfo1; + __m128i pinfo, ptype; + __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH); + __m128i cv_flags; + const __m128i zero = _mm_setzero_si128(); + const __m128i ptype_mask = + _mm_set_epi32(0xd06, 0xd06, 0xd06, 0xd06); + const __m128i ptype_ol_mask = + _mm_set_epi32(0x106, 0x106, 0x106, 0x106); + const __m128i pinfo_mask = + _mm_set_epi32(0x3, 0x3, 0x3, 0x3); + const __m128i cv_flag_sel = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | + PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), + 0, + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), + (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED), + 0); + const __m128i cv_mask = + _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED); + const __m128i mbuf_init = + _mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer); + __m128i rearm0, rearm1, rearm2, rearm3; + + /* Extract pkt_info field. */ + pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1); + /* Extract hdr_type_etc field. 
*/ + pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]); + ptype = _mm_unpacklo_epi64(pinfo0, pinfo1); + if (rxq->mark) { + const __m128i pinfo_ft_mask = + _mm_set_epi32(0xffffff00, 0xffffff00, + 0xffffff00, 0xffffff00); + const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR); + const __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID); + __m128i flow_tag, invalid_mask; + + flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask); + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = _mm_cmpeq_epi32(flow_tag, zero); + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128(invalid_mask, + fdir_flags)); + /* Mask out invalid entries. */ + flow_tag = _mm_andnot_si128(invalid_mask, flow_tag); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128( + _mm_cmpeq_epi32(flow_tag, + pinfo_ft_mask), + fdir_id_flags)); + } + /* + * Merge the two fields to generate the following: + * bit[1] = l3_ok, bit[2] = l4_ok + * bit[8] = cv, bit[11:10] = l3_hdr_type + * bit[12] = tunneled, bit[13] = outer_l3_type + */ + ptype = _mm_and_si128(ptype, ptype_mask); + pinfo = _mm_and_si128(pinfo, pinfo_mask); + pinfo = _mm_slli_epi32(pinfo, 12); + ptype = _mm_or_si128(ptype, pinfo); + ptype = _mm_srli_epi32(ptype, 10); + ptype = _mm_packs_epi32(ptype, zero); + /* Errored packets will have RTE_PTYPE_ALL_MASK. */ + op_err = _mm_srli_epi16(op_err, 12); + ptype = _mm_or_si128(ptype, op_err); + pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)]; + pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)]; + pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)]; + pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)]; + /* Fill flags for checksum and VLAN. */ + pinfo = _mm_and_si128(pinfo, ptype_ol_mask); + pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo); + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. 
*/ + cv_flags = _mm_slli_epi32(pinfo, 9); + cv_flags = _mm_or_si128(pinfo, cv_flags); + /* Move back flags to start from byte[0]. */ + cv_flags = _mm_srli_epi32(cv_flags, 8); + /* Mask out garbage bits. */ + cv_flags = _mm_and_si128(cv_flags, cv_mask); + /* Merge to ol_flags. */ + ol_flags = _mm_or_si128(ol_flags, cv_flags); + /* Merge mbuf_init and ol_flags. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != + offsetof(struct rte_mbuf, rearm_data) + 8); + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30); + /* Write 8B rearm_data and 8B ol_flags. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + _mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0); + _mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1); + _mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2); + _mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3); +} + +/** + * Skip error packets. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +static uint16_t +rxq_handle_pending_error(struct rxq *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n) +{ + uint16_t n = 0; + unsigned int i; + + for (i = 0; i < pkts_n; ++i) { + struct rte_mbuf *pkt = pkts[i]; + + if (pkt->packet_type == RTE_PTYPE_ALL_MASK) + rte_pktmbuf_free_seg(pkt); + else + pkts[n++] = pkt; + } + rxq->stats.idropped += (pkts_n - n); + rxq->pending_err = 0; + return n; +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. 
Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const __m128i owner_check = + _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL); + const __m128i opcode_check = + _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL); + const __m128i format_check = + _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL); + const __m128i resp_err_check = + _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 12, 13, 8, 9, + 4, 5, 0, 1); +#endif + /* Mask to shuffle from extracted CQE to mbuf. */ + const __m128i shuf_mask = + _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */ + 12, 13, 14, 15, /* rss, bswap32 */ + 10, 11, /* vlan_tci, bswap16 */ + 4, 5, /* data_len, bswap16 */ + -1, -1, /* zero out 2nd half of pkt_len */ + 4, 5 /* pkt_len, bswap16 */); + /* Mask to blend from the last Qword to the first DQword. 
*/ + const __m128i blend_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, -1); + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, 0, 0, + rxq->crc_present * ETHER_CRC_LEN, + 0, + rxq->crc_present * ETHER_CRC_LEN); + const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0); + + /* Compile time sanity check for this function. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, pkt_info) != 0); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rx_hash_res) != + offsetof(struct mlx5_cqe, pkt_info) + 12); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd1) + + sizeof(((struct mlx5_cqe *)0)->rsvd1) != + offsetof(struct mlx5_cqe, hdr_type_etc)); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, vlan_info) != + offsetof(struct mlx5_cqe, hdr_type_etc) + 2); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd2) + + sizeof(((struct mlx5_cqe *)0)->rsvd2) != + offsetof(struct mlx5_cqe, byte_cnt)); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, sop_drop_qpn) != + RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8)); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, op_own) != + offsetof(struct mlx5_cqe, sop_drop_qpn) + 7); + assert(rxq->sges_n == 0); + assert(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + /* + * Order of indexes: + * rq_ci >= cq_ci >= rq_pi + * Definition of indexes: + * rq_ci - cq_ci := # of buffers owned by HW (posted). + * cq_ci - rq_pi := # of buffers not returned to app (decompressed). + * N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished). 
+ */ + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH) + rxq_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->cq_ci - rxq->rq_pi; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remained packets. */ + assert(rxq->rq_pi == rxq->cq_ci); + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remained CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP]; + __m128i cqe_tmp1, cqe_tmp2; + __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + __m128i op_own, op_own_tmp1, op_own_tmp2; + __m128i opcode, owner_mask, invalid_mask; + __m128i comp_mask; + __m128i mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt; +#endif + __m128i mbp1, mbp2; + __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. 
*/ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + /* A.0 do not cross the end of CQ. */ + mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + p = _mm_andnot_si128(mask, p); + /* A.1 load cqes. */ + p3 = _mm_extract_epi16(p, 3); + cqes[3] = _mm_loadl_epi64((__m128i *) + &cq[pos + p3].sop_drop_qpn); + rte_compiler_barrier(); + p2 = _mm_extract_epi16(p, 2); + cqes[2] = _mm_loadl_epi64((__m128i *) + &cq[pos + p2].sop_drop_qpn); + rte_compiler_barrier(); + /* B.1 load mbuf pointers. */ + mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]); + mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]); + /* A.1 load a block having op_own. */ + p1 = _mm_extract_epi16(p, 1); + cqes[1] = _mm_loadl_epi64((__m128i *) + &cq[pos + p1].sop_drop_qpn); + rte_compiler_barrier(); + cqes[0] = _mm_loadl_epi64((__m128i *) + &cq[pos].sop_drop_qpn); + /* B.2 copy mbuf pointers. */ + _mm_storeu_si128((__m128i *)&pkts[pos], mbp1); + _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2); + rte_compiler_barrier(); + /* C.1 load remained CQE data and extract necessary fields. 
*/ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]); + cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].rsvd1[3]); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].rsvd1[3]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd2[10]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd2[10]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask); + pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj); + pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj); + pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3); + _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2); + /* E.1 extract op_own field. */ + op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + /* C.1 load remained CQE data and extract necessary fields. 
*/ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]); + cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].rsvd1[3]); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].rsvd1[3]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd2[10]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd2[10]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask); + pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj); + pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj); + pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj); + /* E.1 extract op_own byte. */ + op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1); + _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0); + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = _mm_and_si128(op_own, owner_check); + if (ownership) + owner_mask = _mm_xor_si128(owner_mask, owner_check); + owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check); + owner_mask = _mm_packs_epi32(owner_mask, zero); + /* E.3 get mask for invalidated CQEs. */ + opcode = _mm_and_si128(op_own, opcode_check); + invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode); + invalid_mask = _mm_packs_epi32(invalid_mask, zero); + /* E.4 mask out beyond boundary. 
*/ + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.5 merge invalid_mask with invalid owner. */ + invalid_mask = _mm_or_si128(invalid_mask, owner_mask); + /* F.1 find compressed CQE format. */ + comp_mask = _mm_and_si128(op_own, format_check); + comp_mask = _mm_cmpeq_epi32(comp_mask, format_check); + comp_mask = _mm_packs_epi32(comp_mask, zero); + /* F.2 mask out invalid entries. */ + comp_mask = _mm_andnot_si128(invalid_mask, comp_mask); + comp_idx = _mm_cvtsi128_si64(comp_mask); + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? + __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + /* E.6 mask out entries after the compressed CQE. */ + mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.7 count non-compressed valid CQEs. */ + n = _mm_cvtsi128_si64(invalid_mask); + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* D.3 check error in opcode. */ + opcode = _mm_cmpeq_epi32(resp_err_check, opcode); + opcode = _mm_packs_epi32(opcode, zero); + opcode = _mm_andnot_si128(invalid_mask, opcode); + /* D.4 mark if any error is set */ + rxq->pending_err |= !!_mm_cvtsi128_si64(opcode); + /* D.5 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. 
+ */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) + return rcvd_pkt; + /* Update the consumer indexes for non-compressed CQEs. */ + assert(nocmp_n <= pkts_n); + rxq->cq_ci += nocmp_n; + rxq->rq_pi += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]); + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->cq_ci - rxq->rq_pi; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); + rxq->rq_pi += n; + rcvd_pkt += n; + } + } + rte_wmb(); + *rxq->cq_db = htonl(rxq->cq_ci); + return rcvd_pkt; +} + +/** + * DPDK callback for vectorized RX. + * + * Receives a burst through rxq_burst_v() and, when rxq->pending_err is set + * afterwards, passes the burst through rxq_handle_pending_error(), whose + * return value then replaces the packet count. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct rxq *rxq = dpdk_rxq; + uint16_t nb_rx; + + nb_rx = rxq_burst_v(rxq, pkts, pkts_n); + if (unlikely(rxq->pending_err)) + nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx); + return nb_rx; +} + +/** + * Check Tx queue flags are set for raw vectorized Tx. + * + * Walks the first priv->txqs_n entries of priv->txqs and requires each queue + * to have both ETH_TXQ_FLAGS_NOMULTSEGS and ETH_TXQ_FLAGS_NOOFFLOADS set in + * its flags. + * + * NOTE(review): every entry is dereferenced without a NULL check; confirm + * that no slot below txqs_n can be unset when this is called. + * + * @param priv + * Pointer to private structure. + * + * @return + * 1 if supported, -ENOTSUP if not. + */ +int __attribute__((cold)) +priv_check_raw_vec_tx_support(struct priv *priv) +{ + uint16_t i; + + /* Stop at the first queue that lacks either flag. 
*/ + for (i = 0; i < priv->txqs_n; ++i) { + struct txq *txq = (*priv->txqs)[i]; + + if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) || + !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS)) + break; + } + if (i != priv->txqs_n) + return -ENOTSUP; + return 1; +} + +/** + * Check a device can support vectorized TX. + * + * Vectorized Tx is refused when priv->txqs_n is greater than + * MLX5_VPMD_MIN_TXQS, when priv->mps is not MLX5_MPW_ENHANCED, or when + * priv->tso is set. Despite its name, MLX5_VPMD_MIN_TXQS acts as an upper + * bound on the queue count in this check. + * + * @param priv + * Pointer to private structure. + * + * @return + * 1 if supported, -ENOTSUP if not. + */ +int __attribute__((cold)) +priv_check_vec_tx_support(struct priv *priv) +{ + if (priv->txqs_n > MLX5_VPMD_MIN_TXQS || + priv->mps != MLX5_MPW_ENHANCED || + priv->tso) + return -ENOTSUP; + return 1; +} + +/** + * Check a RX queue can support vectorized RX. + * + * The only condition tested is that rxq->sges_n is zero. + * + * @param rxq + * Pointer to RX queue. + * + * @return + * 1 if supported, -ENOTSUP if not. + */ +int __attribute__((cold)) +rxq_check_vec_support(struct rxq *rxq) +{ + if (rxq->sges_n != 0) + return -ENOTSUP; + return 1; +} + +/** + * Check a device can support vectorized RX. + * + * Runs rxq_check_vec_support() on the first priv->rxqs_n entries of + * priv->rxqs; a single failing queue makes the whole check fail. + * + * NOTE(review): each entry is handed to rxq_check_vec_support(), which + * dereferences it, without a NULL check; confirm that no slot below rxqs_n + * can be unset when this is called. + * + * @param priv + * Pointer to private structure. + * + * @return + * 1 if supported, -ENOTSUP if not. + */ +int __attribute__((cold)) +priv_check_vec_rx_support(struct priv *priv) +{ + uint16_t i; + + /* Stop at the first queue that fails rxq_check_vec_support(). */ + for (i = 0; i < priv->rxqs_n; ++i) { + struct rxq *rxq = (*priv->rxqs)[i]; + + if (rxq_check_vec_support(rxq) < 0) + break; + } + if (i != priv->rxqs_n) + return -ENOTSUP; + return 1; +} + +/** + * Prepare for vectorized RX. + * + * For each of the first priv->rxqs_n Rx queues: fills data_off, the reference + * count, nb_segs and port of rxq->fake_mbuf, caches the 64-bit value read + * from its rearm_data in rxq->mbuf_initializer, stores a pointer to the fake + * mbuf in MLX5_VPMD_DESCS_PER_LOOP elts slots starting at index + * (1 << rxq->elts_n), and sets rxq->trim_elts. + * + * NOTE(review): the padding writes past index (1 << elts_n) - 1, so the elts + * array presumably has MLX5_VPMD_DESCS_PER_LOOP spare slots - confirm at the + * allocation site. desc is a uint16_t, which assumes elts_n < 16 - confirm. + * Queue entries are dereferenced without a NULL check. + * + * @param priv + * Pointer to private structure. + */ +void +priv_prep_vec_rx_function(struct priv *priv) +{ + uint16_t i; + + for (i = 0; i < priv->rxqs_n; ++i) { + struct rxq *rxq = (*priv->rxqs)[i]; + struct rte_mbuf *mbuf_init = &rxq->fake_mbuf; + const uint16_t desc = 1 << rxq->elts_n; + int j; + + assert(rxq->elts_n == rxq->cqe_n); + /* Initialize default rearm_data for vPMD. 
*/ + mbuf_init->data_off = RTE_PKTMBUF_HEADROOM; + rte_mbuf_refcnt_set(mbuf_init, 1); + mbuf_init->nb_segs = 1; + mbuf_init->port = rxq->port_id; + /* + * Prevent compiler reordering: rearm_data covers the fields + * stored above and is read as one 64-bit word right below. + */ + rte_compiler_barrier(); + rxq->mbuf_initializer = + *(uint64_t *)&mbuf_init->rearm_data; + /* Pad elts[desc] onwards, MLX5_VPMD_DESCS_PER_LOOP slots, with the fake mbuf for vectorized Rx. */ + for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j) + (*rxq->elts)[desc + j] = &rxq->fake_mbuf; + /* Mark that it needs to be cleaned up for rxq_alloc_elts(). */ + rxq->trim_elts = 1; + } +} diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index 03e23f58d0..3ad6ac0c6e 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -243,7 +243,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set"); goto error; } - (void)conf; /* Thresholds configuration (ignored). */ + tmpl.txq.flags = conf->txq_flags; assert(desc > MLX5_TX_COMP_THRESH); tmpl.txq.elts_n = log2above(desc); if (priv->mps == MLX5_MPW_ENHANCED)