net/iavf: add offload path for Tx AVX512

Add a specific Tx path for AVX512. This path supports the HW offload
features checksum insertion and VLAN insertion, and it is chosen
automatically according to the queue offload configuration.

The common helpers are marked 'inline', so the compiler generates the
duplicated offload and non-offload code from a single implementation.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
Acked-by: Qi Zhang <qi.z.zhang@intel.com>
Wenzhuo Lu 2021-04-14 15:34:07 +08:00 committed by Qi Zhang
parent d309785f00
commit 059f18ae2a
4 changed files with 210 additions and 69 deletions
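
The duplication mentioned in the commit message happens at compile time:
the shared helpers (iavf_vtx1(), iavf_vtx(), the common burst routine) take
a constant 'bool offload' argument and are marked __rte_always_inline, so
each exported entry point is compiled into its own specialized copy with
the offload branches either kept or folded away. Below is a minimal
standalone sketch of that pattern; the demo_* names, shift value and
command bit are placeholders, not the driver's real descriptor layout.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder descriptor fields; the real driver fills the Tx descriptor
 * quad-word from the mbuf's ol_flags, l2_len/l3_len, vlan_tci, etc.
 */
#define DEMO_BUF_SZ_SHIFT	34		/* pretend buffer-size field position */
#define DEMO_CSUM_CMD_BIT	(1ULL << 4)	/* pretend "insert checksum" command */

static __attribute__((always_inline)) inline uint64_t
demo_build_desc(uint64_t data_len, bool offload)
{
	uint64_t qw = data_len << DEMO_BUF_SZ_SHIFT;

	if (offload)	/* resolved at compile time in each wrapper below */
		qw |= DEMO_CSUM_CMD_BIT;
	return qw;
}

/* Two thin wrappers: the compiler inlines demo_build_desc() into each one,
 * producing an offload copy and a lean non-offload copy of the same code,
 * just as iavf_xmit_pkts_vec_avx512() and iavf_xmit_pkts_vec_avx512_offload()
 * wrap iavf_xmit_pkts_vec_avx512_cmn() in this patch.
 */
uint64_t demo_build_desc_plain(uint64_t len)   { return demo_build_desc(len, false); }
uint64_t demo_build_desc_offload(uint64_t len) { return demo_build_desc(len, true); }

int main(void)
{
	printf("plain=0x%" PRIx64 ", offload=0x%" PRIx64 "\n",
	       demo_build_desc_plain(64), demo_build_desc_offload(64));
	return 0;
}

Compiled with optimization, the non-offload wrapper contains no trace of the
offload branch, which is why sharing one implementation costs the existing
fast path nothing.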


@@ -160,7 +160,7 @@ check_rx_vec_allow(struct iavf_rx_queue *rxq)
static inline bool
check_tx_vec_allow(struct iavf_tx_queue *txq)
{
if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
#ifdef RTE_ARCH_X86
struct iavf_tx_queue *txq;
int i;
int check_ret;
bool use_sse = false;
bool use_avx2 = false;
#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
#endif
if (!iavf_tx_vec_dev_check(dev) &&
check_ret = iavf_tx_vec_dev_check(dev);
if (check_ret >= 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
/* SSE and AVX2 not support offload path yet. */
if (check_ret == IAVF_VECTOR_PATH) {
use_sse = true;
if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
use_avx2 = true;
}
#ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
use_avx512 = true;
#endif
if (!use_sse && !use_avx2 && !use_avx512)
goto normal;
if (!use_avx512) {
PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
use_avx2 ? "avx2 " : "",
dev->data->port_id);
dev->tx_pkt_burst = use_avx2 ?
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
}
#ifdef CC_AVX512_SUPPORT
if (use_avx512)
if (use_avx512) {
if (check_ret == IAVF_VECTOR_PATH) {
dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).",
dev->data->port_id);
} else {
dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).",
dev->data->port_id);
}
}
#endif
dev->tx_pkt_prepare = NULL;
@@ -2544,8 +2564,9 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
return;
}
#endif
normal:
#endif
PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
dev->data->port_id);
dev->tx_pkt_burst = iavf_xmit_pkts;


@@ -23,14 +23,21 @@
#define IAVF_VPMD_DESCS_PER_LOOP 4
#define IAVF_VPMD_TX_MAX_FREE_BUF 64
#define IAVF_NO_VECTOR_FLAGS ( \
#define IAVF_TX_NO_VECTOR_FLAGS ( \
DEV_TX_OFFLOAD_MULTI_SEGS | \
DEV_TX_OFFLOAD_TCP_TSO)
#define IAVF_TX_VECTOR_OFFLOAD ( \
DEV_TX_OFFLOAD_VLAN_INSERT | \
DEV_TX_OFFLOAD_QINQ_INSERT | \
DEV_TX_OFFLOAD_IPV4_CKSUM | \
DEV_TX_OFFLOAD_SCTP_CKSUM | \
DEV_TX_OFFLOAD_UDP_CKSUM | \
DEV_TX_OFFLOAD_TCP_TSO | \
DEV_TX_OFFLOAD_TCP_CKSUM)
#define IAVF_VECTOR_PATH 0
#define IAVF_VECTOR_OFFLOAD_PATH 1
#define DEFAULT_TX_RS_THRESH 32
#define DEFAULT_TX_FREE_THRESH 32
@@ -488,6 +495,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
uint16_t nb_pkts);
uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);


@@ -1518,14 +1518,16 @@ tx_backlog_entry_avx512(struct iavf_tx_vec_entry *txep,
txep[i].mbuf = tx_pkts[i];
}
static inline void
static __rte_always_inline void
iavf_vtx1(volatile struct iavf_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
struct rte_mbuf *pkt, uint64_t flags, bool offload)
{
uint64_t high_qw =
(IAVF_TX_DESC_DTYPE_DATA |
((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) |
((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
if (offload)
iavf_txd_enable_offload(pkt, &high_qw);
__m128i descriptor = _mm_set_epi64x(high_qw,
pkt->buf_iova + pkt->data_off);
@@ -1534,62 +1536,70 @@ iavf_vtx1(volatile struct iavf_tx_desc *txdp,
#define IAVF_TX_LEN_MASK 0xAA
#define IAVF_TX_OFF_MASK 0x55
static inline void
static __rte_always_inline void
iavf_vtx(volatile struct iavf_tx_desc *txdp,
struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags,
bool offload)
{
const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT));
/* if unaligned on 32-bit boundary, do one to align */
if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
iavf_vtx1(txdp, *pkt, flags);
iavf_vtx1(txdp, *pkt, flags, offload);
nb_pkts--, txdp++, pkt++;
}
/* do 4 at a time while possible, in bursts */
for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
__m512i desc4 =
_mm512_set_epi64
((uint64_t)pkt[3]->data_len,
pkt[3]->buf_iova,
(uint64_t)pkt[2]->data_len,
pkt[2]->buf_iova,
(uint64_t)pkt[1]->data_len,
pkt[1]->buf_iova,
(uint64_t)pkt[0]->data_len,
pkt[0]->buf_iova);
__m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl);
__m512i data_off_4 =
_mm512_set_epi64
(0,
pkt[3]->data_off,
0,
pkt[2]->data_off,
0,
pkt[1]->data_off,
0,
pkt[0]->data_off);
desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
uint64_t hi_qw3 =
hi_qw_tmpl |
((uint64_t)pkt[3]->data_len <<
IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
hi_qw_tmpl_4);
desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4,
data_off_4);
_mm512_storeu_si512((void *)txdp, desc4);
if (offload)
iavf_txd_enable_offload(pkt[3], &hi_qw3);
uint64_t hi_qw2 =
hi_qw_tmpl |
((uint64_t)pkt[2]->data_len <<
IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
if (offload)
iavf_txd_enable_offload(pkt[2], &hi_qw2);
uint64_t hi_qw1 =
hi_qw_tmpl |
((uint64_t)pkt[1]->data_len <<
IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
if (offload)
iavf_txd_enable_offload(pkt[1], &hi_qw1);
uint64_t hi_qw0 =
hi_qw_tmpl |
((uint64_t)pkt[0]->data_len <<
IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
if (offload)
iavf_txd_enable_offload(pkt[0], &hi_qw0);
__m512i desc0_3 =
_mm512_set_epi64
(hi_qw3,
pkt[3]->buf_iova + pkt[3]->data_off,
hi_qw2,
pkt[2]->buf_iova + pkt[2]->data_off,
hi_qw1,
pkt[1]->buf_iova + pkt[1]->data_off,
hi_qw0,
pkt[0]->buf_iova + pkt[0]->data_off);
_mm512_storeu_si512((void *)txdp, desc0_3);
}
/* do any last ones */
while (nb_pkts) {
iavf_vtx1(txdp, *pkt, flags);
iavf_vtx1(txdp, *pkt, flags, offload);
txdp++, pkt++, nb_pkts--;
}
}
static inline uint16_t
static __rte_always_inline uint16_t
iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
uint16_t nb_pkts, bool offload)
{
struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
volatile struct iavf_tx_desc *txdp;
@@ -1620,11 +1630,11 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
if (nb_commit >= n) {
tx_backlog_entry_avx512(txep, tx_pkts, n);
iavf_vtx(txdp, tx_pkts, n - 1, flags);
iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
tx_pkts += (n - 1);
txdp += (n - 1);
iavf_vtx1(txdp, *tx_pkts++, rs);
iavf_vtx1(txdp, *tx_pkts++, rs, offload);
nb_commit = (uint16_t)(nb_commit - n);
@@ -1639,7 +1649,7 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
iavf_vtx(txdp, tx_pkts, nb_commit, flags);
iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
tx_id = (uint16_t)(tx_id + nb_commit);
if (tx_id > txq->next_rs) {
@@ -1657,9 +1667,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_pkts;
}
uint16_t
iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
static __rte_always_inline uint16_t
iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts, bool offload)
{
uint16_t nb_tx = 0;
struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
@@ -1669,7 +1679,7 @@ iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
num);
num, offload);
nb_tx += ret;
nb_pkts -= ret;
if (ret < num)
@@ -1679,6 +1689,13 @@ iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
uint16_t
iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
{
return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
}
static inline void
iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
{
@@ -1709,3 +1726,10 @@ iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq)
txq->ops = &avx512_vec_txq_ops;
return 0;
}
uint16_t
iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
{
return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
}


@@ -240,14 +240,17 @@ iavf_tx_vec_queue_default(struct iavf_tx_queue *txq)
if (!txq)
return -1;
if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
return -1;
if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
return -1;
return 0;
if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
return -1;
if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
return IAVF_VECTOR_OFFLOAD_PATH;
return IAVF_VECTOR_PATH;
}
static inline int
@@ -270,14 +273,97 @@ iavf_tx_vec_dev_check_default(struct rte_eth_dev *dev)
{
int i;
struct iavf_tx_queue *txq;
int ret;
int result = 0;
for (i = 0; i < dev->data->nb_tx_queues; i++) {
txq = dev->data->tx_queues[i];
if (iavf_tx_vec_queue_default(txq))
ret = iavf_tx_vec_queue_default(txq);
if (ret < 0)
return -1;
if (ret > result)
result = ret;
}
return 0;
return result;
}
/******************************************************************************
* If user knows a specific offload is not enabled by APP,
* the macro can be commented to save the effort of fast path.
* Currently below 2 features are supported in TX path,
* 1, checksum offload
* 2, VLAN/QINQ insertion
******************************************************************************/
#define IAVF_TX_CSUM_OFFLOAD
#define IAVF_TX_VLAN_QINQ_OFFLOAD
static __rte_always_inline void
iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
uint64_t *txd_hi)
{
#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
uint64_t ol_flags = tx_pkt->ol_flags;
#endif
uint32_t td_cmd = 0;
#ifdef IAVF_TX_CSUM_OFFLOAD
uint32_t td_offset = 0;
#endif
#ifdef IAVF_TX_CSUM_OFFLOAD
/* Set MACLEN */
td_offset |= (tx_pkt->l2_len >> 1) <<
IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
/* Enable L3 checksum offloads */
if (ol_flags & PKT_TX_IP_CKSUM) {
td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
td_offset |= (tx_pkt->l3_len >> 2) <<
IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
} else if (ol_flags & PKT_TX_IPV4) {
td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
td_offset |= (tx_pkt->l3_len >> 2) <<
IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
} else if (ol_flags & PKT_TX_IPV6) {
td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
td_offset |= (tx_pkt->l3_len >> 2) <<
IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
}
/* Enable L4 checksum offloads */
switch (ol_flags & PKT_TX_L4_MASK) {
case PKT_TX_TCP_CKSUM:
td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
break;
case PKT_TX_SCTP_CKSUM:
td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
break;
case PKT_TX_UDP_CKSUM:
td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
break;
default:
break;
}
*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
#endif
#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
IAVF_TXD_QW1_L2TAG1_SHIFT);
}
#endif
*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
}
#ifdef CC_AVX2_SUPPORT