a3d4f4d291
This patch adds NEON vector instructions to optimize Rx burst process. Signed-off-by: Chengwen Feng <fengchengwen@huawei.com> Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com> Signed-off-by: Huisong Li <lihuisong@huawei.com>
289 lines
8.6 KiB
C
289 lines
8.6 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright(c) 2020 Hisilicon Limited.
|
|
*/
|
|
|
|
#ifndef _HNS3_RXTX_VEC_NEON_H_
|
|
#define _HNS3_RXTX_VEC_NEON_H_
|
|
|
|
#include <arm_neon.h>
|
|
|
|
#pragma GCC diagnostic ignored "-Wcast-qual"
|
|
|
|
static inline void
|
|
hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
|
|
{
|
|
uint64x2_t val1 = {
|
|
pkt->buf_iova + pkt->data_off,
|
|
((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
|
|
};
|
|
uint64x2_t val2 = {
|
|
0,
|
|
((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
|
|
};
|
|
vst1q_u64((uint64_t *)&desc->addr, val1);
|
|
vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
|
|
}
|
|
|
|
static uint16_t
|
|
hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
|
|
struct rte_mbuf **__restrict tx_pkts,
|
|
uint16_t nb_pkts)
|
|
{
|
|
struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
|
|
volatile struct hns3_desc *tx_desc;
|
|
struct hns3_entry *tx_entry;
|
|
uint16_t next_to_use;
|
|
uint16_t nb_commit;
|
|
uint16_t nb_tx;
|
|
uint16_t n, i;
|
|
|
|
if (txq->tx_bd_ready < txq->tx_free_thresh)
|
|
hns3_tx_free_buffers(txq);
|
|
|
|
nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
|
|
if (unlikely(nb_commit == 0)) {
|
|
txq->queue_full_cnt++;
|
|
return 0;
|
|
}
|
|
nb_tx = nb_commit;
|
|
|
|
next_to_use = txq->next_to_use;
|
|
tx_desc = &txq->tx_ring[next_to_use];
|
|
tx_entry = &txq->sw_ring[next_to_use];
|
|
|
|
/*
|
|
* We need to deal with n descriptors first for better performance,
|
|
* if nb_commit is greater than the difference between txq->nb_tx_desc
|
|
* and next_to_use in sw_ring and tx_ring.
|
|
*/
|
|
n = txq->nb_tx_desc - next_to_use;
|
|
if (nb_commit >= n) {
|
|
for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
|
|
hns3_vec_tx(tx_desc, *tx_pkts);
|
|
tx_entry[i].mbuf = *tx_pkts;
|
|
}
|
|
|
|
nb_commit -= n;
|
|
next_to_use = 0;
|
|
tx_desc = &txq->tx_ring[next_to_use];
|
|
tx_entry = &txq->sw_ring[next_to_use];
|
|
}
|
|
|
|
for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
|
|
hns3_vec_tx(tx_desc, *tx_pkts);
|
|
tx_entry[i].mbuf = *tx_pkts;
|
|
}
|
|
|
|
next_to_use += nb_commit;
|
|
txq->next_to_use = next_to_use;
|
|
txq->tx_bd_ready -= nb_tx;
|
|
|
|
hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
|
|
|
|
return nb_tx;
|
|
}
|
|
|
|
static inline uint32_t
|
|
hns3_desc_parse_field(struct hns3_rx_queue *rxq,
|
|
struct hns3_entry *sw_ring,
|
|
struct hns3_desc *rxdp,
|
|
uint32_t bd_vld_num)
|
|
{
|
|
uint32_t l234_info, ol_info, bd_base_info;
|
|
struct rte_mbuf *pkt;
|
|
uint32_t retcode = 0;
|
|
uint32_t cksum_err;
|
|
int ret, i;
|
|
|
|
for (i = 0; i < (int)bd_vld_num; i++) {
|
|
pkt = sw_ring[i].mbuf;
|
|
|
|
/* init rte_mbuf.rearm_data last 64-bit */
|
|
pkt->ol_flags = PKT_RX_RSS_HASH;
|
|
|
|
l234_info = rxdp[i].rx.l234_info;
|
|
ol_info = rxdp[i].rx.ol_info;
|
|
bd_base_info = rxdp[i].rx.bd_base_info;
|
|
ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
|
|
l234_info, &cksum_err);
|
|
if (unlikely(ret)) {
|
|
retcode |= 1u << i;
|
|
continue;
|
|
}
|
|
|
|
pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
|
|
if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
|
|
hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
|
|
cksum_err);
|
|
}
|
|
|
|
return retcode;
|
|
}
|
|
|
|
static inline uint16_t
|
|
hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
|
|
struct rte_mbuf **__restrict rx_pkts,
|
|
uint16_t nb_pkts,
|
|
uint64_t *bd_err_mask)
|
|
{
|
|
uint16_t rx_id = rxq->next_to_use;
|
|
struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
|
|
struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
|
|
uint32_t bd_valid_num, parse_retcode;
|
|
uint16_t nb_rx = 0;
|
|
int pos, offset;
|
|
|
|
/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
|
|
uint8x16_t shuf_desc_fields_msk = {
|
|
0xff, 0xff, 0xff, 0xff, /* packet type init zero */
|
|
22, 23, 0xff, 0xff, /* rx.pkt_len to rte_mbuf.pkt_len */
|
|
20, 21, /* size to rte_mbuf.data_len */
|
|
0xff, 0xff, /* rte_mbuf.vlan_tci init zero */
|
|
8, 9, 10, 11, /* rx.rss_hash to rte_mbuf.hash.rss */
|
|
};
|
|
|
|
uint16x8_t crc_adjust = {
|
|
0, 0, /* ignore pkt_type field */
|
|
rxq->crc_len, /* sub crc on pkt_len */
|
|
0, /* ignore high-16bits of pkt_len */
|
|
rxq->crc_len, /* sub crc on data_len */
|
|
0, 0, 0, /* ignore non-length fields */
|
|
};
|
|
|
|
for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
|
|
rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
|
|
uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
|
|
uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
|
|
uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
|
|
uint64x2_t mbp1, mbp2;
|
|
uint16x4_t bd_vld = {0};
|
|
uint16x8_t tmp;
|
|
uint64_t stat;
|
|
|
|
/* calc how many bd valid */
|
|
bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
|
|
bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
|
|
bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
|
|
bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
|
|
|
|
/* load 2 mbuf pointer */
|
|
mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
|
|
|
|
bd_vld = vshl_n_u16(bd_vld,
|
|
HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
|
|
bd_vld = vreinterpret_u16_s16(
|
|
vshr_n_s16(vreinterpret_s16_u16(bd_vld),
|
|
HNS3_UINT16_BIT - 1));
|
|
stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
|
|
|
|
/* load 2 mbuf pointer again */
|
|
mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
|
|
|
|
if (likely(stat == 0))
|
|
bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
|
|
else
|
|
bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
|
|
if (bd_valid_num == 0)
|
|
break;
|
|
|
|
/* use offset to control below data load oper ordering */
|
|
offset = rxq->offset_table[bd_valid_num];
|
|
|
|
/* store 2 mbuf pointer into rx_pkts */
|
|
vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
|
|
|
|
/* read first two descs */
|
|
descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
|
|
descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
|
|
|
|
/* store 2 mbuf pointer into rx_pkts again */
|
|
vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
|
|
|
|
/* read remains two descs */
|
|
descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
|
|
descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
|
|
|
|
pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
|
|
pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
|
|
pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
|
|
pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
|
|
|
|
/* pkt 1,2 convert format from desc to pktmbuf */
|
|
pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
|
|
pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);
|
|
|
|
/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
|
|
*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
|
|
rxq->mbuf_initializer;
|
|
*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
|
|
rxq->mbuf_initializer;
|
|
|
|
/* pkt 1,2 remove crc */
|
|
tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
|
|
pkt_mb1 = vreinterpretq_u8_u16(tmp);
|
|
tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
|
|
pkt_mb2 = vreinterpretq_u8_u16(tmp);
|
|
|
|
pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
|
|
pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
|
|
pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
|
|
pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
|
|
|
|
/* pkt 3,4 convert format from desc to pktmbuf */
|
|
pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
|
|
pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);
|
|
|
|
/* pkt 1,2 save to rx_pkts mbuf */
|
|
vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
|
|
pkt_mb1);
|
|
vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
|
|
pkt_mb2);
|
|
|
|
/* pkt 3,4 remove crc */
|
|
tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
|
|
pkt_mb3 = vreinterpretq_u8_u16(tmp);
|
|
tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
|
|
pkt_mb4 = vreinterpretq_u8_u16(tmp);
|
|
|
|
/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
|
|
*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
|
|
rxq->mbuf_initializer;
|
|
*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
|
|
rxq->mbuf_initializer;
|
|
|
|
/* pkt 3,4 save to rx_pkts mbuf */
|
|
vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
|
|
pkt_mb3);
|
|
vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
|
|
pkt_mb4);
|
|
|
|
rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
|
|
|
|
parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
|
|
&rxdp[offset], bd_valid_num);
|
|
if (unlikely(parse_retcode))
|
|
(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
|
|
|
|
rte_prefetch0(sw_ring[pos +
|
|
HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
|
|
rte_prefetch0(sw_ring[pos +
|
|
HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
|
|
rte_prefetch0(sw_ring[pos +
|
|
HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
|
|
rte_prefetch0(sw_ring[pos +
|
|
HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
|
|
|
|
nb_rx += bd_valid_num;
|
|
if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
|
|
break;
|
|
}
|
|
|
|
rxq->rx_rearm_nb += nb_rx;
|
|
rxq->next_to_use += nb_rx;
|
|
if (rxq->next_to_use >= rxq->nb_rx_desc)
|
|
rxq->next_to_use = 0;
|
|
|
|
return nb_rx;
|
|
}
|
|
#endif /* _HNS3_RXTX_VEC_NEON_H_ */
|