net/i40e: reduce L1 cache misses in NEON Rx
On the N1 platform, the packet mbuf loads and descriptor loads are hot spots that limit the performance of the "desc_to_ptype_v" and "desc_to_olflags_v" functions in the i40e Rx NEON path. This is because the packet mbufs and descriptors are evicted from the L1 data cache (l1d-cache) to the L2 cache (l2d-cache). To reduce L1 data cache misses and improve performance, reorder the code and move the "desc_to_ptype_v" and "desc_to_olflags_v" calls forward to the point where the packet mbufs and descriptors have just been loaded. Test results (DPDK 21.08-rc1, gcc-9): on n1sdp, the patch improves performance by 1.8%; on thunderx2, there is no performance change. Signed-off-by: Feifei Wang <feifei.wang2@arm.com> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
This commit is contained in:
parent
decc3b6aa5
commit
319df9f9bf
@ -301,18 +301,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
|
||||
rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
|
||||
}
|
||||
|
||||
/* C.1 4=>2 filter staterr info only */
|
||||
sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
|
||||
vreinterpretq_u16_u64(descs[3]));
|
||||
sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
|
||||
vreinterpretq_u16_u64(descs[2]));
|
||||
|
||||
/* C.2 get 4 pkts staterr value */
|
||||
staterr = vzipq_u16(sterr_tmp1.val[1],
|
||||
sterr_tmp2.val[1]).val[0];
|
||||
|
||||
desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
|
||||
|
||||
/* pkts shift the pktlen field to be 16-bit aligned*/
|
||||
uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
|
||||
len_shl);
|
||||
@ -367,10 +355,22 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
|
||||
|
||||
desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
|
||||
|
||||
desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
|
||||
|
||||
if (likely(pos + RTE_I40E_DESCS_PER_LOOP < nb_pkts)) {
|
||||
rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);
|
||||
}
|
||||
|
||||
/* C.1 4=>2 filter staterr info only */
|
||||
sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
|
||||
vreinterpretq_u16_u64(descs[3]));
|
||||
sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
|
||||
vreinterpretq_u16_u64(descs[2]));
|
||||
|
||||
/* C.2 get 4 pkts staterr value */
|
||||
staterr = vzipq_u16(sterr_tmp1.val[1],
|
||||
sterr_tmp2.val[1]).val[0];
|
||||
|
||||
/* C* extract and record EOP bit */
|
||||
if (split_packet) {
|
||||
uint8x16_t eop_shuf_mask = {
|
||||
|
Loading…
Reference in New Issue
Block a user