numam-dpdk/examples/l3fwd/l3fwd_lpm_neon.h
Ruifeng Wang d76d65060c examples/l3fwd: remove useless calculations in NEON LPM
Both L2 and L3 headers will be used in forward processing. And these
two headers are in the same cache line. It has the same effect for
prefetching with L2 header address and prefetching with L3 header
address.

Changed to use L2 header address for prefetching. The change showed
no measurable performance improvement, but it definitely removed
unnecessary instructions for address calculation.

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
2021-07-05 11:30:58 +02:00

165 lines
4.0 KiB
C

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2018 Intel Corporation.
* Copyright(c) 2017-2018 Linaro Limited.
*/
#ifndef __L3FWD_LPM_NEON_H__
#define __L3FWD_LPM_NEON_H__
#include <arm_neon.h>
#include "l3fwd_neon.h"
/*
* Read packet_type and destination IPV4 addresses from 4 mbufs.
*/
static inline void
processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
int32x4_t *dip,
uint32_t *ipv4_flag)
{
struct rte_ipv4_hdr *ipv4_hdr;
struct rte_ether_hdr *eth_hdr;
int32_t dst[FWDSTEP];
eth_hdr = rte_pktmbuf_mtod(pkt[0], struct rte_ether_hdr *);
ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
dst[0] = ipv4_hdr->dst_addr;
ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
eth_hdr = rte_pktmbuf_mtod(pkt[1], struct rte_ether_hdr *);
ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
dst[1] = ipv4_hdr->dst_addr;
ipv4_flag[0] &= pkt[1]->packet_type;
eth_hdr = rte_pktmbuf_mtod(pkt[2], struct rte_ether_hdr *);
ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
dst[2] = ipv4_hdr->dst_addr;
ipv4_flag[0] &= pkt[2]->packet_type;
eth_hdr = rte_pktmbuf_mtod(pkt[3], struct rte_ether_hdr *);
ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
dst[3] = ipv4_hdr->dst_addr;
ipv4_flag[0] &= pkt[3]->packet_type;
dip[0] = vld1q_s32(dst);
}
/*
* Lookup into LPM for destination port.
* If lookup fails, use incoming port (portid) as destination port.
*/
static inline void
processx4_step2(const struct lcore_conf *qconf,
int32x4_t dip,
uint32_t ipv4_flag,
uint16_t portid,
struct rte_mbuf *pkt[FWDSTEP],
uint16_t dprt[FWDSTEP])
{
rte_xmm_t dst;
dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
/* if all 4 packets are IPV4. */
if (likely(ipv4_flag)) {
rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
portid);
/* get rid of unused upper 16 bit for each dport. */
vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
} else {
dst.x = dip;
dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
dst.u32[0], portid);
dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
dst.u32[1], portid);
dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
dst.u32[2], portid);
dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
dst.u32[3], portid);
}
}
/*
* Buffer optimized handling of packets, invoked
* from main_loop.
*/
static inline void
l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
uint16_t portid, struct lcore_conf *qconf)
{
int32_t i = 0, j = 0;
uint16_t dst_port[MAX_PKT_BURST];
int32x4_t dip;
uint32_t ipv4_flag;
const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
const int32_t m = nb_rx % FWDSTEP;
if (k) {
for (i = 0; i < FWDSTEP; i++) {
rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
void *));
}
for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
for (i = 0; i < FWDSTEP; i++) {
rte_prefetch0(rte_pktmbuf_mtod(
pkts_burst[j + i + FWDSTEP],
void *));
}
processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
processx4_step2(qconf, dip, ipv4_flag, portid,
&pkts_burst[j], &dst_port[j]);
}
processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
&dst_port[j]);
j += FWDSTEP;
}
if (m) {
/* Prefetch last up to 3 packets one by one */
switch (m) {
case 3:
rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
void *));
j++;
/* fallthrough */
case 2:
rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
void *));
j++;
/* fallthrough */
case 1:
rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
void *));
j++;
}
j -= m;
/* Classify last up to 3 packets one by one */
switch (m) {
case 3:
dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
portid);
j++;
/* fallthrough */
case 2:
dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
portid);
j++;
/* fallthrough */
case 1:
dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
portid);
}
}
send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
}
#endif /* __L3FWD_LPM_NEON_H__ */