numam-dpdk/drivers/net/virtio/virtio_rxtx_simple_neon.c
Xueming Li d99088431e net/virtio: fix vectorized Rx queue rearm
When an Rx queue works in vectorized mode with rxd <= 512, under a high
PPS traffic rate, testpmd often starts, receives a number of packets
equal to rxd, and then receives nothing further.

Testpmd starts with an rxq flush that tries to receive and drop up to
MAX_PKT_BURST (512) packets. When the Rx burst size is >= the Rx queue
size, all descriptors in the used queue are consumed without any rearm,
so the device cannot receive more packets. Every subsequent Rx burst then
returns immediately because no used descriptors are found, the rearm
logic is skipped, and the Rx vq stays starved.

To avoid starving the Rx vq, always check the available queue and rearm
it if needed, even when the device reports no used descriptors.
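
For reference, a minimal sketch of the resulting order of operations in
the vectorized Rx burst (identifiers as in the driver source below, error
handling trimmed):

	/* Refill the avail ring first, even if the device has not
	 * reported any used descriptor yet. */
	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
		virtio_rxq_rearm_vec(rxvq);
		if (unlikely(virtqueue_kick_prepare(vq)))
			virtqueue_notify(vq);
	}

	/* Only then look for used descriptors; returning here no longer
	 * leaves the queue without posted buffers. */
	nb_used = virtqueue_nused(vq);
	if (unlikely(nb_used == 0))
		return 0;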

Fixes: fc3d66212f ("virtio: add vector Rx")
Fixes: 2d7c37194e ("net/virtio: add NEON based Rx handler")
Fixes: 52b5a707e6 ("net/virtio: add Altivec Rx")
Cc: stable@dpdk.org

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Reviewed-by: David Christensen <drc@linux.vnet.ibm.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
2021-05-04 10:08:11 +02:00

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 Cavium, Inc
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <rte_byteorder.h>
#include <rte_branch_prediction.h>
#include <rte_cycles.h>
#include <rte_ether.h>
#include <ethdev_driver.h>
#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_prefetch.h>
#include <rte_string_fns.h>
#include <rte_vect.h>

#include "virtio_rxtx_simple.h"

#define RTE_VIRTIO_DESC_PER_LOOP 8
/* virtio vPMD receive routine, only accepts (nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
 *
 * This routine is for non-mergeable RX, one desc for each guest buffer.
 * This routine is based on the RX ring layout optimization. Each entry in the
 * avail ring points to the desc with the same index in the desc ring and this
 * will never be changed in the driver.
 *
 * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just returns no packet
 */
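/*
 * With this fixed mapping, the rearm path (virtio_rxq_rearm_vec(), defined in
 * virtio_rxtx_simple.h) only has to refill descriptors and advance the avail
 * index; the avail ring entries themselves are never rewritten here.
 */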
uint16_t
virtio_recv_pkts_vec(void *rx_queue,
		struct rte_mbuf **__rte_restrict rx_pkts,
		uint16_t nb_pkts)
{
	struct virtnet_rx *rxvq = rx_queue;
	struct virtqueue *vq = virtnet_rxq_to_vq(rxvq);
	struct virtio_hw *hw = vq->hw;
	uint16_t nb_used, nb_total;
	uint16_t desc_idx;
	struct vring_used_elem *rused;
	struct rte_mbuf **sw_ring;
	struct rte_mbuf **sw_ring_end;
	struct rte_mbuf **ref_rx_pkts;
	uint16_t nb_pkts_received = 0;
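
	/* Shuffle masks: pull the 16-bit length out of each vring_used_elem
	 * (bytes 4-5 of the first element in a 16-byte pair, bytes 12-13 of
	 * the second) and place it at the pkt_len and data_len offsets of the
	 * mbuf rx_descriptor_fields1 layout; 0xFF lanes read back as zero. */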
	uint8x16_t shuf_msk1 = {
		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
		4, 5, 0xFF, 0xFF, /* pkt len */
		4, 5, /* dat len */
		0xFF, 0xFF, /* vlan tci */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	uint8x16_t shuf_msk2 = {
		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
		12, 13, 0xFF, 0xFF, /* pkt len */
		12, 13, /* dat len */
		0xFF, 0xFF, /* vlan tci */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	/* Subtract the header length.
	 * In which case do we need the header length in used->len ?
	 */
	uint16x8_t len_adjust = {
		0, 0,
		(uint16_t)hw->vtnet_hdr_size, 0,
		(uint16_t)hw->vtnet_hdr_size,
		0,
		0, 0
	};
	if (unlikely(hw->started == 0))
		return nb_pkts_received;

	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
		return 0;
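
	/* Rearm before checking for used descriptors, not only after packets
	 * have been received; otherwise a burst that consumed every
	 * descriptor would leave the queue starved with nothing posted. */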
	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
		virtio_rxq_rearm_vec(rxvq);
		if (unlikely(virtqueue_kick_prepare(vq)))
			virtqueue_notify(vq);
	}

	/* virtqueue_nused has a load-acquire or rte_io_rmb inside */
	nb_used = virtqueue_nused(vq);

	if (unlikely(nb_used == 0))
		return 0;
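
	/* Consume descriptors in multiples of the loop stride, and never more
	 * than the caller asked for. */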
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
	nb_used = RTE_MIN(nb_used, nb_pkts);
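
	/* Resume at the current used-ring consumer position; sw_ring_end
	 * marks the wrap point of the shadow mbuf ring. */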
	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
	rused = &vq->vq_split.ring.used->ring[desc_idx];
	sw_ring = &vq->sw_ring[desc_idx];
	sw_ring_end = &vq->sw_ring[vq->vq_nentries];

	rte_prefetch_non_temporal(rused);

	nb_total = nb_used;
	ref_rx_pkts = rx_pkts;
	for (nb_pkts_received = 0;
		nb_pkts_received < nb_total;) {
		uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
		uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
		uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
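
		/* Load eight mbuf pointers from sw_ring and the matching
		 * used elements, two at a time, handing the mbuf pointers
		 * directly to the caller's rx_pkts array. */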
		mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
		desc[0] = vld1q_u64((uint64_t *)(rused + 0));
		vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);

		mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
		desc[1] = vld1q_u64((uint64_t *)(rused + 2));
		vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);

		mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
		desc[2] = vld1q_u64((uint64_t *)(rused + 4));
		vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);

		mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
		desc[3] = vld1q_u64((uint64_t *)(rused + 6));
		vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
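
		/* For each pair of used elements, shuffle the length into the
		 * mbuf rx_descriptor_fields1 layout and subtract the
		 * virtio-net header length. */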
		pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[0]), shuf_msk2));
		pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[0]), shuf_msk1));
		pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
		pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
		vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
			pkt_mb[1]);
		vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
			pkt_mb[0]);

		pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[1]), shuf_msk2));
		pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[1]), shuf_msk1));
		pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
		pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
		vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
			pkt_mb[3]);
		vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
			pkt_mb[2]);

		pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[2]), shuf_msk2));
		pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[2]), shuf_msk1));
		pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
		pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
		vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
			pkt_mb[5]);
		vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
			pkt_mb[4]);

		pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[3]), shuf_msk2));
		pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[3]), shuf_msk1));
		pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
		pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
		vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
			pkt_mb[7]);
		vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
			pkt_mb[6]);
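
		/* Advance by one full stride, or stop early when the burst is
		 * complete or the shadow ring wraps; any remainder is picked
		 * up by the next call. */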
		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
			if (sw_ring + nb_used <= sw_ring_end)
				nb_pkts_received += nb_used;
			else
				nb_pkts_received += sw_ring_end - sw_ring;
			break;
		} else {
			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
				sw_ring_end)) {
				nb_pkts_received += sw_ring_end - sw_ring;
				break;
			} else {
				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;

				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
				rused += RTE_VIRTIO_DESC_PER_LOOP;
				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
			}
		}
	}
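
	/* Publish the new consumer position, return the consumed slots to the
	 * free count and update the receive statistics. */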
	vq->vq_used_cons_idx += nb_pkts_received;
	vq->vq_free_cnt += nb_pkts_received;
	rxvq->stats.packets += nb_pkts_received;
	for (nb_used = 0; nb_used < nb_pkts_received; nb_used++)
		virtio_update_packet_stats(&rxvq->stats, ref_rx_pkts[nb_used]);

	return nb_pkts_received;
}