numam-dpdk/lib/vhost/virtio_net.c

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2016 Intel Corporation
*/
#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>
#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_net.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_vhost.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>
#include <rte_arp.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
#include <rte_vhost_async.h>
#include "iotlb.h"
#include "vhost.h"
#define MAX_BATCH_LEN 256
#define VHOST_ASYNC_BATCH_THRESHOLD 32
static __rte_always_inline bool
rxvq_is_mergeable(struct virtio_net *dev)
{
return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
}
static __rte_always_inline bool
virtio_net_is_inorder(struct virtio_net *dev)
{
return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
}
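/*
 * A virtqueue index is valid when it is in range and its parity matches
 * the direction: even indices are guest RX queues (the vhost enqueue
 * path passes is_tx == 0), odd indices are guest TX queues.
 */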
static bool
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
{
return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
}
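/*
 * Flush the small copies batched in vq->batch_copy_elems. The enqueue
 * variant also logs the written guest areas for dirty-page tracking.
 */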
static inline void
do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
struct batch_copy_elem *elem = vq->batch_copy_elems;
uint16_t count = vq->batch_copy_nb_elems;
int i;
for (i = 0; i < count; i++) {
rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
elem[i].len);
PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
}
vq->batch_copy_nb_elems = 0;
}
static inline void
do_data_copy_dequeue(struct vhost_virtqueue *vq)
{
struct batch_copy_elem *elem = vq->batch_copy_elems;
uint16_t count = vq->batch_copy_nb_elems;
int i;
for (i = 0; i < count; i++)
rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
vq->batch_copy_nb_elems = 0;
}
static __rte_always_inline void
do_flush_shadow_used_ring_split(struct virtio_net *dev,
struct vhost_virtqueue *vq,
uint16_t to, uint16_t from, uint16_t size)
{
rte_memcpy(&vq->used->ring[to],
&vq->shadow_used_split[from],
size * sizeof(struct vring_used_elem));
vhost_log_cache_used_vring(dev, vq,
offsetof(struct vring_used, ring[to]),
size * sizeof(struct vring_used_elem));
}
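/*
 * Flush the shadow used entries to the split ring's used ring. The copy
 * is done in up to two chunks when the used index wraps, and used->idx
 * is updated with a release store so the guest sees the new entries
 * before the index.
 */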
static __rte_always_inline void
flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
if (used_idx + vq->shadow_used_idx <= vq->size) {
do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
vq->shadow_used_idx);
} else {
uint16_t size;
/* update used ring interval [used_idx, vq->size) */
size = vq->size - used_idx;
do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
/* update the remaining interval [0, shadow_used_idx - size) */
do_flush_shadow_used_ring_split(dev, vq, 0, size,
vq->shadow_used_idx - size);
}
vq->last_used_idx += vq->shadow_used_idx;
vhost_log_cache_sync(dev, vq);
__atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
__ATOMIC_RELEASE);
vq->shadow_used_idx = 0;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
}
static __rte_always_inline void
update_shadow_used_ring_split(struct vhost_virtqueue *vq,
uint16_t desc_idx, uint32_t len)
{
uint16_t i = vq->shadow_used_idx++;
vq->shadow_used_split[i].id = desc_idx;
vq->shadow_used_split[i].len = len;
}
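/*
 * Flush shadow used entries to the packed ring: ids and lens are written
 * first, then, after a release fence, the descriptor flags. The head
 * descriptor's flags are written last so the whole chain becomes visible
 * to the guest at once.
 */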
static __rte_always_inline void
vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq)
{
int i;
uint16_t used_idx = vq->last_used_idx;
uint16_t head_idx = vq->last_used_idx;
uint16_t head_flags = 0;
/* Split loop in two to save memory barriers */
for (i = 0; i < vq->shadow_used_idx; i++) {
vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
used_idx += vq->shadow_used_packed[i].count;
if (used_idx >= vq->size)
used_idx -= vq->size;
}
/* The ordering for storing desc flags needs to be enforced. */
rte_atomic_thread_fence(__ATOMIC_RELEASE);
for (i = 0; i < vq->shadow_used_idx; i++) {
uint16_t flags;
if (vq->shadow_used_packed[i].len)
flags = VRING_DESC_F_WRITE;
else
flags = 0;
if (vq->used_wrap_counter) {
flags |= VRING_DESC_F_USED;
flags |= VRING_DESC_F_AVAIL;
} else {
flags &= ~VRING_DESC_F_USED;
flags &= ~VRING_DESC_F_AVAIL;
}
if (i > 0) {
vq->desc_packed[vq->last_used_idx].flags = flags;
vhost_log_cache_used_vring(dev, vq,
vq->last_used_idx *
sizeof(struct vring_packed_desc),
sizeof(struct vring_packed_desc));
} else {
head_idx = vq->last_used_idx;
head_flags = flags;
}
vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
}
vq->desc_packed[head_idx].flags = head_flags;
vhost_log_cache_used_vring(dev, vq,
head_idx *
sizeof(struct vring_packed_desc),
sizeof(struct vring_packed_desc));
vq->shadow_used_idx = 0;
vhost_log_cache_sync(dev, vq);
}
static __rte_always_inline void
vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq)
{
struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
/* desc flags is the synchronization point for virtio packed vring */
__atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
used_elem->flags, __ATOMIC_RELEASE);
vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
sizeof(struct vring_packed_desc),
sizeof(struct vring_packed_desc));
vq->shadow_used_idx = 0;
vhost_log_cache_sync(dev, vq);
}
static __rte_always_inline void
vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
uint64_t *lens,
uint16_t *ids)
{
uint16_t i;
uint16_t flags;
uint16_t last_used_idx;
struct vring_packed_desc *desc_base;
last_used_idx = vq->last_used_idx;
desc_base = &vq->desc_packed[last_used_idx];
flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
desc_base[i].id = ids[i];
desc_base[i].len = lens[i];
}
rte_atomic_thread_fence(__ATOMIC_RELEASE);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
desc_base[i].flags = flags;
}
vhost_log_cache_used_vring(dev, vq, last_used_idx *
sizeof(struct vring_packed_desc),
sizeof(struct vring_packed_desc) *
PACKED_BATCH_SIZE);
vhost_log_cache_sync(dev, vq);
vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
}
static __rte_always_inline void
vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
uint16_t id)
{
vq->shadow_used_packed[0].id = id;
if (!vq->shadow_used_idx) {
vq->shadow_last_used_idx = vq->last_used_idx;
vq->shadow_used_packed[0].flags =
PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
vq->shadow_used_packed[0].len = 0;
vq->shadow_used_packed[0].count = 1;
vq->shadow_used_idx++;
}
vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
}
static __rte_always_inline void
vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
uint16_t *ids)
{
uint16_t flags;
uint16_t i;
uint16_t begin;
flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
if (!vq->shadow_used_idx) {
vq->shadow_last_used_idx = vq->last_used_idx;
vq->shadow_used_packed[0].id = ids[0];
vq->shadow_used_packed[0].len = 0;
vq->shadow_used_packed[0].count = 1;
vq->shadow_used_packed[0].flags = flags;
vq->shadow_used_idx++;
begin = 1;
} else
begin = 0;
vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
vq->desc_packed[vq->last_used_idx + i].id = ids[i];
vq->desc_packed[vq->last_used_idx + i].len = 0;
}
rte_atomic_thread_fence(__ATOMIC_RELEASE);
vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
vq->desc_packed[vq->last_used_idx + i].flags = flags;
vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
sizeof(struct vring_packed_desc),
sizeof(struct vring_packed_desc) *
PACKED_BATCH_SIZE);
vhost_log_cache_sync(dev, vq);
vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
}
static __rte_always_inline void
vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
uint16_t buf_id,
uint16_t count)
{
uint16_t flags;
flags = vq->desc_packed[vq->last_used_idx].flags;
if (vq->used_wrap_counter) {
flags |= VRING_DESC_F_USED;
flags |= VRING_DESC_F_AVAIL;
} else {
flags &= ~VRING_DESC_F_USED;
flags &= ~VRING_DESC_F_AVAIL;
}
if (!vq->shadow_used_idx) {
vq->shadow_last_used_idx = vq->last_used_idx;
vq->shadow_used_packed[0].id = buf_id;
vq->shadow_used_packed[0].len = 0;
vq->shadow_used_packed[0].flags = flags;
vq->shadow_used_idx++;
} else {
vq->desc_packed[vq->last_used_idx].id = buf_id;
vq->desc_packed[vq->last_used_idx].len = 0;
vq->desc_packed[vq->last_used_idx].flags = flags;
}
vq_inc_last_used_packed(vq, count);
}
static __rte_always_inline void
vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
uint16_t buf_id,
uint16_t count)
{
uint16_t flags;
vq->shadow_used_packed[0].id = buf_id;
flags = vq->desc_packed[vq->last_used_idx].flags;
if (vq->used_wrap_counter) {
flags |= VRING_DESC_F_USED;
flags |= VRING_DESC_F_AVAIL;
} else {
flags &= ~VRING_DESC_F_USED;
flags &= ~VRING_DESC_F_AVAIL;
}
if (!vq->shadow_used_idx) {
vq->shadow_last_used_idx = vq->last_used_idx;
vq->shadow_used_packed[0].len = 0;
vq->shadow_used_packed[0].flags = flags;
vq->shadow_used_idx++;
}
vq_inc_last_used_packed(vq, count);
}
static __rte_always_inline void
vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
uint32_t *len,
uint16_t *id,
uint16_t *count,
uint16_t num_buffers)
{
uint16_t i;
for (i = 0; i < num_buffers; i++) {
/* enqueue shadow flush action aligned with batch num */
if (!vq->shadow_used_idx)
vq->shadow_aligned_idx = vq->last_used_idx &
PACKED_BATCH_MASK;
vq->shadow_used_packed[vq->shadow_used_idx].id = id[i];
vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
vq->shadow_aligned_idx += count[i];
vq->shadow_used_idx++;
}
}
static __rte_always_inline void
vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
uint32_t *len,
uint16_t *id,
uint16_t *count,
uint16_t num_buffers)
{
vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
do_data_copy_enqueue(dev, vq);
vhost_flush_enqueue_shadow_packed(dev, vq);
}
}
/* avoid unnecessary write operations, to lessen cache issues */
#define ASSIGN_UNLESS_EQUAL(var, val) do { \
if ((var) != (val)) \
(var) = (val); \
} while (0)
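/*
 * Translate mbuf TX offload requests into the virtio net header seen by
 * the guest (virtio 1.0, 5.1.6.4 "Processing of Incoming Packets"):
 * pending L4 checksums become VIRTIO_NET_HDR_F_NEEDS_CSUM with
 * csum_start/csum_offset, and TSO/UFO requests become the matching
 * gso_type/gso_size.
 */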
static __rte_always_inline void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
uint64_t csum_l4 = m_buf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
csum_l4 |= RTE_MBUF_F_TX_TCP_CKSUM;
if (csum_l4) {
net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
switch (csum_l4) {
case RTE_MBUF_F_TX_TCP_CKSUM:
net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
cksum));
break;
case RTE_MBUF_F_TX_UDP_CKSUM:
net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
dgram_cksum));
break;
case RTE_MBUF_F_TX_SCTP_CKSUM:
net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
cksum));
break;
}
} else {
ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
}
/* IP cksum cannot be offloaded through the virtio header, so compute it here */
if (m_buf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
struct rte_ipv4_hdr *ipv4_hdr;
ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
m_buf->l2_len);
ipv4_hdr->hdr_checksum = 0;
ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
}
if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
if (m_buf->ol_flags & RTE_MBUF_F_TX_IPV4)
net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
else
net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
} else if (m_buf->ol_flags & RTE_MBUF_F_TX_UDP_SEG) {
net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
m_buf->l4_len;
} else {
ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
}
}
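/*
 * Map one descriptor's guest IOVA range to host virtual addresses. A
 * descriptor may span several host-contiguous chunks, so it can consume
 * more than one buf_vec entry.
 */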
static __rte_always_inline int
map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector *buf_vec, uint16_t *vec_idx,
uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
{
uint16_t vec_id = *vec_idx;
while (desc_len) {
uint64_t desc_addr;
uint64_t desc_chunck_len = desc_len;
if (unlikely(vec_id >= BUF_VECTOR_MAX))
return -1;
desc_addr = vhost_iova_to_vva(dev, vq,
desc_iova,
&desc_chunck_len,
perm);
if (unlikely(!desc_addr))
return -1;
rte_prefetch0((void *)(uintptr_t)desc_addr);
buf_vec[vec_id].buf_iova = desc_iova;
buf_vec[vec_id].buf_addr = desc_addr;
buf_vec[vec_id].buf_len = desc_chunck_len;
desc_len -= desc_chunck_len;
desc_iova += desc_chunck_len;
vec_id++;
}
*vec_idx = vec_id;
return 0;
}
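/*
 * Collect the buffers of the split-ring descriptor chain referenced by
 * avail ring slot avail_idx into buf_vec. Indirect descriptor tables are
 * followed, and copied first when they are not contiguous in process VA
 * space.
 */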
static __rte_always_inline int
fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t avail_idx, uint16_t *vec_idx,
struct buf_vector *buf_vec, uint16_t *desc_chain_head,
uint32_t *desc_chain_len, uint8_t perm)
{
uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
uint16_t vec_id = *vec_idx;
uint32_t len = 0;
uint64_t dlen;
uint32_t nr_descs = vq->size;
uint32_t cnt = 0;
struct vring_desc *descs = vq->desc;
struct vring_desc *idesc = NULL;
if (unlikely(idx >= vq->size))
return -1;
*desc_chain_head = idx;
if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
dlen = vq->desc[idx].len;
nr_descs = dlen / sizeof(struct vring_desc);
if (unlikely(nr_descs > vq->size))
return -1;
descs = (struct vring_desc *)(uintptr_t)
vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
&dlen,
VHOST_ACCESS_RO);
if (unlikely(!descs))
return -1;
if (unlikely(dlen < vq->desc[idx].len)) {
/*
 * The indirect desc table is not contiguous
 * in process VA space, so we have to copy it.
 */
idesc = vhost_alloc_copy_ind_table(dev, vq,
vq->desc[idx].addr, vq->desc[idx].len);
if (unlikely(!idesc))
return -1;
descs = idesc;
}
idx = 0;
}
while (1) {
if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
free_ind_table(idesc);
return -1;
}
dlen = descs[idx].len;
len += dlen;
if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
descs[idx].addr, dlen,
perm))) {
free_ind_table(idesc);
return -1;
}
if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
break;
idx = descs[idx].next;
}
*desc_chain_len = len;
*vec_idx = vec_id;
if (unlikely(!!idesc))
free_ind_table(idesc);
return 0;
}
/*
* Returns -1 on fail, 0 on success
*/
static inline int
reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t size, struct buf_vector *buf_vec,
uint16_t *num_buffers, uint16_t avail_head,
uint16_t *nr_vec)
{
uint16_t cur_idx;
uint16_t vec_idx = 0;
uint16_t max_tries, tries = 0;
uint16_t head_idx = 0;
uint32_t len = 0;
*num_buffers = 0;
cur_idx = vq->last_avail_idx;
if (rxvq_is_mergeable(dev))
max_tries = vq->size - 1;
else
max_tries = 1;
while (size > 0) {
if (unlikely(cur_idx == avail_head))
return -1;
/*
 * If we have tried all available ring items and still
 * cannot get enough buffers, something abnormal has
 * happened.
 */
if (unlikely(++tries > max_tries))
return -1;
if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
&vec_idx, buf_vec,
&head_idx, &len,
VHOST_ACCESS_RW) < 0))
return -1;
len = RTE_MIN(len, size);
update_shadow_used_ring_split(vq, head_idx, len);
size -= len;
cur_idx++;
*num_buffers += 1;
}
*nr_vec = vec_idx;
return 0;
}
static __rte_always_inline int
fill_vec_buf_packed_indirect(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct vring_packed_desc *desc, uint16_t *vec_idx,
struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
{
uint16_t i;
uint32_t nr_descs;
uint16_t vec_id = *vec_idx;
uint64_t dlen;
struct vring_packed_desc *descs, *idescs = NULL;
dlen = desc->len;
descs = (struct vring_packed_desc *)(uintptr_t)
vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
if (unlikely(!descs))
return -1;
if (unlikely(dlen < desc->len)) {
/*
 * The indirect desc table is not contiguous
 * in process VA space, so we have to copy it.
 */
idescs = vhost_alloc_copy_ind_table(dev,
vq, desc->addr, desc->len);
if (unlikely(!idescs))
return -1;
descs = idescs;
}
nr_descs = desc->len / sizeof(struct vring_packed_desc);
if (unlikely(nr_descs >= vq->size)) {
free_ind_table(idescs);
return -1;
}
for (i = 0; i < nr_descs; i++) {
if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
free_ind_table(idescs);
return -1;
}
dlen = descs[i].len;
*len += dlen;
if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
descs[i].addr, dlen,
perm)))
return -1;
}
*vec_idx = vec_id;
if (unlikely(!!idescs))
free_ind_table(idescs);
return 0;
}
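/*
 * Collect the buffers of one packed-ring descriptor chain into buf_vec,
 * following indirect tables and toggling the wrap counter when the chain
 * crosses the end of the ring.
 */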
static __rte_always_inline int
fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t avail_idx, uint16_t *desc_count,
struct buf_vector *buf_vec, uint16_t *vec_idx,
uint16_t *buf_id, uint32_t *len, uint8_t perm)
{
bool wrap_counter = vq->avail_wrap_counter;
struct vring_packed_desc *descs = vq->desc_packed;
uint16_t vec_id = *vec_idx;
uint64_t dlen;
if (avail_idx < vq->last_avail_idx)
wrap_counter ^= 1;
/*
* Perform a load-acquire barrier in desc_is_avail to
* enforce the ordering between desc flags and desc
* content.
*/
if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
return -1;
*desc_count = 0;
*len = 0;
while (1) {
if (unlikely(vec_id >= BUF_VECTOR_MAX))
return -1;
if (unlikely(*desc_count >= vq->size))
return -1;
*desc_count += 1;
*buf_id = descs[avail_idx].id;
if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
&descs[avail_idx],
&vec_id, buf_vec,
len, perm) < 0))
return -1;
} else {
dlen = descs[avail_idx].len;
*len += dlen;
if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
descs[avail_idx].addr,
dlen,
perm)))
return -1;
}
if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
break;
if (++avail_idx >= vq->size) {
avail_idx -= vq->size;
wrap_counter ^= 1;
}
}
*vec_idx = vec_id;
return 0;
}
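/*
 * Copy the virtio net header into the guest buffers piece by piece, used
 * when the first buffer is too small to hold the complete header.
 */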
static __rte_noinline void
copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector *buf_vec,
struct virtio_net_hdr_mrg_rxbuf *hdr)
{
uint64_t len;
uint64_t remain = dev->vhost_hlen;
uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
uint64_t iova = buf_vec->buf_iova;
while (remain) {
len = RTE_MIN(remain,
buf_vec->buf_len);
dst = buf_vec->buf_addr;
rte_memcpy((void *)(uintptr_t)dst,
(void *)(uintptr_t)src,
len);
PRINT_PACKET(dev, (uintptr_t)dst,
(uint32_t)len, 0);
vhost_log_cache_write_iova(dev, vq,
iova, len);
remain -= len;
iova += len;
src += len;
buf_vec++;
}
}
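/*
 * Copy one mbuf chain into the guest buffers described by buf_vec.
 * Copies larger than MAX_BATCH_LEN are performed (and logged)
 * immediately; smaller ones are deferred to the batch_copy array and
 * flushed by do_data_copy_enqueue().
 */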
static __rte_always_inline int
copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct rte_mbuf *m, struct buf_vector *buf_vec,
uint16_t nr_vec, uint16_t num_buffers)
{
uint32_t vec_idx = 0;
uint32_t mbuf_offset, mbuf_avail;
uint32_t buf_offset, buf_avail;
uint64_t buf_addr, buf_iova, buf_len;
uint32_t cpy_len;
uint64_t hdr_addr;
struct rte_mbuf *hdr_mbuf;
struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
int error = 0;
if (unlikely(m == NULL)) {
error = -1;
goto out;
}
buf_addr = buf_vec[vec_idx].buf_addr;
buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
error = -1;
goto out;
}
hdr_mbuf = m;
hdr_addr = buf_addr;
if (unlikely(buf_len < dev->vhost_hlen)) {
memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
hdr = &tmp_hdr;
} else
hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
dev->vid, num_buffers);
if (unlikely(buf_len < dev->vhost_hlen)) {
buf_offset = dev->vhost_hlen - buf_len;
vec_idx++;
buf_addr = buf_vec[vec_idx].buf_addr;
buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_avail = buf_len - buf_offset;
} else {
buf_offset = dev->vhost_hlen;
buf_avail = buf_len - dev->vhost_hlen;
}
mbuf_avail = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
while (mbuf_avail != 0 || m->next != NULL) {
/* done with current buf, get the next one */
if (buf_avail == 0) {
vec_idx++;
if (unlikely(vec_idx >= nr_vec)) {
error = -1;
goto out;
}
buf_addr = buf_vec[vec_idx].buf_addr;
buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_offset = 0;
buf_avail = buf_len;
}
/* done with current mbuf, get the next one */
if (mbuf_avail == 0) {
m = m->next;
mbuf_offset = 0;
mbuf_avail = rte_pktmbuf_data_len(m);
}
if (hdr_addr) {
virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
if (rxvq_is_mergeable(dev))
ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
num_buffers);
if (unlikely(hdr == &tmp_hdr)) {
copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
} else {
PRINT_PACKET(dev, (uintptr_t)hdr_addr,
dev->vhost_hlen, 0);
vhost_log_cache_write_iova(dev, vq,
buf_vec[0].buf_iova,
dev->vhost_hlen);
}
hdr_addr = 0;
}
cpy_len = RTE_MIN(buf_avail, mbuf_avail);
if (likely(cpy_len > MAX_BATCH_LEN ||
vq->batch_copy_nb_elems >= vq->size)) {
rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
cpy_len);
vhost_log_cache_write_iova(dev, vq,
buf_iova + buf_offset,
cpy_len);
PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
cpy_len, 0);
} else {
batch_copy[vq->batch_copy_nb_elems].dst =
(void *)((uintptr_t)(buf_addr + buf_offset));
batch_copy[vq->batch_copy_nb_elems].src =
rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
batch_copy[vq->batch_copy_nb_elems].log_addr =
buf_iova + buf_offset;
batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
vq->batch_copy_nb_elems++;
}
mbuf_avail -= cpy_len;
mbuf_offset += cpy_len;
buf_avail -= cpy_len;
buf_offset += cpy_len;
}
out:
return error;
}
static __rte_always_inline void
async_fill_vec(struct iovec *v, void *base, size_t len)
{
v->iov_base = base;
v->iov_len = len;
}
static __rte_always_inline void
async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
struct iovec *vec, unsigned long nr_seg)
{
it->offset = 0;
it->count = count;
if (count) {
it->iov = vec;
it->nr_segs = nr_seg;
} else {
it->iov = 0;
it->nr_segs = 0;
}
}
static __rte_always_inline void
async_fill_desc(struct rte_vhost_async_desc *desc,
struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
{
desc->src = src;
desc->dst = dst;
}
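/*
 * Async counterpart of copy_mbuf_to_desc(): instead of copying, build
 * the source/destination iovec arrays for the DMA engine, with guest
 * buffers resolved to host physical ranges via gpa_to_first_hpa(). The
 * virtio net header is still written by the CPU.
 */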
static __rte_always_inline int
async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct rte_mbuf *m, struct buf_vector *buf_vec,
uint16_t nr_vec, uint16_t num_buffers,
struct iovec *src_iovec, struct iovec *dst_iovec,
struct rte_vhost_iov_iter *src_it,
struct rte_vhost_iov_iter *dst_it)
{
struct rte_mbuf *hdr_mbuf;
struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
uint64_t buf_addr, buf_iova;
uint64_t hdr_addr;
uint64_t mapped_len;
uint32_t vec_idx = 0;
uint32_t mbuf_offset, mbuf_avail;
uint32_t buf_offset, buf_avail;
uint32_t cpy_len, buf_len;
int error = 0;
uint32_t tlen = 0;
int tvec_idx = 0;
void *hpa;
if (unlikely(m == NULL)) {
error = -1;
goto out;
}
buf_addr = buf_vec[vec_idx].buf_addr;
buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
error = -1;
goto out;
}
hdr_mbuf = m;
hdr_addr = buf_addr;
if (unlikely(buf_len < dev->vhost_hlen)) {
memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
hdr = &tmp_hdr;
} else
hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
dev->vid, num_buffers);
if (unlikely(buf_len < dev->vhost_hlen)) {
buf_offset = dev->vhost_hlen - buf_len;
vec_idx++;
buf_addr = buf_vec[vec_idx].buf_addr;
buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_avail = buf_len - buf_offset;
} else {
buf_offset = dev->vhost_hlen;
buf_avail = buf_len - dev->vhost_hlen;
}
mbuf_avail = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
while (mbuf_avail != 0 || m->next != NULL) {
/* done with current buf, get the next one */
if (buf_avail == 0) {
vec_idx++;
if (unlikely(vec_idx >= nr_vec)) {
error = -1;
goto out;
}
buf_addr = buf_vec[vec_idx].buf_addr;
buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_offset = 0;
buf_avail = buf_len;
}
/* done with current mbuf, get the next one */
if (mbuf_avail == 0) {
m = m->next;
mbuf_offset = 0;
mbuf_avail = rte_pktmbuf_data_len(m);
}
if (hdr_addr) {
virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
if (rxvq_is_mergeable(dev))
ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
num_buffers);
if (unlikely(hdr == &tmp_hdr)) {
copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
} else {
PRINT_PACKET(dev, (uintptr_t)hdr_addr,
dev->vhost_hlen, 0);
vhost_log_cache_write_iova(dev, vq,
buf_vec[0].buf_iova,
dev->vhost_hlen);
}
hdr_addr = 0;
}
cpy_len = RTE_MIN(buf_avail, mbuf_avail);
while (unlikely(cpy_len)) {
hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
buf_iova + buf_offset,
cpy_len, &mapped_len);
if (unlikely(!hpa)) {
VHOST_LOG_DATA(ERR, "(%d) %s: failed to get hpa.\n",
dev->vid, __func__);
error = -1;
goto out;
}
async_fill_vec(src_iovec + tvec_idx,
(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
mbuf_offset), (size_t)mapped_len);
async_fill_vec(dst_iovec + tvec_idx,
hpa, (size_t)mapped_len);
tlen += (uint32_t)mapped_len;
cpy_len -= (uint32_t)mapped_len;
mbuf_avail -= (uint32_t)mapped_len;
mbuf_offset += (uint32_t)mapped_len;
buf_avail -= (uint32_t)mapped_len;
buf_offset += (uint32_t)mapped_len;
tvec_idx++;
}
}
async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
out:
return error;
}
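/*
 * Reserve enough packed-ring descriptors for one packet, copy it in and
 * record the consumed buffers in the shadow used ring. With mergeable RX
 * buffers a packet may span several descriptor chains.
 */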
static __rte_always_inline int
vhost_enqueue_single_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf *pkt,
struct buf_vector *buf_vec,
uint16_t *nr_descs)
{
uint16_t nr_vec = 0;
uint16_t avail_idx = vq->last_avail_idx;
uint16_t max_tries, tries = 0;
uint16_t buf_id = 0;
uint32_t len = 0;
uint16_t desc_count;
uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
uint16_t num_buffers = 0;
uint32_t buffer_len[vq->size];
uint16_t buffer_buf_id[vq->size];
uint16_t buffer_desc_count[vq->size];
if (rxvq_is_mergeable(dev))
max_tries = vq->size - 1;
else
max_tries = 1;
while (size > 0) {
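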
/*
 * If we have tried all available ring items and still
 * cannot get enough buffers, something abnormal has
 * happened.
 */
if (unlikely(++tries > max_tries))
return -1;
if (unlikely(fill_vec_buf_packed(dev, vq,
avail_idx, &desc_count,
buf_vec, &nr_vec,
&buf_id, &len,
VHOST_ACCESS_RW) < 0))
return -1;
len = RTE_MIN(len, size);
size -= len;
buffer_len[num_buffers] = len;
buffer_buf_id[num_buffers] = buf_id;
buffer_desc_count[num_buffers] = desc_count;
num_buffers += 1;
*nr_descs += desc_count;
avail_idx += desc_count;
if (avail_idx >= vq->size)
avail_idx -= vq->size;
}
if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
return -1;
vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
buffer_desc_count, num_buffers);
return 0;
}
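/*
 * Enqueue a burst of packets into a split virtqueue: reserve available
 * buffers, copy each packet, then flush the shadow used ring and notify
 * the guest if needed.
 */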
static __rte_noinline uint32_t
virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct rte_mbuf **pkts, uint32_t count)
{
uint32_t pkt_idx = 0;
uint16_t num_buffers;
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
uint16_t nr_vec = 0;
if (unlikely(reserve_avail_buf_split(dev, vq,
pkt_len, buf_vec, &num_buffers,
avail_head, &nr_vec) < 0)) {
VHOST_LOG_DATA(DEBUG,
"(%d) failed to get enough desc from vring\n",
dev->vid);
vq->shadow_used_idx -= num_buffers;
break;
}
VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
dev->vid, vq->last_avail_idx,
vq->last_avail_idx + num_buffers);
if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
buf_vec, nr_vec,
num_buffers) < 0) {
vq->shadow_used_idx -= num_buffers;
break;
}
vq->last_avail_idx += num_buffers;
}
do_data_copy_enqueue(dev, vq);
if (likely(vq->shadow_used_idx)) {
flush_shadow_used_ring_split(dev, vq);
vhost_vring_call_split(dev, vq);
}
return pkt_idx;
}
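/*
 * Check whether a full batch can take the packed-ring fast path: the
 * avail index must be batch aligned, every descriptor available, every
 * packet a single mbuf that fits its buffer, and every buffer contiguous
 * in host VA space. Callers fall back to the single-packet path on
 * failure.
 */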
static __rte_always_inline int
virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf **pkts,
uint64_t *desc_addrs,
uint64_t *lens)
{
bool wrap_counter = vq->avail_wrap_counter;
struct vring_packed_desc *descs = vq->desc_packed;
uint16_t avail_idx = vq->last_avail_idx;
uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
uint16_t i;
if (unlikely(avail_idx & PACKED_BATCH_MASK))
return -1;
if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
return -1;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
if (unlikely(pkts[i]->next != NULL))
return -1;
if (unlikely(!desc_is_avail(&descs[avail_idx + i],
wrap_counter)))
return -1;
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
lens[i] = descs[avail_idx + i].len;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
return -1;
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
desc_addrs[i] = vhost_iova_to_vva(dev, vq,
descs[avail_idx + i].addr,
&lens[i],
VHOST_ACCESS_RW);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
if (unlikely(!desc_addrs[i]))
return -1;
if (unlikely(lens[i] != descs[avail_idx + i].len))
return -1;
}
return 0;
}
static __rte_always_inline void
virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf **pkts,
uint64_t *desc_addrs,
uint64_t *lens)
{
uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
struct vring_packed_desc *descs = vq->desc_packed;
uint16_t avail_idx = vq->last_avail_idx;
uint16_t ids[PACKED_BATCH_SIZE];
uint16_t i;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
(uintptr_t)desc_addrs[i];
lens[i] = pkts[i]->pkt_len +
sizeof(struct virtio_net_hdr_mrg_rxbuf);
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
pkts[i]->pkt_len);
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
lens[i]);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
ids[i] = descs[avail_idx + i].id;
vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
}
static __rte_always_inline int
virtio_dev_rx_sync_batch_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf **pkts)
{
uint64_t desc_addrs[PACKED_BATCH_SIZE];
uint64_t lens[PACKED_BATCH_SIZE];
if (virtio_dev_rx_sync_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
return -1;
if (vq->shadow_used_idx) {
do_data_copy_enqueue(dev, vq);
vhost_flush_enqueue_shadow_packed(dev, vq);
}
virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
return 0;
}
static __rte_always_inline int16_t
virtio_dev_rx_single_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf *pkt)
{
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t nr_descs = 0;
if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
&nr_descs) < 0)) {
VHOST_LOG_DATA(DEBUG,
"(%d) failed to get enough desc from vring\n",
dev->vid);
return -1;
}
VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
dev->vid, vq->last_avail_idx,
vq->last_avail_idx + nr_descs);
vq_inc_last_avail_packed(vq, nr_descs);
return 0;
}
static __rte_noinline uint32_t
virtio_dev_rx_packed(struct virtio_net *dev,
struct vhost_virtqueue *__rte_restrict vq,
struct rte_mbuf **__rte_restrict pkts,
uint32_t count)
{
uint32_t pkt_idx = 0;
do {
rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
if (count - pkt_idx >= PACKED_BATCH_SIZE) {
if (!virtio_dev_rx_sync_batch_packed(dev, vq,
&pkts[pkt_idx])) {
pkt_idx += PACKED_BATCH_SIZE;
continue;
}
}
if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
break;
pkt_idx++;
} while (pkt_idx < count);
if (vq->shadow_used_idx) {
do_data_copy_enqueue(dev, vq);
vhost_flush_enqueue_shadow_packed(dev, vq);
}
if (pkt_idx)
vhost_vring_call_packed(dev, vq);
return pkt_idx;
}
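/*
 * Common enqueue entry: validate the queue index, take the vring access
 * lock (and the IOTLB read lock when an IOMMU is in use), translate the
 * ring addresses if needed, then dispatch to the packed or split path.
 */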
static __rte_always_inline uint32_t
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
struct vhost_virtqueue *vq;
uint32_t nb_tx = 0;
VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
rte_spinlock_lock(&vq->access_lock);
if (unlikely(!vq->enabled))
goto out_access_unlock;
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_lock(vq);
if (unlikely(!vq->access_ok))
if (unlikely(vring_translate(dev, vq) < 0))
goto out;
count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
if (count == 0)
goto out;
if (vq_is_packed(dev))
nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
else
nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
out:
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
out_access_unlock:
rte_spinlock_unlock(&vq->access_lock);
return nb_tx;
}
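/*
 * Public API: enqueue a burst of host mbufs into guest RX virtqueue
 * queue_id and return the number of packets accepted. Only usable with
 * the built-in virtio-net backend.
 */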
uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **__rte_restrict pkts, uint16_t count)
{
struct virtio_net *dev = get_device(vid);
if (!dev)
return 0;
if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
VHOST_LOG_DATA(ERR,
"(%d) %s: built-in vhost net backend is disabled.\n",
dev->vid, __func__);
return 0;
}
return virtio_dev_rx(dev, queue_id, pkts, count);
}
static __rte_always_inline uint16_t
virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
uint16_t vq_size, uint16_t n_inflight)
{
return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
(vq_size - n_inflight + pkts_idx) % vq_size;
}
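/*
 * Stash used-element records in the async descriptor ring, wrapping at
 * the ring end, so they can be written back to the guest once the DMA
 * transfers complete. Split and packed ring variants follow.
 */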
static __rte_always_inline void
store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
{
size_t elem_size = sizeof(struct vring_used_elem);
if (d_idx + count <= ring_size) {
rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
} else {
uint16_t size = ring_size - d_idx;
rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
}
}
static __rte_always_inline void
store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
struct vring_used_elem_packed *d_ring,
uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
{
size_t elem_size = sizeof(struct vring_used_elem_packed);
if (d_idx + count <= ring_size) {
rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
} else {
uint16_t size = ring_size - d_idx;
rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
}
}
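/*
 * Asynchronous enqueue for a split virtqueue: reserve buffers and build
 * per-packet iovec descriptors, submit them in batches through the
 * registered transfer_data() callback, and park the used-ring updates in
 * async_descs_split until the transfers are reported complete.
 */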
static __rte_noinline uint32_t
virtio_dev_rx_async_submit_split(struct virtio_net *dev,
struct vhost_virtqueue *vq, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint32_t pkt_idx = 0, pkt_burst_idx = 0;
uint16_t num_buffers;
uint16_t avail_head;
struct rte_vhost_iov_iter *it_pool = vq->it_pool;
struct iovec *vec_pool = vq->vec_pool;
struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
struct iovec *src_iovec = vec_pool;
struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
struct async_inflight_info *pkts_info = vq->async_pkts_info;
uint32_t n_pkts = 0, pkt_err = 0;
int32_t n_xfer;
uint16_t segs_await = 0;
uint16_t iovec_idx = 0, it_idx = 0, slot_idx = 0;
/*
* The ordering between avail index and desc reads needs to be enforced.
*/
avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
uint16_t nr_vec = 0;
if (unlikely(reserve_avail_buf_split(dev, vq,
pkt_len, buf_vec, &num_buffers,
avail_head, &nr_vec) < 0)) {
VHOST_LOG_DATA(DEBUG,
"(%d) failed to get enough desc from vring\n",
dev->vid);
vq->shadow_used_idx -= num_buffers;
break;
}
VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
dev->vid, vq->last_avail_idx,
vq->last_avail_idx + num_buffers);
if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
&it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
vq->shadow_used_idx -= num_buffers;
break;
}
async_fill_desc(&tdes[pkt_burst_idx++], &it_pool[it_idx],
&it_pool[it_idx + 1]);
slot_idx = (vq->async_pkts_idx + pkt_idx) & (vq->size - 1);
pkts_info[slot_idx].descs = num_buffers;
pkts_info[slot_idx].mbuf = pkts[pkt_idx];
iovec_idx += it_pool[it_idx].nr_segs;
segs_await += it_pool[it_idx].nr_segs;
it_idx += 2;
vq->last_avail_idx += num_buffers;
/*
 * Conditions to trigger an async device transfer:
 * - the number of buffered packets reaches the transfer threshold
 * - fewer than BUF_VECTOR_MAX unused async iov entries remain
 */
if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
BUF_VECTOR_MAX))) {
n_xfer = vq->async_ops.transfer_data(dev->vid,
queue_id, tdes, 0, pkt_burst_idx);
if (likely(n_xfer >= 0)) {
n_pkts = n_xfer;
} else {
VHOST_LOG_DATA(ERR,
"(%d) %s: failed to transfer data for queue id %d.\n",
dev->vid, __func__, queue_id);
n_pkts = 0;
}
iovec_idx = 0;
it_idx = 0;
segs_await = 0;
if (unlikely(n_pkts < pkt_burst_idx)) {
/*
 * Record the number of error packets here; the actual
 * error handling is done when the application polls
 * for completions.
 */
pkt_err = pkt_burst_idx - n_pkts;
pkt_idx++;
pkt_burst_idx = 0;
break;
}
pkt_burst_idx = 0;
}
}
if (pkt_burst_idx) {
n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
if (likely(n_xfer >= 0)) {
n_pkts = n_xfer;
} else {
VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
dev->vid, __func__, queue_id);
n_pkts = 0;
}
if (unlikely(n_pkts < pkt_burst_idx))
pkt_err = pkt_burst_idx - n_pkts;
}
if (unlikely(pkt_err)) {
uint16_t num_descs = 0;
/* update number of completed packets */
pkt_idx -= pkt_err;
/* calculate the sum of descriptors to revert */
while (pkt_err-- > 0) {
num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
slot_idx--;
}
/* recover shadow used ring and available ring */
vq->shadow_used_idx -= num_descs;
vq->last_avail_idx -= num_descs;
}
/* keep used descriptors */
if (likely(vq->shadow_used_idx)) {
uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
store_dma_desc_info_split(vq->shadow_used_split,
vq->async_descs_split, vq->size, 0, to,
vq->shadow_used_idx);
vq->async_desc_idx_split += vq->shadow_used_idx;
vq->async_pkts_idx += pkt_idx;
vq->async_pkts_inflight_n += pkt_idx;
vq->shadow_used_idx = 0;
}
return pkt_idx;
}
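/*
 * Write a batch of shadow used elements back to the packed ring. As in
 * the synchronous path, descriptor flags are stored after a release
 * fence, with the head descriptor's flags written last.
 */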
static __rte_always_inline void
vhost_update_used_packed(struct vhost_virtqueue *vq,
struct vring_used_elem_packed *shadow_ring,
uint16_t count)
{
int i;
uint16_t used_idx = vq->last_used_idx;
uint16_t head_idx = vq->last_used_idx;
uint16_t head_flags = 0;
if (count == 0)
return;
/* Split loop in two to save memory barriers */
for (i = 0; i < count; i++) {
vq->desc_packed[used_idx].id = shadow_ring[i].id;
vq->desc_packed[used_idx].len = shadow_ring[i].len;
used_idx += shadow_ring[i].count;
if (used_idx >= vq->size)
used_idx -= vq->size;
}
/* The ordering for storing desc flags needs to be enforced. */
rte_atomic_thread_fence(__ATOMIC_RELEASE);
for (i = 0; i < count; i++) {
uint16_t flags;
if (vq->shadow_used_packed[i].len)
flags = VRING_DESC_F_WRITE;
else
flags = 0;
if (vq->used_wrap_counter) {
flags |= VRING_DESC_F_USED;
flags |= VRING_DESC_F_AVAIL;
} else {
flags &= ~VRING_DESC_F_USED;
flags &= ~VRING_DESC_F_AVAIL;
}
if (i > 0) {
vq->desc_packed[vq->last_used_idx].flags = flags;
} else {
head_idx = vq->last_used_idx;
head_flags = flags;
}
vq_inc_last_used_packed(vq, shadow_ring[i].count);
}
vq->desc_packed[head_idx].flags = head_flags;
}
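/*
 * Map a single mbuf onto enough packed descriptors to hold it (possibly
 * several buffers when mergeable Rx buffers are negotiated), build the
 * source/destination iovecs for the async copy engine and shadow the
 * used entries for a later flush.
 */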
static __rte_always_inline int
vhost_enqueue_async_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf *pkt,
struct buf_vector *buf_vec,
uint16_t *nr_descs,
uint16_t *nr_buffers,
struct iovec *src_iovec, struct iovec *dst_iovec,
struct rte_vhost_iov_iter *src_it,
struct rte_vhost_iov_iter *dst_it)
{
uint16_t nr_vec = 0;
uint16_t avail_idx = vq->last_avail_idx;
uint16_t max_tries, tries = 0;
uint16_t buf_id = 0;
uint32_t len = 0;
uint16_t desc_count = 0;
uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
uint32_t buffer_len[vq->size];
uint16_t buffer_buf_id[vq->size];
uint16_t buffer_desc_count[vq->size];
if (rxvq_is_mergeable(dev))
max_tries = vq->size - 1;
else
max_tries = 1;
while (size > 0) {
/*
 * If we have tried all available ring items and still can't
 * get enough buffers, something abnormal has happened.
 */
if (unlikely(++tries > max_tries))
return -1;
if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
&buf_id, &len, VHOST_ACCESS_RW) < 0))
return -1;
len = RTE_MIN(len, size);
size -= len;
buffer_len[*nr_buffers] = len;
buffer_buf_id[*nr_buffers] = buf_id;
buffer_desc_count[*nr_buffers] = desc_count;
*nr_buffers += 1;
*nr_descs += desc_count;
avail_idx += desc_count;
if (avail_idx >= vq->size)
avail_idx -= vq->size;
}
if (unlikely(async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec,
*nr_buffers, src_iovec, dst_iovec,
src_it, dst_it) < 0))
return -1;
vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
return 0;
}
static __rte_always_inline int16_t
virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
struct iovec *src_iovec, struct iovec *dst_iovec,
struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
{
struct buf_vector buf_vec[BUF_VECTOR_MAX];
if (unlikely(vhost_enqueue_async_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
src_iovec, dst_iovec,
src_it, dst_it) < 0)) {
VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
return -1;
}
VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
return 0;
}
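/*
 * Roll back the state of packets whose async copy submission failed:
 * rewind the avail index by their descriptor count (handling ring
 * wrap-around) and drop their buffers from the shadow used ring.
 */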
static __rte_always_inline void
dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
uint32_t nr_err, uint32_t *pkt_idx)
{
uint16_t descs_err = 0;
uint16_t buffers_err = 0;
struct async_inflight_info *pkts_info = vq->async_pkts_info;
*pkt_idx -= nr_err;
/* calculate the sum of buffers and descs of DMA-error packets. */
while (nr_err-- > 0) {
descs_err += pkts_info[slot_idx % vq->size].descs;
buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
slot_idx--;
}
if (vq->last_avail_idx >= descs_err) {
vq->last_avail_idx -= descs_err;
} else {
vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
vq->avail_wrap_counter ^= 1;
}
vq->shadow_used_idx -= buffers_err;
}
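/*
 * Submit a burst of packets for asynchronous enqueue on a packed ring.
 * Packets are staged into iovec batches and handed to the async channel
 * once the batch threshold is reached or the iovec pool runs low; any
 * packets the channel does not accept are rolled back.
 */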
static __rte_noinline uint32_t
virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
uint32_t pkt_idx = 0, pkt_burst_idx = 0;
uint32_t remained = count;
int32_t n_xfer;
uint16_t num_buffers;
uint16_t num_descs;
struct rte_vhost_iov_iter *it_pool = vq->it_pool;
struct iovec *vec_pool = vq->vec_pool;
struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
struct iovec *src_iovec = vec_pool;
struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
struct async_inflight_info *pkts_info = vq->async_pkts_info;
uint32_t n_pkts = 0, pkt_err = 0;
uint16_t slot_idx = 0;
uint16_t segs_await = 0;
uint16_t iovec_idx = 0, it_idx = 0;
do {
rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
num_buffers = 0;
num_descs = 0;
if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
&num_descs, &num_buffers,
&src_iovec[iovec_idx], &dst_iovec[iovec_idx],
&it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
break;
slot_idx = (vq->async_pkts_idx + pkt_idx) % vq->size;
async_fill_desc(&tdes[pkt_burst_idx++], &it_pool[it_idx],
&it_pool[it_idx + 1]);
pkts_info[slot_idx].descs = num_descs;
pkts_info[slot_idx].nr_buffers = num_buffers;
pkts_info[slot_idx].mbuf = pkts[pkt_idx];
iovec_idx += it_pool[it_idx].nr_segs;
segs_await += it_pool[it_idx].nr_segs;
it_idx += 2;
pkt_idx++;
remained--;
vq_inc_last_avail_packed(vq, num_descs);
/*
 * Conditions to trigger the async device transfer:
 * - the number of buffered packets reaches the transfer threshold
 * - the number of unused async iovecs drops below the max vhost vector size
 */
if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
n_xfer = vq->async_ops.transfer_data(dev->vid,
queue_id, tdes, 0, pkt_burst_idx);
if (likely(n_xfer >= 0)) {
n_pkts = n_xfer;
} else {
VHOST_LOG_DATA(ERR,
"(%d) %s: failed to transfer data for queue id %d.\n",
dev->vid, __func__, queue_id);
n_pkts = 0;
}
iovec_idx = 0;
it_idx = 0;
segs_await = 0;
if (unlikely(n_pkts < pkt_burst_idx)) {
/*
 * Record the number of error packets here; the actual
 * error processing is done when the application polls
 * for completions.
 */
pkt_err = pkt_burst_idx - n_pkts;
pkt_burst_idx = 0;
break;
}
pkt_burst_idx = 0;
}
} while (pkt_idx < count);
if (pkt_burst_idx) {
n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
if (likely(n_xfer >= 0)) {
n_pkts = n_xfer;
} else {
VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
dev->vid, __func__, queue_id);
n_pkts = 0;
}
if (unlikely(n_pkts < pkt_burst_idx))
pkt_err = pkt_burst_idx - n_pkts;
}
if (unlikely(pkt_err))
dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
if (likely(vq->shadow_used_idx)) {
/* keep used descriptors. */
store_dma_desc_info_packed(vq->shadow_used_packed, vq->async_buffers_packed,
vq->size, 0, vq->async_buffer_idx_packed,
vq->shadow_used_idx);
vq->async_buffer_idx_packed += vq->shadow_used_idx;
if (vq->async_buffer_idx_packed >= vq->size)
vq->async_buffer_idx_packed -= vq->size;
vq->async_pkts_idx += pkt_idx;
if (vq->async_pkts_idx >= vq->size)
vq->async_pkts_idx -= vq->size;
vq->shadow_used_idx = 0;
vq->async_pkts_inflight_n += pkt_idx;
}
return pkt_idx;
}
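/*
 * Copy the used elements of completed async transfers from the async
 * shadow array into the split used ring, handling wrap-around in both
 * rings.
 */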
static __rte_always_inline void
write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
{
uint16_t nr_left = n_descs;
uint16_t nr_copy;
uint16_t to, from;
do {
from = vq->last_async_desc_idx_split & (vq->size - 1);
nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
to = vq->last_used_idx & (vq->size - 1);
if (to + nr_copy <= vq->size) {
rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
nr_copy * sizeof(struct vring_used_elem));
} else {
uint16_t size = vq->size - to;
rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
size * sizeof(struct vring_used_elem));
rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
(nr_copy - size) * sizeof(struct vring_used_elem));
}
vq->last_async_desc_idx_split += nr_copy;
vq->last_used_idx += nr_copy;
nr_left -= nr_copy;
} while (nr_left > 0);
}
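/*
 * Write back completed async buffers to the packed used ring, splitting
 * the update where the async buffer array wraps around.
 */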
static __rte_always_inline void
write_back_completed_descs_packed(struct vhost_virtqueue *vq,
uint16_t n_buffers)
{
uint16_t nr_left = n_buffers;
uint16_t from, to;
do {
from = vq->last_async_buffer_idx_packed;
to = (from + nr_left) % vq->size;
if (to > from) {
vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
vq->last_async_buffer_idx_packed += nr_left;
nr_left = 0;
} else {
vhost_update_used_packed(vq, vq->async_buffers_packed + from,
vq->size - from);
vq->last_async_buffer_idx_packed = 0;
nr_left -= vq->size - from;
}
} while (nr_left > 0);
}
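/*
 * Poll the async channel for completed enqueue copies, return the
 * corresponding mbufs to the caller and, when the virtqueue is
 * accessible, write back the used descriptors and notify the guest.
 */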
static __rte_always_inline uint16_t
vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
struct vhost_virtqueue *vq;
struct async_inflight_info *pkts_info;
int32_t n_cpl;
uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
uint16_t start_idx, pkts_idx, vq_size;
uint16_t from, i;
vq = dev->virtqueue[queue_id];
pkts_idx = vq->async_pkts_idx % vq->size;
pkts_info = vq->async_pkts_info;
vq_size = vq->size;
start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
vq_size, vq->async_pkts_inflight_n);
if (count > vq->async_last_pkts_n) {
n_cpl = vq->async_ops.check_completed_copies(dev->vid,
queue_id, 0, count - vq->async_last_pkts_n);
if (likely(n_cpl >= 0)) {
n_pkts_cpl = n_cpl;
} else {
VHOST_LOG_DATA(ERR,
"(%d) %s: failed to check completed copies for queue id %d.\n",
dev->vid, __func__, queue_id);
n_pkts_cpl = 0;
}
}
n_pkts_cpl += vq->async_last_pkts_n;
n_pkts_put = RTE_MIN(n_pkts_cpl, count);
if (unlikely(n_pkts_put == 0)) {
vq->async_last_pkts_n = n_pkts_cpl;
return 0;
}
if (vq_is_packed(dev)) {
for (i = 0; i < n_pkts_put; i++) {
from = (start_idx + i) % vq_size;
n_buffers += pkts_info[from].nr_buffers;
pkts[i] = pkts_info[from].mbuf;
}
} else {
for (i = 0; i < n_pkts_put; i++) {
from = (start_idx + i) & (vq_size - 1);
n_descs += pkts_info[from].descs;
pkts[i] = pkts_info[from].mbuf;
}
}
vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
vq->async_pkts_inflight_n -= n_pkts_put;
if (likely(vq->enabled && vq->access_ok)) {
if (vq_is_packed(dev)) {
write_back_completed_descs_packed(vq, n_buffers);
vhost_vring_call_packed(dev, vq);
} else {
write_back_completed_descs_split(vq, n_descs);
__atomic_add_fetch(&vq->used->idx, n_descs,
__ATOMIC_RELEASE);
vhost_vring_call_split(dev, vq);
}
} else {
if (vq_is_packed(dev)) {
vq->last_async_buffer_idx_packed += n_buffers;
if (vq->last_async_buffer_idx_packed >= vq->size)
vq->last_async_buffer_idx_packed -= vq->size;
} else {
vq->last_async_desc_idx_split += n_descs;
}
}
return n_pkts_put;
}
uint16_t
rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
struct virtio_net *dev = get_device(vid);
struct vhost_virtqueue *vq;
uint16_t n_pkts_cpl = 0;
if (unlikely(!dev))
return 0;
VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
if (unlikely(!vq->async_registered)) {
VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
rte_spinlock_lock(&vq->access_lock);
n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
rte_spinlock_unlock(&vq->access_lock);
return n_pkts_cpl;
}
uint16_t
rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
struct virtio_net *dev = get_device(vid);
struct vhost_virtqueue *vq;
uint16_t n_pkts_cpl = 0;
if (!dev)
return 0;
VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
if (unlikely(!vq->async_registered)) {
VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
return n_pkts_cpl;
}
static __rte_always_inline uint32_t
virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
struct vhost_virtqueue *vq;
uint32_t nb_tx = 0;
VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
rte_spinlock_lock(&vq->access_lock);
if (unlikely(!vq->enabled || !vq->async_registered))
goto out_access_unlock;
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_lock(vq);
if (unlikely(!vq->access_ok))
if (unlikely(vring_translate(dev, vq) < 0))
goto out;
count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
if (count == 0)
goto out;
if (vq_is_packed(dev))
nb_tx = virtio_dev_rx_async_submit_packed(dev, vq, queue_id,
pkts, count);
else
nb_tx = virtio_dev_rx_async_submit_split(dev, vq, queue_id,
pkts, count);
out:
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
out_access_unlock:
rte_spinlock_unlock(&vq->access_lock);
return nb_tx;
}
uint16_t
rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
struct virtio_net *dev = get_device(vid);
if (!dev)
return 0;
if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
VHOST_LOG_DATA(ERR,
"(%d) %s: built-in vhost net backend is disabled.\n",
dev->vid, __func__);
return 0;
}
return virtio_dev_rx_async_submit(dev, queue_id, pkts, count);
}
static inline bool
virtio_net_with_host_offload(struct virtio_net *dev)
{
if (dev->features &
((1ULL << VIRTIO_NET_F_CSUM) |
(1ULL << VIRTIO_NET_F_HOST_ECN) |
(1ULL << VIRTIO_NET_F_HOST_TSO4) |
(1ULL << VIRTIO_NET_F_HOST_TSO6) |
(1ULL << VIRTIO_NET_F_HOST_UFO)))
return true;
return false;
}
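/*
 * Parse the Ethernet, L3 and L4 headers of a packet coming from the
 * guest and validate that each header fits in the first mbuf segment.
 * On success, m->l2_len/l3_len and the L3 Tx offload flag are set and
 * *l4_proto holds the L4 protocol; on failure, the offload fields are
 * cleared and -EINVAL is returned.
 */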
static int
parse_headers(struct rte_mbuf *m, uint8_t *l4_proto)
{
struct rte_ipv4_hdr *ipv4_hdr;
struct rte_ipv6_hdr *ipv6_hdr;
struct rte_ether_hdr *eth_hdr;
uint16_t ethertype;
uint16_t data_len = rte_pktmbuf_data_len(m);
if (data_len < sizeof(struct rte_ether_hdr))
return -EINVAL;
eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
m->l2_len = sizeof(struct rte_ether_hdr);
ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
if (ethertype == RTE_ETHER_TYPE_VLAN) {
if (data_len < sizeof(struct rte_ether_hdr) +
sizeof(struct rte_vlan_hdr))
goto error;
struct rte_vlan_hdr *vlan_hdr =
(struct rte_vlan_hdr *)(eth_hdr + 1);
m->l2_len += sizeof(struct rte_vlan_hdr);
ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
}
switch (ethertype) {
case RTE_ETHER_TYPE_IPV4:
if (data_len < m->l2_len + sizeof(struct rte_ipv4_hdr))
goto error;
ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
m->l2_len);
m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
if (data_len < m->l2_len + m->l3_len)
goto error;
m->ol_flags |= RTE_MBUF_F_TX_IPV4;
*l4_proto = ipv4_hdr->next_proto_id;
break;
case RTE_ETHER_TYPE_IPV6:
if (data_len < m->l2_len + sizeof(struct rte_ipv6_hdr))
goto error;
ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
m->l2_len);
m->l3_len = sizeof(struct rte_ipv6_hdr);
m->ol_flags |= RTE_MBUF_F_TX_IPV6;
*l4_proto = ipv6_hdr->proto;
break;
default:
/* a valid L3 header is needed for further L4 parsing */
goto error;
}
/* both CSUM and GSO need a valid L4 header */
switch (*l4_proto) {
case IPPROTO_TCP:
if (data_len < m->l2_len + m->l3_len +
sizeof(struct rte_tcp_hdr))
goto error;
break;
case IPPROTO_UDP:
if (data_len < m->l2_len + m->l3_len +
sizeof(struct rte_udp_hdr))
goto error;
break;
case IPPROTO_SCTP:
if (data_len < m->l2_len + m->l3_len +
sizeof(struct rte_sctp_hdr))
goto error;
break;
default:
goto error;
}
return 0;
error:
m->l2_len = 0;
m->l3_len = 0;
m->ol_flags = 0;
return -EINVAL;
}
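/*
 * Legacy offload path: translate the virtio-net header of a packet
 * dequeued from the guest into Tx offload flags (checksum and GSO),
 * as applications historically expected from rte_vhost_dequeue_burst().
 */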
static __rte_always_inline void
vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
uint8_t l4_proto = 0;
struct rte_tcp_hdr *tcp_hdr = NULL;
uint16_t tcp_len;
uint16_t data_len = rte_pktmbuf_data_len(m);
if (parse_headers(m, &l4_proto) < 0)
return;
if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
if (hdr->csum_start == (m->l2_len + m->l3_len)) {
switch (hdr->csum_offset) {
case (offsetof(struct rte_tcp_hdr, cksum)):
if (l4_proto != IPPROTO_TCP)
goto error;
m->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
break;
case (offsetof(struct rte_udp_hdr, dgram_cksum)):
if (l4_proto != IPPROTO_UDP)
goto error;
m->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
break;
case (offsetof(struct rte_sctp_hdr, cksum)):
if (l4_proto != IPPROTO_SCTP)
goto error;
m->ol_flags |= RTE_MBUF_F_TX_SCTP_CKSUM;
break;
default:
goto error;
}
} else {
goto error;
}
}
if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
case VIRTIO_NET_HDR_GSO_TCPV6:
if (l4_proto != IPPROTO_TCP)
goto error;
tcp_hdr = rte_pktmbuf_mtod_offset(m,
struct rte_tcp_hdr *,
m->l2_len + m->l3_len);
tcp_len = (tcp_hdr->data_off & 0xf0) >> 2;
if (data_len < m->l2_len + m->l3_len + tcp_len)
goto error;
m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
m->tso_segsz = hdr->gso_size;
m->l4_len = tcp_len;
break;
case VIRTIO_NET_HDR_GSO_UDP:
if (l4_proto != IPPROTO_UDP)
goto error;
m->ol_flags |= RTE_MBUF_F_TX_UDP_SEG;
m->tso_segsz = hdr->gso_size;
m->l4_len = sizeof(struct rte_udp_hdr);
break;
default:
VHOST_LOG_DATA(WARNING,
"unsupported gso type %u.\n", hdr->gso_type);
goto error;
}
}
return;
error:
m->l2_len = 0;
m->l3_len = 0;
m->ol_flags = 0;
}
static __rte_always_inline void
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
bool legacy_ol_flags)
{
struct rte_net_hdr_lens hdr_lens;
int l4_supported = 0;
uint32_t ptype;
if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
return;
if (legacy_ol_flags) {
vhost_dequeue_offload_legacy(hdr, m);
return;
}
m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN;
ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
m->packet_type = ptype;
if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
(ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
(ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
l4_supported = 1;
/* According to the Virtio 1.1 spec, the device only needs to look at
 * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
 * This differs from the incoming packet processing path, where the
 * driver can rely on the VIRTIO_NET_HDR_F_DATA_VALID flag set by the
 * device.
*
* 5.1.6.2.1 Driver Requirements: Packet Transmission
* The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
* VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
*
* 5.1.6.2.2 Device Requirements: Packet Transmission
* The device MUST ignore flag bits that it does not recognize.
*/
if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
uint32_t hdrlen;
hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
if (hdr->csum_start <= hdrlen && l4_supported != 0) {
m->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_NONE;
} else {
/* Unknown proto or tunnel, do sw cksum. We can assume
 * the cksum field is in the first segment since the
 * buffers we provided to the host are large enough.
 * In the case of SCTP this will be wrong since it is a
 * CRC, but there is nothing we can do about it.
 */
uint16_t csum = 0, off;
if (rte_raw_cksum_mbuf(m, hdr->csum_start,
rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
return;
if (likely(csum != 0xffff))
csum = ~csum;
off = hdr->csum_offset + hdr->csum_start;
if (rte_pktmbuf_data_len(m) >= off + 1)
*rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
}
}
if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
if (hdr->gso_size == 0)
return;
switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
case VIRTIO_NET_HDR_GSO_TCPV6:
if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
break;
m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
m->tso_segsz = hdr->gso_size;
break;
case VIRTIO_NET_HDR_GSO_UDP:
if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
break;
m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
m->tso_segsz = hdr->gso_size;
break;
default:
break;
}
}
}
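/*
 * Gather a virtio-net header that is scattered across several
 * descriptor buffers into the caller-provided structure.
 */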
static __rte_noinline void
copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
struct buf_vector *buf_vec)
{
uint64_t len;
uint64_t remain = sizeof(struct virtio_net_hdr);
uint64_t src;
uint64_t dst = (uint64_t)(uintptr_t)hdr;
while (remain) {
len = RTE_MIN(remain, buf_vec->buf_len);
src = buf_vec->buf_addr;
rte_memcpy((void *)(uintptr_t)dst,
(void *)(uintptr_t)src, len);
remain -= len;
dst += len;
buf_vec++;
}
}
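/*
 * Copy a descriptor chain from the guest into an mbuf chain, allocating
 * additional mbufs from mbuf_pool when the current one fills up, and
 * apply the offload information carried in the virtio-net header.
 * Small copies may be deferred to the batch copy array; larger ones
 * are performed immediately.
 */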
static __rte_always_inline int
copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct buf_vector *buf_vec, uint16_t nr_vec,
struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
bool legacy_ol_flags)
{
uint32_t buf_avail, buf_offset;
uint64_t buf_addr, buf_len;
uint32_t mbuf_avail, mbuf_offset;
uint32_t cpy_len;
struct rte_mbuf *cur = m, *prev = m;
struct virtio_net_hdr tmp_hdr;
struct virtio_net_hdr *hdr = NULL;
/* A counter to avoid a dead loop on a malformed desc chain */
uint16_t vec_idx = 0;
struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
int error = 0;
buf_addr = buf_vec[vec_idx].buf_addr;
buf_len = buf_vec[vec_idx].buf_len;
if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
error = -1;
goto out;
}
if (virtio_net_with_host_offload(dev)) {
if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
/*
* No luck, the virtio-net header doesn't fit
* in a contiguous virtual area.
*/
copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
hdr = &tmp_hdr;
} else {
hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
}
}
/*
 * A virtio driver normally uses at least 2 desc buffers
 * for Tx: the first for storing the header, and the others
 * for storing the data.
 */
if (unlikely(buf_len < dev->vhost_hlen)) {
buf_offset = dev->vhost_hlen - buf_len;
vec_idx++;
buf_addr = buf_vec[vec_idx].buf_addr;
buf_len = buf_vec[vec_idx].buf_len;
buf_avail = buf_len - buf_offset;
} else if (buf_len == dev->vhost_hlen) {
if (unlikely(++vec_idx >= nr_vec))
goto out;
buf_addr = buf_vec[vec_idx].buf_addr;
buf_len = buf_vec[vec_idx].buf_len;
buf_offset = 0;
buf_avail = buf_len;
} else {
buf_offset = dev->vhost_hlen;
buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
}
PRINT_PACKET(dev,
(uintptr_t)(buf_addr + buf_offset),
(uint32_t)buf_avail, 0);
mbuf_offset = 0;
mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
while (1) {
cpy_len = RTE_MIN(buf_avail, mbuf_avail);
if (likely(cpy_len > MAX_BATCH_LEN ||
vq->batch_copy_nb_elems >= vq->size ||
(hdr && cur == m))) {
rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
mbuf_offset),
(void *)((uintptr_t)(buf_addr +
buf_offset)), cpy_len);
} else {
batch_copy[vq->batch_copy_nb_elems].dst =
rte_pktmbuf_mtod_offset(cur, void *,
mbuf_offset);
batch_copy[vq->batch_copy_nb_elems].src =
(void *)((uintptr_t)(buf_addr + buf_offset));
batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
vq->batch_copy_nb_elems++;
}
mbuf_avail -= cpy_len;
mbuf_offset += cpy_len;
buf_avail -= cpy_len;
buf_offset += cpy_len;
/* This buf has reached its end, get the next one */
if (buf_avail == 0) {
if (++vec_idx >= nr_vec)
break;
buf_addr = buf_vec[vec_idx].buf_addr;
buf_len = buf_vec[vec_idx].buf_len;
buf_offset = 0;
buf_avail = buf_len;
PRINT_PACKET(dev, (uintptr_t)buf_addr,
(uint32_t)buf_avail, 0);
}
/*
 * This mbuf has reached its end, allocate a new one
 * to hold more data.
 */
if (mbuf_avail == 0) {
cur = rte_pktmbuf_alloc(mbuf_pool);
if (unlikely(cur == NULL)) {
VHOST_LOG_DATA(ERR, "Failed to "
"allocate memory for mbuf.\n");
error = -1;
goto out;
}
prev->next = cur;
prev->data_len = mbuf_offset;
m->nb_segs += 1;
m->pkt_len += mbuf_offset;
prev = cur;
mbuf_offset = 0;
mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
}
}
prev->data_len = mbuf_offset;
m->pkt_len += mbuf_offset;
if (hdr)
vhost_dequeue_offload(hdr, m, legacy_ol_flags);
out:
return error;
}
static void
virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
{
rte_free(opaque);
}
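/*
 * Allocate an external data buffer large enough for "size" bytes plus
 * headroom and shared info, and attach it to the pktmbuf.
 */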
static int
virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
{
struct rte_mbuf_ext_shared_info *shinfo = NULL;
uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
uint16_t buf_len;
rte_iova_t iova;
void *buf;
total_len += sizeof(*shinfo) + sizeof(uintptr_t);
total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
if (unlikely(total_len > UINT16_MAX))
return -ENOSPC;
buf_len = total_len;
buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
if (unlikely(buf == NULL))
return -ENOMEM;
/* Initialize shinfo */
shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
virtio_dev_extbuf_free, buf);
if (unlikely(shinfo == NULL)) {
rte_free(buf);
VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
return -1;
}
iova = rte_malloc_virt2iova(buf);
rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
rte_pktmbuf_reset_headroom(pkt);
return 0;
}
/*
 * Prepare a host-supported pktmbuf.
 */
static __rte_always_inline int
virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
uint32_t data_len)
{
if (rte_pktmbuf_tailroom(pkt) >= data_len)
return 0;
/* attach an external buffer if supported */
if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
return 0;
/* check if chained buffers are allowed */
if (!dev->linearbuf)
return 0;
return -1;
}
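/*
 * Dequeue up to "count" packets from a split virtqueue: bulk-allocate
 * the mbufs, copy each descriptor chain into them, then flush the
 * shadow used ring and notify the guest when anything was consumed.
 * Returns the number of packets actually dequeued.
 */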
__rte_always_inline
static uint16_t
virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
bool legacy_ol_flags)
{
uint16_t i;
uint16_t free_entries;
uint16_t dropped = 0;
static bool allocerr_warned;
/*
* The ordering between avail index and
* desc reads needs to be enforced.
*/
free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
vq->last_avail_idx;
if (free_entries == 0)
return 0;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
count = RTE_MIN(count, MAX_PKT_BURST);
count = RTE_MIN(count, free_entries);
VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
dev->vid, count);
if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
return 0;
for (i = 0; i < count; i++) {
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t head_idx;
uint32_t buf_len;
uint16_t nr_vec = 0;
int err;
if (unlikely(fill_vec_buf_split(dev, vq,
vq->last_avail_idx + i,
&nr_vec, buf_vec,
&head_idx, &buf_len,
VHOST_ACCESS_RO) < 0))
break;
update_shadow_used_ring_split(vq, head_idx, 0);
err = virtio_dev_pktmbuf_prep(dev, pkts[i], buf_len);
if (unlikely(err)) {
/*
 * Mbuf allocation fails for jumbo packets when external
 * buffer allocation is not allowed and a linear buffer
 * is required. Drop this packet.
 */
if (!allocerr_warned) {
VHOST_LOG_DATA(ERR,
"Failed mbuf alloc of size %d from %s on %s.\n",
buf_len, mbuf_pool->name, dev->ifname);
allocerr_warned = true;
}
dropped += 1;
i++;
break;
}
err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
mbuf_pool, legacy_ol_flags);
if (unlikely(err)) {
if (!allocerr_warned) {
VHOST_LOG_DATA(ERR,
"Failed to copy desc to mbuf on %s.\n",
dev->ifname);
allocerr_warned = true;
}
dropped += 1;
i++;
break;
}
}
if (dropped)
rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
vq->last_avail_idx += i;
do_data_copy_dequeue(vq);
if (unlikely(i < count))
vq->shadow_used_idx = i;
if (likely(vq->shadow_used_idx)) {
flush_shadow_used_ring_split(dev, vq);
vhost_vring_call_split(dev, vq);
}
return (i - dropped);
}
__rte_noinline
static uint16_t
virtio_dev_tx_split_legacy(struct virtio_net *dev,
struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
struct rte_mbuf **pkts, uint16_t count)
{
return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
}
__rte_noinline
static uint16_t
virtio_dev_tx_split_compliant(struct virtio_net *dev,
struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
struct rte_mbuf **pkts, uint16_t count)
{
return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
}
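/*
 * Check whether a full batch of packed descriptors is ready for the
 * fast dequeue path: the avail index must be batch-aligned, every
 * descriptor must be a single available buffer that can be translated,
 * and each target mbuf must be able to hold its payload. On success,
 * the translated buffer addresses and descriptor ids are returned.
 */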
static __rte_always_inline int
vhost_reserve_avail_batch_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf **pkts,
uint16_t avail_idx,
uintptr_t *desc_addrs,
uint16_t *ids)
{
bool wrap = vq->avail_wrap_counter;
struct vring_packed_desc *descs = vq->desc_packed;
uint64_t lens[PACKED_BATCH_SIZE];
uint64_t buf_lens[PACKED_BATCH_SIZE];
uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
uint16_t flags, i;
if (unlikely(avail_idx & PACKED_BATCH_MASK))
return -1;
if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
return -1;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
flags = descs[avail_idx + i].flags;
if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
(wrap == !!(flags & VRING_DESC_F_USED)) ||
(flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
return -1;
}
rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
lens[i] = descs[avail_idx + i].len;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
desc_addrs[i] = vhost_iova_to_vva(dev, vq,
descs[avail_idx + i].addr,
&lens[i], VHOST_ACCESS_RW);
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
if (unlikely(!desc_addrs[i]))
return -1;
if (unlikely((lens[i] != descs[avail_idx + i].len)))
return -1;
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
goto err;
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
goto err;
}
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
pkts[i]->pkt_len = lens[i] - buf_offset;
pkts[i]->data_len = pkts[i]->pkt_len;
ids[i] = descs[avail_idx + i].id;
}
return 0;
err:
return -1;
}
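/*
 * Fast path: dequeue PACKED_BATCH_SIZE packets at once, copying each
 * payload into its mbuf, applying offloads when negotiated, and
 * shadowing the used entries for a later flush.
 */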
static __rte_always_inline int
virtio_dev_tx_batch_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf **pkts,
bool legacy_ol_flags)
{
uint16_t avail_idx = vq->last_avail_idx;
uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
struct virtio_net_hdr *hdr;
uintptr_t desc_addrs[PACKED_BATCH_SIZE];
uint16_t ids[PACKED_BATCH_SIZE];
uint16_t i;
if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
desc_addrs, ids))
return -1;
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
(void *)(uintptr_t)(desc_addrs[i] + buf_offset),
pkts[i]->pkt_len);
if (virtio_net_with_host_offload(dev)) {
vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
vhost_dequeue_offload(hdr, pkts[i], legacy_ol_flags);
}
}
if (virtio_net_is_inorder(dev))
vhost_shadow_dequeue_batch_packed_inorder(vq,
ids[PACKED_BATCH_SIZE - 1]);
else
vhost_shadow_dequeue_batch_packed(dev, vq, ids);
vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
return 0;
}
static __rte_always_inline int
vhost_dequeue_single_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mempool *mbuf_pool,
struct rte_mbuf *pkts,
uint16_t *buf_id,
uint16_t *desc_count,
bool legacy_ol_flags)
{
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint32_t buf_len;
uint16_t nr_vec = 0;
int err;
static bool allocerr_warned;
if (unlikely(fill_vec_buf_packed(dev, vq,
vq->last_avail_idx, desc_count,
buf_vec, &nr_vec,
buf_id, &buf_len,
VHOST_ACCESS_RO) < 0))
return -1;
if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
if (!allocerr_warned) {
VHOST_LOG_DATA(ERR,
"Failed mbuf alloc of size %d from %s on %s.\n",
buf_len, mbuf_pool->name, dev->ifname);
allocerr_warned = true;
}
return -1;
}
err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
mbuf_pool, legacy_ol_flags);
if (unlikely(err)) {
if (!allocerr_warned) {
VHOST_LOG_DATA(ERR,
"Failed to copy desc to mbuf on %s.\n",
dev->ifname);
allocerr_warned = true;
}
return -1;
}
return 0;
}
static __rte_always_inline int
virtio_dev_tx_single_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mempool *mbuf_pool,
struct rte_mbuf *pkts,
bool legacy_ol_flags)
{
uint16_t buf_id, desc_count = 0;
int ret;
ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
&desc_count, legacy_ol_flags);
if (likely(desc_count > 0)) {
if (virtio_net_is_inorder(dev))
vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
desc_count);
else
vhost_shadow_dequeue_single_packed(vq, buf_id,
desc_count);
vq_inc_last_avail_packed(vq, desc_count);
}
return ret;
}
__rte_always_inline
static uint16_t
virtio_dev_tx_packed(struct virtio_net *dev,
struct vhost_virtqueue *__rte_restrict vq,
struct rte_mempool *mbuf_pool,
struct rte_mbuf **__rte_restrict pkts,
uint32_t count,
bool legacy_ol_flags)
{
uint32_t pkt_idx = 0;
if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
return 0;
do {
rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
if (count - pkt_idx >= PACKED_BATCH_SIZE) {
if (!virtio_dev_tx_batch_packed(dev, vq,
&pkts[pkt_idx],
legacy_ol_flags)) {
pkt_idx += PACKED_BATCH_SIZE;
continue;
}
}
if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
pkts[pkt_idx],
legacy_ol_flags))
break;
pkt_idx++;
} while (pkt_idx < count);
if (pkt_idx != count)
rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
if (vq->shadow_used_idx) {
do_data_copy_dequeue(vq);
vhost_flush_dequeue_shadow_packed(dev, vq);
vhost_vring_call_packed(dev, vq);
}
return pkt_idx;
}
__rte_noinline
static uint16_t
virtio_dev_tx_packed_legacy(struct virtio_net *dev,
struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
struct rte_mbuf **__rte_restrict pkts, uint32_t count)
{
return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
}
__rte_noinline
static uint16_t
virtio_dev_tx_packed_compliant(struct virtio_net *dev,
struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
struct rte_mbuf **__rte_restrict pkts, uint32_t count)
{
return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
}
uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
struct virtio_net *dev;
struct rte_mbuf *rarp_mbuf = NULL;
struct vhost_virtqueue *vq;
int16_t success = 1;
dev = get_device(vid);
if (!dev)
return 0;
if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
VHOST_LOG_DATA(ERR,
"(%d) %s: built-in vhost net backend is disabled.\n",
dev->vid, __func__);
return 0;
}
if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
VHOST_LOG_DATA(ERR,
"(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
return 0;
if (unlikely(!vq->enabled)) {
count = 0;
goto out_access_unlock;
}
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_lock(vq);
if (unlikely(!vq->access_ok))
if (unlikely(vring_translate(dev, vq) < 0)) {
count = 0;
goto out;
}
/*
 * Construct a RARP broadcast packet and inject it into the "pkts"
 * array, so that it looks like the guest actually sent such a packet.
 *
 * Check user_send_rarp() for more information.
 *
 * broadcast_rarp shares a cacheline in the virtio_net structure
 * with some fields that are accessed during enqueue, and
 * __atomic_compare_exchange_n causes a write if it performs the
 * compare and exchange. This could result in false sharing between
 * enqueue and dequeue.
 *
 * Prevent unnecessary false sharing by reading broadcast_rarp first
 * and only performing compare and exchange if the read indicates it
 * is likely to be set.
 */
if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
__atomic_compare_exchange_n(&dev->broadcast_rarp,
&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
if (rarp_mbuf == NULL) {
VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
count = 0;
goto out;
}
/*
 * Inject it at the head of the "pkts" array, so that the switch's
 * MAC learning table gets updated first.
 */
pkts[0] = rarp_mbuf;
pkts++;
count -= 1;
}
if (vq_is_packed(dev)) {
if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
else
count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
} else {
if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
else
count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
}
out:
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
out_access_unlock:
rte_spinlock_unlock(&vq->access_lock);
if (unlikely(rarp_mbuf != NULL))
count += 1;
return count;
}