freebsd-skq/sys/dev/netmap/netmap_offloadings.c
Luigi Rizzo 37e3a6d349 Import the current version of netmap, aligned with the one on github.
This commit, long overdue, contains contributions in the last 2 years
from Stefano Garzarella, Giuseppe Lettieri, Vincenzo Maffione, including:
+ fixes on monitor ports
+ the 'ptnet' virtual device driver, and ptnetmap backend, for
  high speed virtual passthrough on VMs (bhyve fixes in an upcoming commit)
+ improved emulated netmap mode
+ more robust error handling
+ removal of stale code
+ various fixes to code and documentation (some mixup between RX and TX
  parameters, and private and public variables)

We also include an additional tool, nmreplay, which is functionally
equivalent to tcpreplay but operating on netmap ports.
2016-10-16 14:13:32 +00:00

491 lines
14 KiB
C

/*
* Copyright (C) 2014-2015 Vincenzo Maffione
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h> /* defines used in kernel.h */
#include <sys/kernel.h> /* types used in module initialization */
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/socketvar.h> /* struct socket */
#include <sys/socket.h> /* sockaddrs */
#include <net/if.h>
#include <net/if_var.h>
#include <machine/bus.h> /* bus_dmamap_* */
#include <sys/endian.h>
#elif defined(linux)
#include "bsd_glue.h"
#elif defined(__APPLE__)
#warning OSX support is only partial
#include "osx_glue.h"
#else
#error Unsupported platform
#endif /* unsupported */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
/* This routine is called by bdg_mismatch_datapath() when it finishes
* accumulating bytes for a segment, in order to fix some fields in the
* segment headers (which still contain the same content as the header
* of the original GSO packet). 'pkt' points to the beginning of the IP
* header of the segment, while 'len' is the length of the IP packet.
*/
static void
gso_fix_segment(uint8_t *pkt, size_t len, u_int ipv4, u_int iphlen, u_int tcp,
u_int idx, u_int segmented_bytes, u_int last_segment)
{
struct nm_iphdr *iph = (struct nm_iphdr *)(pkt);
struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(pkt);
uint16_t *check = NULL;
uint8_t *check_data = NULL;
if (ipv4) {
/* Set the IPv4 "Total Length" field. */
iph->tot_len = htobe16(len);
ND("ip total length %u", be16toh(ip->tot_len));
/* Set the IPv4 "Identification" field. */
iph->id = htobe16(be16toh(iph->id) + idx);
ND("ip identification %u", be16toh(iph->id));
/* Compute and insert the IPv4 header checksum. */
iph->check = 0;
iph->check = nm_os_csum_ipv4(iph);
ND("IP csum %x", be16toh(iph->check));
} else {
/* Set the IPv6 "Payload Len" field. */
ip6h->payload_len = htobe16(len-iphlen);
}
if (tcp) {
struct nm_tcphdr *tcph = (struct nm_tcphdr *)(pkt + iphlen);
/* Set the TCP sequence number. */
tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes);
ND("tcp seq %u", be32toh(tcph->seq));
/* Zero the PSH and FIN TCP flags if this is not the last
segment. */
if (!last_segment)
tcph->flags &= ~(0x8 | 0x1);
ND("last_segment %u", last_segment);
check = &tcph->check;
check_data = (uint8_t *)tcph;
} else { /* UDP */
struct nm_udphdr *udph = (struct nm_udphdr *)(pkt + iphlen);
/* Set the UDP 'Length' field. */
udph->len = htobe16(len-iphlen);
check = &udph->check;
check_data = (uint8_t *)udph;
}
/* Compute and insert TCP/UDP checksum. */
*check = 0;
if (ipv4)
nm_os_csum_tcpudp_ipv4(iph, check_data, len-iphlen, check);
else
nm_os_csum_tcpudp_ipv6(ip6h, check_data, len-iphlen, check);
ND("TCP/UDP csum %x", be16toh(*check));
}
static int
vnet_hdr_is_bad(struct nm_vnet_hdr *vh)
{
uint8_t gso_type = vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
return (
(gso_type != VIRTIO_NET_HDR_GSO_NONE &&
gso_type != VIRTIO_NET_HDR_GSO_TCPV4 &&
gso_type != VIRTIO_NET_HDR_GSO_UDP &&
gso_type != VIRTIO_NET_HDR_GSO_TCPV6)
||
(vh->flags & ~(VIRTIO_NET_HDR_F_NEEDS_CSUM
| VIRTIO_NET_HDR_F_DATA_VALID))
);
}
/* The VALE mismatch datapath implementation. */
void
bdg_mismatch_datapath(struct netmap_vp_adapter *na,
struct netmap_vp_adapter *dst_na,
const struct nm_bdg_fwd *ft_p,
struct netmap_ring *dst_ring,
u_int *j, u_int lim, u_int *howmany)
{
struct netmap_slot *dst_slot = NULL;
struct nm_vnet_hdr *vh = NULL;
const struct nm_bdg_fwd *ft_end = ft_p + ft_p->ft_frags;
/* Source and destination pointers. */
uint8_t *dst, *src;
size_t src_len, dst_len;
/* Indices and counters for the destination ring. */
u_int j_start = *j;
u_int j_cur = j_start;
u_int dst_slots = 0;
if (unlikely(ft_p == ft_end)) {
RD(3, "No source slots to process");
return;
}
/* Init source and dest pointers. */
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
dst_slot = &dst_ring->slot[j_cur];
dst = NMB(&dst_na->up, dst_slot);
dst_len = src_len;
/* If the source port uses the offloadings, while destination doesn't,
* we grab the source virtio-net header and do the offloadings here.
*/
if (na->up.virt_hdr_len && !dst_na->up.virt_hdr_len) {
vh = (struct nm_vnet_hdr *)src;
/* Initial sanity check on the source virtio-net header. If
* something seems wrong, just drop the packet. */
if (src_len < na->up.virt_hdr_len) {
RD(3, "Short src vnet header, dropping");
return;
}
if (vnet_hdr_is_bad(vh)) {
RD(3, "Bad src vnet header, dropping");
return;
}
}
/* We are processing the first input slot and there is a mismatch
* between source and destination virt_hdr_len (SHL and DHL).
* When the a client is using virtio-net headers, the header length
* can be:
* - 10: the header corresponds to the struct nm_vnet_hdr
* - 12: the first 10 bytes correspond to the struct
* virtio_net_hdr, and the last 2 bytes store the
* "mergeable buffers" info, which is an optional
* hint that can be zeroed for compatibility
*
* The destination header is therefore built according to the
* following table:
*
* SHL | DHL | destination header
* -----------------------------
* 0 | 10 | zero
* 0 | 12 | zero
* 10 | 0 | doesn't exist
* 10 | 12 | first 10 bytes are copied from source header, last 2 are zero
* 12 | 0 | doesn't exist
* 12 | 10 | copied from the first 10 bytes of source header
*/
bzero(dst, dst_na->up.virt_hdr_len);
if (na->up.virt_hdr_len && dst_na->up.virt_hdr_len)
memcpy(dst, src, sizeof(struct nm_vnet_hdr));
/* Skip the virtio-net headers. */
src += na->up.virt_hdr_len;
src_len -= na->up.virt_hdr_len;
dst += dst_na->up.virt_hdr_len;
dst_len = dst_na->up.virt_hdr_len + src_len;
/* Here it could be dst_len == 0 (which implies src_len == 0),
* so we avoid passing a zero length fragment.
*/
if (dst_len == 0) {
ft_p++;
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
dst_len = src_len;
}
if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
u_int gso_bytes = 0;
/* Length of the GSO packet header. */
u_int gso_hdr_len = 0;
/* Pointer to the GSO packet header. Assume it is in a single fragment. */
uint8_t *gso_hdr = NULL;
/* Index of the current segment. */
u_int gso_idx = 0;
/* Payload data bytes segmented so far (e.g. TCP data bytes). */
u_int segmented_bytes = 0;
/* Is this an IPv4 or IPv6 GSO packet? */
u_int ipv4 = 0;
/* Length of the IP header (20 if IPv4, 40 if IPv6). */
u_int iphlen = 0;
/* Length of the Ethernet header (18 if 802.1q, otherwise 14). */
u_int ethhlen = 14;
/* Is this a TCP or an UDP GSO packet? */
u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN)
== VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1;
/* Segment the GSO packet contained into the input slots (frags). */
for (;;) {
size_t copy;
if (dst_slots >= *howmany) {
/* We still have work to do, but we've run out of
* dst slots, so we have to drop the packet. */
RD(3, "Not enough slots, dropping GSO packet");
return;
}
/* Grab the GSO header if we don't have it. */
if (!gso_hdr) {
uint16_t ethertype;
gso_hdr = src;
/* Look at the 'Ethertype' field to see if this packet
* is IPv4 or IPv6, taking into account VLAN
* encapsulation. */
for (;;) {
if (src_len < ethhlen) {
RD(3, "Short GSO fragment [eth], dropping");
return;
}
ethertype = be16toh(*((uint16_t *)
(gso_hdr + ethhlen - 2)));
if (ethertype != 0x8100) /* not 802.1q */
break;
ethhlen += 4;
}
switch (ethertype) {
case 0x0800: /* IPv4 */
{
struct nm_iphdr *iph = (struct nm_iphdr *)
(gso_hdr + ethhlen);
if (src_len < ethhlen + 20) {
RD(3, "Short GSO fragment "
"[IPv4], dropping");
return;
}
ipv4 = 1;
iphlen = 4 * (iph->version_ihl & 0x0F);
break;
}
case 0x86DD: /* IPv6 */
ipv4 = 0;
iphlen = 40;
break;
default:
RD(3, "Unsupported ethertype, "
"dropping GSO packet");
return;
}
ND(3, "type=%04x", ethertype);
if (src_len < ethhlen + iphlen) {
RD(3, "Short GSO fragment [IP], dropping");
return;
}
/* Compute gso_hdr_len. For TCP we need to read the
* content of the 'Data Offset' field.
*/
if (tcp) {
struct nm_tcphdr *tcph = (struct nm_tcphdr *)
(gso_hdr + ethhlen + iphlen);
if (src_len < ethhlen + iphlen + 20) {
RD(3, "Short GSO fragment "
"[TCP], dropping");
return;
}
gso_hdr_len = ethhlen + iphlen +
4 * (tcph->doff >> 4);
} else {
gso_hdr_len = ethhlen + iphlen + 8; /* UDP */
}
if (src_len < gso_hdr_len) {
RD(3, "Short GSO fragment [TCP/UDP], dropping");
return;
}
ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len,
dst_na->mfs);
/* Advance source pointers. */
src += gso_hdr_len;
src_len -= gso_hdr_len;
if (src_len == 0) {
ft_p++;
if (ft_p == ft_end)
break;
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
}
}
/* Fill in the header of the current segment. */
if (gso_bytes == 0) {
memcpy(dst, gso_hdr, gso_hdr_len);
gso_bytes = gso_hdr_len;
}
/* Fill in data and update source and dest pointers. */
copy = src_len;
if (gso_bytes + copy > dst_na->mfs)
copy = dst_na->mfs - gso_bytes;
memcpy(dst + gso_bytes, src, copy);
gso_bytes += copy;
src += copy;
src_len -= copy;
/* A segment is complete or we have processed all the
the GSO payload bytes. */
if (gso_bytes >= dst_na->mfs ||
(src_len == 0 && ft_p + 1 == ft_end)) {
/* After raw segmentation, we must fix some header
* fields and compute checksums, in a protocol dependent
* way. */
gso_fix_segment(dst + ethhlen, gso_bytes - ethhlen,
ipv4, iphlen, tcp,
gso_idx, segmented_bytes,
src_len == 0 && ft_p + 1 == ft_end);
ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes);
dst_slot->len = gso_bytes;
dst_slot->flags = 0;
dst_slots++;
segmented_bytes += gso_bytes - gso_hdr_len;
gso_bytes = 0;
gso_idx++;
/* Next destination slot. */
j_cur = nm_next(j_cur, lim);
dst_slot = &dst_ring->slot[j_cur];
dst = NMB(&dst_na->up, dst_slot);
}
/* Next input slot. */
if (src_len == 0) {
ft_p++;
if (ft_p == ft_end)
break;
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
}
}
ND(3, "%d bytes segmented", segmented_bytes);
} else {
/* Address of a checksum field into a destination slot. */
uint16_t *check = NULL;
/* Accumulator for an unfolded checksum. */
rawsum_t csum = 0;
/* Process a non-GSO packet. */
/* Init 'check' if necessary. */
if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
if (unlikely(vh->csum_offset + vh->csum_start > src_len))
D("invalid checksum request");
else
check = (uint16_t *)(dst + vh->csum_start +
vh->csum_offset);
}
while (ft_p != ft_end) {
/* Init/update the packet checksum if needed. */
if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
if (!dst_slots)
csum = nm_os_csum_raw(src + vh->csum_start,
src_len - vh->csum_start, 0);
else
csum = nm_os_csum_raw(src, src_len, csum);
}
/* Round to a multiple of 64 */
src_len = (src_len + 63) & ~63;
if (ft_p->ft_flags & NS_INDIRECT) {
if (copyin(src, dst, src_len)) {
/* Invalid user pointer, pretend len is 0. */
dst_len = 0;
}
} else {
memcpy(dst, src, (int)src_len);
}
dst_slot->len = dst_len;
dst_slots++;
/* Next destination slot. */
j_cur = nm_next(j_cur, lim);
dst_slot = &dst_ring->slot[j_cur];
dst = NMB(&dst_na->up, dst_slot);
/* Next source slot. */
ft_p++;
src = ft_p->ft_buf;
dst_len = src_len = ft_p->ft_len;
}
/* Finalize (fold) the checksum if needed. */
if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
*check = nm_os_csum_fold(csum);
}
ND(3, "using %u dst_slots", dst_slots);
/* A second pass on the destination slots to set the slot flags,
* using the right number of destination slots.
*/
while (j_start != j_cur) {
dst_slot = &dst_ring->slot[j_start];
dst_slot->flags = (dst_slots << 8)| NS_MOREFRAG;
j_start = nm_next(j_start, lim);
}
/* Clear NS_MOREFRAG flag on last entry. */
dst_slot->flags = (dst_slots << 8);
}
/* Update howmany and j. This is to commit the use of
* those slots in the destination ring. */
if (unlikely(dst_slots > *howmany)) {
D("Slot allocation error: This is a bug");
}
*j = j_cur;
*howmany -= dst_slots;
}