tap: add support for virtio-net offloads

This patch is part of an effort to make bhyve networking (in particular TCP)
faster. The key strategy to enhance TCP throughput is to let the whole packet
datapath work with TSO/LRO packets (up to 64KB each), so that the per-packet
overhead is amortized over a large number of bytes.
This capability is supported in the guest by means of the vtnet(4) driver,
which is able to handle TSO/LRO packets leveraging the virtio-net header
(see struct virtio_net_hdr and struct virtio_net_hdr_mrg_rxbuf).
A bhyve VM exchanges packets with the host through a network backend,
which can be vale(4) or if_tap(4).
While vale(4) supports TSO/LRO packets, if_tap(4) does not.
This patch extends if_tap(4) with the ability to understand the virtio-net
header, so that a tapX interface can process TSO/LRO packets.
A couple of ioctl commands have been added to configure and probe the
virtio-net header. Once the virtio-net header is set, the tapX interface
acquires all the IFCAP capabilities necessary for TSO/LRO.

Reviewed by:	kevans
Differential Revision:	https://reviews.freebsd.org/D21263
This commit is contained in:
Vincenzo Maffione 2019-10-18 21:53:27 +00:00
parent 43e4b6ca7f
commit f8bc74e2f4
4 changed files with 459 additions and 319 deletions
sys

@ -1335,150 +1335,6 @@ ptnet_rx_intr(void *opaque)
ptnet_rx_eof(pq, PTNET_RX_BUDGET, true);
}
/* The following offload-related functions are taken from the vtnet
* driver, but the same functionality is required for the ptnet driver.
* As a temporary solution, I copied this code from vtnet and started
* to generalize it (taking away driver-specific statistics accounting),
* making as few modifications as possible.
* In the future we need to share these functions between vtnet and ptnet.
*/
/*
 * Work out where the transport header of an outgoing Ethernet frame
 * starts.
 *
 * On success returns 0 with *etype set to the ethertype (taken from
 * behind a single VLAN tag, if one is present), *proto set to the
 * transport protocol and *start set to the byte offset of the transport
 * header.  Returns EINVAL, with only *etype written, when the ethertype
 * is not one handled here (IPv4/IPv6, depending on INET/INET6).
 */
static int
ptnet_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start)
{
	struct ether_vlan_header *l2hdr;
	int l3_off;

	l2hdr = mtod(m, struct ether_vlan_header *);
	if (l2hdr->evl_encap_proto != htons(ETHERTYPE_VLAN)) {
		*etype = ntohs(l2hdr->evl_encap_proto);
		l3_off = sizeof(struct ether_header);
	} else {
		/* BMV: We should handle nested VLAN tags too. */
		*etype = ntohs(l2hdr->evl_proto);
		l3_off = sizeof(struct ether_vlan_header);
	}

	switch (*etype) {
#if defined(INET)
	case ETHERTYPE_IP: {
		struct ip *iph, iph_copy;

		/*
		 * If the IP header is not entirely in the first mbuf,
		 * read it from a linear copy.
		 */
		if (__predict_false(m->m_len < l3_off + sizeof(struct ip))) {
			m_copydata(m, l3_off, sizeof(struct ip),
			    (caddr_t) &iph_copy);
			iph = &iph_copy;
		} else {
			iph = (struct ip *)(m->m_data + l3_off);
		}
		*proto = iph->ip_p;
		*start = l3_off + (iph->ip_hl << 2);
		return (0);
	}
#endif
#if defined(INET6)
	case ETHERTYPE_IPV6:
		*proto = -1;
		*start = ip6_lasthdr(m, l3_off, IPPROTO_IPV6, proto);
		/* Assert the network stack sent us a valid packet. */
		KASSERT(*start > l3_off,
		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
		    *start, l3_off, *proto));
		return (0);
#endif
	default:
		break;
	}

	/* Here we should increment the tx_csum_bad_ethtype counter. */
	return (EINVAL);
}
/*
 * Fill in the TSO fields (hdr_len, gso_size, gso_type) of the virtio-net
 * header for a TCP segment whose TCP header starts at 'offset'.
 *
 * Returns 0 on success.  Returns ENOTSUP if the segment carries TH_CWR
 * and 'allow_ecn' is false; the fields above have already been written
 * in that case.
 */
static int
ptnet_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type,
    int offset, bool allow_ecn, struct virtio_net_hdr *hdr)
{
	static struct timeval ecn_last;
	static int ecn_count;
	struct tcphdr *th, th_copy;

	/* Read from a linear copy if the header is not in the first mbuf. */
	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
		m_copydata(m, offset, sizeof(struct tcphdr),
		    (caddr_t) &th_copy);
		th = &th_copy;
	} else {
		th = (struct tcphdr *)(m->m_data + offset);
	}

	hdr->hdr_len = offset + (th->th_off << 2);
	hdr->gso_size = m->m_pkthdr.tso_segsz;
	if (eth_type == ETHERTYPE_IP)
		hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
	else
		hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;

	if ((th->th_flags & TH_CWR) == 0) {
		/* Here we should increment tx_tso counter. */
		return (0);
	}

	if (allow_ecn) {
		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		/* Here we should increment tx_tso counter. */
		return (0);
	}

	/*
	 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
	 * ECN support is not on a per-interface basis, but globally via
	 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
	 */
	if (ppsratecheck(&ecn_last, &ecn_count, 1))
		if_printf(ifp,
		    "TSO with ECN not negotiated with host\n");
	return (ENOTSUP);
}
/*
 * Translate the checksum/TSO offload requests carried in the mbuf packet
 * header into the virtio-net header 'hdr'.
 *
 * Returns 'm' on success.  On failure the mbuf is freed and NULL is
 * returned, so the caller must not touch 'm' again.
 */
static struct mbuf *
ptnet_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn,
    struct virtio_net_hdr *hdr)
{
	int csum_flags, ethertype, l4_proto, l4_start;
	int wants_csum;

	csum_flags = m->m_pkthdr.csum_flags;

	if (ptnet_tx_offload_ctx(m, &ethertype, &l4_proto, &l4_start) != 0) {
		m_freem(m);
		return (NULL);
	}

	if (ethertype == ETHERTYPE_IP)
		wants_csum = (csum_flags & PTNET_CSUM_OFFLOAD) != 0;
	else if (ethertype == ETHERTYPE_IPV6)
		wants_csum = (csum_flags & PTNET_CSUM_OFFLOAD_IPV6) != 0;
	else
		wants_csum = 0;

	if (wants_csum) {
		/*
		 * We could compare the IP protocol vs the CSUM_ flag too,
		 * but that really should not be necessary.
		 */
		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
		hdr->csum_start = l4_start;
		hdr->csum_offset = m->m_pkthdr.csum_data;
		/* Here we should increment the tx_csum counter. */
	}

	if ((csum_flags & CSUM_TSO) == 0)
		return (m);

	if (__predict_false(l4_proto != IPPROTO_TCP)) {
		/*
		 * Likely failed to correctly parse the mbuf.
		 * Here we should increment the tx_tso_not_tcp counter.
		 */
		m_freem(m);
		return (NULL);
	}

	KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
	    ("%s: mbuf %p TSO without checksum offload %#x",
	    __func__, m, csum_flags));

	if (ptnet_tx_offload_tso(ifp, m, ethertype, l4_start, allow_ecn,
	    hdr) != 0) {
		m_freem(m);
		return (NULL);
	}

	return (m);
}
static void
ptnet_vlan_tag_remove(struct mbuf *m)
{
@ -1494,157 +1350,6 @@ ptnet_vlan_tag_remove(struct mbuf *m)
m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
/*
 * Derive the CSUM_* flags for a received packet from the checksum
 * location (csum_start + csum_offset) advertised in the VirtIO header.
 *
 * Returns 0 when the mbuf has been marked as carrying a valid checksum,
 * 1 when the ethertype or the checksum offset is not acceptable.
 */
static int
ptnet_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start,
    struct virtio_net_hdr *hdr)
{
#if defined(INET) || defined(INET6)
	int csum_pos = hdr->csum_start + hdr->csum_offset;
#endif

	/*
	 * Only do a basic sanity check on the offset: the checksum field
	 * cannot lie inside the fixed part of the IP header.
	 */
	switch (eth_type) {
#if defined(INET)
	case ETHERTYPE_IP:
		if (__predict_false(csum_pos < ip_start + sizeof(struct ip)))
			return (1);
		break;
#endif
#if defined(INET6)
	case ETHERTYPE_IPV6:
		if (__predict_false(csum_pos <
		    ip_start + sizeof(struct ip6_hdr)))
			return (1);
		break;
#endif
	default:
		/* Here we should increment the rx_csum_bad_ethtype counter. */
		return (1);
	}

	/*
	 * Use the offset to determine the appropriate CSUM_* flags. This is
	 * a bit dirty, but we can get by with it since the checksum offsets
	 * happen to be different. We assume the host does not do IPv4
	 * header checksum offloading.
	 */
	if (hdr->csum_offset != offsetof(struct udphdr, uh_sum) &&
	    hdr->csum_offset != offsetof(struct tcphdr, th_sum)) {
		/* Here we should increment the rx_csum_bad_offset counter. */
		return (1);
	}

	m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	m->m_pkthdr.csum_data = 0xFFFF;
	return (0);
}
/*
 * Derive the CSUM_* flags for a received packet by parsing its IP
 * header to find the transport protocol.
 *
 * Returns 1 when the ethertype is not handled or when the headers that
 * need to be inspected are not contained in the first mbuf; 0 otherwise.
 * Only TCP and UDP packets get marked as carrying a valid checksum.
 */
static int
ptnet_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start,
    struct virtio_net_hdr *hdr)
{
	int offset, proto;
	size_t l4_hdr_min;

	switch (eth_type) {
#if defined(INET)
	case ETHERTYPE_IP: {
		struct ip *iph;

		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
			return (1);
		iph = (struct ip *)(m->m_data + ip_start);
		proto = iph->ip_p;
		offset = ip_start + (iph->ip_hl << 2);
		break;
	}
#endif
#if defined(INET6)
	case ETHERTYPE_IPV6:
		if (__predict_false(m->m_len < ip_start +
		    sizeof(struct ip6_hdr)))
			return (1);
		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
		if (__predict_false(offset < 0))
			return (1);
		break;
#endif
	default:
		/* Here we should increment the rx_csum_bad_ethtype counter. */
		return (1);
	}

	switch (proto) {
	case IPPROTO_TCP:
		l4_hdr_min = sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		l4_hdr_min = sizeof(struct udphdr);
		break;
	default:
		/*
		 * For the remaining protocols, FreeBSD does not support
		 * checksum offloading, so the checksum will be recomputed.
		 */
#if 0
		if_printf(ifp, "cksum offload of unsupported "
		    "protocol eth_type=%#x proto=%d csum_start=%d "
		    "csum_offset=%d\n", __func__, eth_type, proto,
		    hdr->csum_start, hdr->csum_offset);
#endif
		return (0);
	}

	/* The transport header must be fully present in the first mbuf. */
	if (__predict_false(m->m_len < offset + l4_hdr_min))
		return (1);

	m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	m->m_pkthdr.csum_data = 0xFFFF;
	return (0);
}
/*
* Set the appropriate CSUM_* flags. Unfortunately, the information
* provided is not directly useful to us. The VirtIO header gives the
* offset of the checksum, which is all Linux needs, but this is not
* how FreeBSD does things. We are forced to peek inside the packet
* a bit.
*
* It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
* could accept the offsets and let the stack figure it out.
*/
static int
ptnet_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
{
struct ether_header *eh;
struct ether_vlan_header *evh;
uint16_t eth_type;
int offset, error;
eh = mtod(m, struct ether_header *);
eth_type = ntohs(eh->ether_type);
if (eth_type == ETHERTYPE_VLAN) {
/* BMV: We should handle nested VLAN tags too. */
evh = mtod(m, struct ether_vlan_header *);
eth_type = ntohs(evh->evl_proto);
offset = sizeof(struct ether_vlan_header);
} else
offset = sizeof(struct ether_header);
if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
error = ptnet_rx_csum_by_offset(m, eth_type, offset, hdr);
else
error = ptnet_rx_csum_by_parse(m, eth_type, offset, hdr);
return (error);
}
/* End of offloading-related functions to be shared with vtnet. */
static void
ptnet_ring_update(struct ptnet_queue *pq, struct netmap_kring *kring,
unsigned int head, unsigned int sync_flags)
@ -1776,7 +1481,7 @@ ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget,
* two 8-bytes-wide writes. */
memset(nmbuf, 0, PTNET_HDR_SIZE);
if (mhead->m_pkthdr.csum_flags & PTNET_ALL_OFFLOAD) {
mhead = ptnet_tx_offload(ifp, mhead, false,
mhead = virtio_net_tx_offload(ifp, mhead, false,
vh);
if (unlikely(!mhead)) {
/* Packet dropped because errors
@ -2154,15 +1859,12 @@ host_sync:
}
}
if (have_vnet_hdr && (vh->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM
| VIRTIO_NET_HDR_F_DATA_VALID))) {
if (unlikely(ptnet_rx_csum(mhead, vh))) {
if (unlikely(have_vnet_hdr && virtio_net_rx_csum(mhead, vh))) {
m_freem(mhead);
nm_prlim(1, "Csum offload error: dropping");
pq->stats.iqdrops ++;
deliver = 0;
}
}
skip:
count ++;

@ -201,4 +201,297 @@ struct virtio_net_ctrl_mq {
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000
/*
 * Use the checksum location (csum_start + csum_offset) advertised in
 * the VirtIO header to set the correct CSUM_* flags on a received
 * packet.
 *
 * Returns 0 when the mbuf has been marked as carrying a valid checksum,
 * 1 when the ethertype or the checksum offset is not acceptable.
 */
static inline int
virtio_net_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start,
    struct virtio_net_hdr *hdr)
{
#if defined(INET) || defined(INET6)
	int csum_pos = hdr->csum_start + hdr->csum_offset;
#endif

	/*
	 * Only do a basic sanity check on the offset: the checksum field
	 * cannot lie inside the fixed part of the IP header.
	 */
	switch (eth_type) {
#if defined(INET)
	case ETHERTYPE_IP:
		if (__predict_false(csum_pos < ip_start + sizeof(struct ip)))
			return (1);
		break;
#endif
#if defined(INET6)
	case ETHERTYPE_IPV6:
		if (__predict_false(csum_pos <
		    ip_start + sizeof(struct ip6_hdr)))
			return (1);
		break;
#endif
	default:
		/* Here we should increment the rx_csum_bad_ethtype counter. */
		return (1);
	}

	/*
	 * Use the offset to determine the appropriate CSUM_* flags. This is
	 * a bit dirty, but we can get by with it since the checksum offsets
	 * happen to be different. We assume the host does not do IPv4
	 * header checksum offloading.
	 */
	if (hdr->csum_offset != offsetof(struct udphdr, uh_sum) &&
	    hdr->csum_offset != offsetof(struct tcphdr, th_sum)) {
		/* Here we should increment the rx_csum_bad_offset counter. */
		return (1);
	}

	m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	m->m_pkthdr.csum_data = 0xFFFF;
	return (0);
}
/*
 * Set the CSUM_* flags on a received packet by parsing its IP header to
 * find the transport protocol.
 *
 * Returns 1 when the ethertype is not handled or when the headers that
 * need to be inspected are not contained in the first mbuf; 0 otherwise.
 * Only TCP and UDP packets get marked as carrying a valid checksum.
 */
static inline int
virtio_net_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start,
    struct virtio_net_hdr *hdr)
{
	int offset, proto;
	size_t l4_hdr_min;

	switch (eth_type) {
#if defined(INET)
	case ETHERTYPE_IP: {
		struct ip *iph;

		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
			return (1);
		iph = (struct ip *)(m->m_data + ip_start);
		proto = iph->ip_p;
		offset = ip_start + (iph->ip_hl << 2);
		break;
	}
#endif
#if defined(INET6)
	case ETHERTYPE_IPV6:
		if (__predict_false(m->m_len < ip_start +
		    sizeof(struct ip6_hdr)))
			return (1);
		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
		if (__predict_false(offset < 0))
			return (1);
		break;
#endif
	default:
		/* Here we should increment the rx_csum_bad_ethtype counter. */
		return (1);
	}

	switch (proto) {
	case IPPROTO_TCP:
		l4_hdr_min = sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		l4_hdr_min = sizeof(struct udphdr);
		break;
	default:
		/*
		 * For the remaining protocols, FreeBSD does not support
		 * checksum offloading, so the checksum will be recomputed.
		 */
#if 0
		if_printf(ifp, "cksum offload of unsupported "
		    "protocol eth_type=%#x proto=%d csum_start=%d "
		    "csum_offset=%d\n", __func__, eth_type, proto,
		    hdr->csum_start, hdr->csum_offset);
#endif
		return (0);
	}

	/* The transport header must be fully present in the first mbuf. */
	if (__predict_false(m->m_len < offset + l4_hdr_min))
		return (1);

	m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	m->m_pkthdr.csum_data = 0xFFFF;
	return (0);
}
/*
* Set the appropriate CSUM_* flags. Unfortunately, the information
* provided is not directly useful to us. The VirtIO header gives the
* offset of the checksum, which is all Linux needs, but this is not
* how FreeBSD does things. We are forced to peek inside the packet
* a bit.
*
* It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
* could accept the offsets and let the stack figure it out.
*/
static inline int
virtio_net_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
{
struct ether_header *eh;
struct ether_vlan_header *evh;
uint16_t eth_type;
int offset, error;
if ((hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
VIRTIO_NET_HDR_F_DATA_VALID)) == 0) {
return (0);
}
eh = mtod(m, struct ether_header *);
eth_type = ntohs(eh->ether_type);
if (eth_type == ETHERTYPE_VLAN) {
/* BMV: We should handle nested VLAN tags too. */
evh = mtod(m, struct ether_vlan_header *);
eth_type = ntohs(evh->evl_proto);
offset = sizeof(struct ether_vlan_header);
} else
offset = sizeof(struct ether_header);
if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
error = virtio_net_rx_csum_by_offset(m, eth_type, offset, hdr);
else
error = virtio_net_rx_csum_by_parse(m, eth_type, offset, hdr);
return (error);
}
/*
 * Work out where the transport header of an outgoing Ethernet frame
 * starts.
 *
 * On success returns 0 with *etype set to the ethertype (taken from
 * behind a single VLAN tag, if one is present), *proto set to the
 * transport protocol and *start set to the byte offset of the transport
 * header.  Returns EINVAL, with only *etype written, when the ethertype
 * is not one handled here (IPv4/IPv6, depending on INET/INET6).
 */
static inline int
virtio_net_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start)
{
	struct ether_vlan_header *l2hdr;
	int l3_off;

	l2hdr = mtod(m, struct ether_vlan_header *);
	if (l2hdr->evl_encap_proto != htons(ETHERTYPE_VLAN)) {
		*etype = ntohs(l2hdr->evl_encap_proto);
		l3_off = sizeof(struct ether_header);
	} else {
		/* BMV: We should handle nested VLAN tags too. */
		*etype = ntohs(l2hdr->evl_proto);
		l3_off = sizeof(struct ether_vlan_header);
	}

	switch (*etype) {
#if defined(INET)
	case ETHERTYPE_IP: {
		struct ip *iph, iph_copy;

		/*
		 * If the IP header is not entirely in the first mbuf,
		 * read it from a linear copy.
		 */
		if (__predict_false(m->m_len < l3_off + sizeof(struct ip))) {
			m_copydata(m, l3_off, sizeof(struct ip),
			    (caddr_t) &iph_copy);
			iph = &iph_copy;
		} else {
			iph = (struct ip *)(m->m_data + l3_off);
		}
		*proto = iph->ip_p;
		*start = l3_off + (iph->ip_hl << 2);
		return (0);
	}
#endif
#if defined(INET6)
	case ETHERTYPE_IPV6:
		*proto = -1;
		*start = ip6_lasthdr(m, l3_off, IPPROTO_IPV6, proto);
		/* Assert the network stack sent us a valid packet. */
		KASSERT(*start > l3_off,
		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
		    *start, l3_off, *proto));
		return (0);
#endif
	default:
		break;
	}

	/* Here we should increment the tx_csum_bad_ethtype counter. */
	return (EINVAL);
}
/*
 * Fill in the TSO fields (hdr_len, gso_size, gso_type) of the virtio-net
 * header for a TCP segment whose TCP header starts at 'offset'.
 *
 * Returns 0 on success.  Returns ENOTSUP if the segment carries TH_CWR
 * and 'allow_ecn' is false; the fields above have already been written
 * in that case.
 */
static inline int
virtio_net_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type,
    int offset, bool allow_ecn, struct virtio_net_hdr *hdr)
{
	static struct timeval ecn_last;
	static int ecn_count;
	struct tcphdr *th, th_copy;

	/* Read from a linear copy if the header is not in the first mbuf. */
	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
		m_copydata(m, offset, sizeof(struct tcphdr),
		    (caddr_t) &th_copy);
		th = &th_copy;
	} else {
		th = (struct tcphdr *)(m->m_data + offset);
	}

	hdr->hdr_len = offset + (th->th_off << 2);
	hdr->gso_size = m->m_pkthdr.tso_segsz;
	if (eth_type == ETHERTYPE_IP)
		hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
	else
		hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;

	if ((th->th_flags & TH_CWR) == 0) {
		/* Here we should increment tx_tso counter. */
		return (0);
	}

	if (allow_ecn) {
		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		/* Here we should increment tx_tso counter. */
		return (0);
	}

	/*
	 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
	 * ECN support is not on a per-interface basis, but globally via
	 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
	 */
	if (ppsratecheck(&ecn_last, &ecn_count, 1))
		if_printf(ifp,
		    "TSO with ECN not negotiated with host\n");
	return (ENOTSUP);
}
/*
 * Translate the checksum/TSO offload requests carried in the mbuf packet
 * header into the virtio-net header 'hdr'.
 *
 * Returns 'm' on success.  On failure the mbuf is freed and NULL is
 * returned, so the caller must not touch 'm' again.
 */
static inline struct mbuf *
virtio_net_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn,
    struct virtio_net_hdr *hdr)
{
	int csum_flags, ethertype, l4_proto, l4_start;
	int wants_csum;

	csum_flags = m->m_pkthdr.csum_flags;

	if (virtio_net_tx_offload_ctx(m, &ethertype, &l4_proto,
	    &l4_start) != 0) {
		m_freem(m);
		return (NULL);
	}

	if (ethertype == ETHERTYPE_IP)
		wants_csum = (csum_flags & (CSUM_TCP | CSUM_UDP)) != 0;
	else if (ethertype == ETHERTYPE_IPV6)
		wants_csum =
		    (csum_flags & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6)) != 0;
	else
		wants_csum = 0;

	if (wants_csum) {
		/*
		 * We could compare the IP protocol vs the CSUM_ flag too,
		 * but that really should not be necessary.
		 */
		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
		hdr->csum_start = l4_start;
		hdr->csum_offset = m->m_pkthdr.csum_data;
		/* Here we should increment the tx_csum counter. */
	}

	if ((csum_flags & CSUM_TSO) == 0)
		return (m);

	if (__predict_false(l4_proto != IPPROTO_TCP)) {
		/*
		 * Likely failed to correctly parse the mbuf.
		 * Here we should increment the tx_tso_not_tcp counter.
		 */
		m_freem(m);
		return (NULL);
	}

	KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
	    ("%s: mbuf %p TSO without checksum offload %#x",
	    __func__, m, csum_flags));

	if (virtio_net_tx_offload_tso(ifp, m, ethertype, l4_start, allow_ecn,
	    hdr) != 0) {
		m_freem(m);
		return (NULL);
	}

	return (m);
}
#endif /* _VIRTIO_NET_H */

@ -43,7 +43,7 @@
#include <net/if_tun.h>
/* maximum receive packet size (hard limit) */
#define TAPMRU 16384
#define TAPMRU 65535
#define tapinfo tuninfo
@ -56,6 +56,8 @@
#define TAPSIFINFO TUNSIFINFO
#define TAPGIFINFO TUNGIFINFO
#define TAPGIFNAME TUNGIFNAME
/*
 * Set the length (int) of the virtio-net header used on the tap device.
 * The handler accepts 0, sizeof(struct virtio_net_hdr) or
 * sizeof(struct virtio_net_hdr_mrg_rxbuf); anything else is EINVAL.
 */
#define TAPSVNETHDR _IOW('t', 91, int)
/* Get the currently configured virtio-net header length (int). */
#define TAPGVNETHDR _IOR('t', 94, int)
/* VMware ioctl's */
#define VMIO_SIOCSIFFLAGS _IOWINT('V', 0)

@ -84,16 +84,24 @@
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/vnet.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#endif
#include <net/bpf.h>
#include <net/if_tap.h>
#include <net/if_tun.h>
#include <dev/virtio/network/virtio_net.h>
#include <sys/queue.h>
#include <sys/condvar.h>
#include <security/mac/mac_framework.h>
@ -134,6 +142,7 @@ struct tuntap_softc {
struct cv tun_cv; /* for ref'd dev destroy */
struct ether_addr tun_ether; /* remote address */
int tun_busy; /* busy count */
int tun_vhdrlen; /* virtio-net header length */
};
#define TUN2IFP(sc) ((sc)->tun_ifp)
@ -145,6 +154,19 @@ struct tuntap_softc {
#define TUN_VMIO_FLAG_MASK 0x0fff
/*
* Interface capabilities of a tap device that supports the virtio-net
* header. tun_vnethdr_set() adds these to if_capabilities when a
* non-zero header length is configured and removes them when the
* length goes back to 0.
*/
#define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \
| IFCAP_VLAN_HWCSUM \
| IFCAP_TSO | IFCAP_LRO \
| IFCAP_VLAN_HWTSO)
/*
* mbuf csum_flags bits for which the read path asks
* virtio_net_tx_offload() to fill in the virtio-net header.
*/
#define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\
CSUM_TCP_IPV6 | CSUM_UDP_IPV6)
/*
* All mutable global variables in if_tun are locked using tunmtx, with
* the exception of tundebug, which is used unlocked, and the drivers' *clones,
@ -211,6 +233,7 @@ static int tap_clone_match(struct if_clone *ifc, const char *name);
static int vmnet_clone_match(struct if_clone *ifc, const char *name);
static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t);
static int tun_clone_destroy(struct if_clone *, struct ifnet *);
static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen);
static d_open_t tunopen;
static d_close_t tunclose;
@ -1140,6 +1163,7 @@ out:
TUNDEBUG (ifp, "closed\n");
tp->tun_flags &= ~TUN_OPEN;
tp->tun_pid = 0;
tun_vnethdr_set(ifp, 0);
tun_unbusy_locked(tp);
TUN_UNLOCK(tp);
@ -1201,6 +1225,65 @@ tunifinit(void *xtp)
tuninit(tp->tun_ifp);
}
/*
 * Recompute ifp->if_hwassist from the capabilities currently enabled in
 * ifp->if_capenable.  Must be called with TUN_LOCK held.
 */
static void
tun_caps_changed(struct ifnet *ifp)
{
	uint64_t assist;
	int enabled;

	TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc);

	enabled = ifp->if_capenable;
	assist = 0;

	/* Transmit checksum offload, per address family. */
	if ((enabled & IFCAP_TXCSUM) != 0)
		assist |= CSUM_TCP | CSUM_UDP;
	if ((enabled & IFCAP_TXCSUM_IPV6) != 0)
		assist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;

	/* TCP segmentation offload, per address family. */
	if ((enabled & IFCAP_TSO4) != 0)
		assist |= CSUM_IP_TSO;
	if ((enabled & IFCAP_TSO6) != 0)
		assist |= CSUM_IP6_TSO;

	ifp->if_hwassist = assist;
}
/*
 * Record a new virtio-net header length in tp->tun_vhdrlen and adjust
 * if_capabilities, if_capenable and if_hwassist to match.  A length of 0
 * withdraws the virtio-net related capabilities.  Does nothing if the
 * length is unchanged.  Must be called with TUN_LOCK held.
 */
static void
tun_vnethdr_set(struct ifnet *ifp, int vhdrlen)
{
	struct tuntap_softc *sc = ifp->if_softc;

	TUN_LOCK_ASSERT(sc);

	if (vhdrlen == sc->tun_vhdrlen)
		return;

	/*
	 * The offload capabilities are only available while a virtio-net
	 * header is in use.
	 */
	if (vhdrlen == 0)
		ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS;
	else
		ifp->if_capabilities |= TAP_VNET_HDR_CAPS;

	/* Anything no longer supported cannot stay enabled. */
	ifp->if_capenable &= ifp->if_capabilities;
	tun_caps_changed(ifp);

	sc->tun_vhdrlen = vhdrlen;

	TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n",
	    vhdrlen, ifp->if_capabilities);
}
/*
* Process an ioctl request.
*/
@ -1268,6 +1351,13 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
}
break;
case SIOCSIFCAP:
TUN_LOCK(tp);
ifp->if_capenable = ifr->ifr_reqcap;
tun_caps_changed(ifp);
TUN_UNLOCK(tp);
VLAN_CAPABILITIES(ifp);
break;
default:
if (l2tun) {
error = ether_ioctl(ifp, cmd, data);
@ -1378,12 +1468,9 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
{
struct ifreq ifr, *ifrp;
struct tuntap_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
struct tuninfo *tunp;
int error, iflags;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4)
int ival;
#endif
int error, iflags, ival;
bool l2tun;
l2tun = (tp->tun_flags & TUN_L2) != 0;
@ -1405,8 +1492,8 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
iflags |= IFF_UP;
TUN_LOCK(tp);
TUN2IFP(tp)->if_flags = iflags |
(TUN2IFP(tp)->if_flags & IFF_CANTCHANGE);
ifp->if_flags = iflags |
(ifp->if_flags & IFF_CANTCHANGE);
TUN_UNLOCK(tp);
return (0);
@ -1423,6 +1510,24 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
sizeof(tp->tun_ether.octet));
TUN_UNLOCK(tp);
return (0);
case TAPSVNETHDR:
ival = *(int *)data;
if (ival != 0 &&
ival != sizeof(struct virtio_net_hdr) &&
ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
return (EINVAL);
}
TUN_LOCK(tp);
tun_vnethdr_set(ifp, ival);
TUN_UNLOCK(tp);
return (0);
case TAPGVNETHDR:
TUN_LOCK(tp);
*(int *)data = tp->tun_vhdrlen;
TUN_UNLOCK(tp);
return (0);
}
@ -1578,7 +1683,8 @@ tunread(struct cdev *dev, struct uio *uio, int flag)
struct tuntap_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
struct mbuf *m;
int error=0, len;
size_t len;
int error = 0;
TUNDEBUG (ifp, "read\n");
TUN_LOCK(tp);
@ -1611,6 +1717,23 @@ tunread(struct cdev *dev, struct uio *uio, int flag)
if ((tp->tun_flags & TUN_L2) != 0)
BPF_MTAP(ifp, m);
len = min(tp->tun_vhdrlen, uio->uio_resid);
if (len > 0) {
struct virtio_net_hdr_mrg_rxbuf vhdr;
bzero(&vhdr, sizeof(vhdr));
if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) {
m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr);
}
TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
"gs %u, cs %u, co %u\n", vhdr.hdr.flags,
vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
vhdr.hdr.gso_size, vhdr.hdr.csum_start,
vhdr.hdr.csum_offset);
error = uiomove(&vhdr, len, uio);
}
while (m && uio->uio_resid > 0 && error == 0) {
len = min(uio->uio_resid, m->m_len);
if (len != 0)
@ -1626,7 +1749,8 @@ tunread(struct cdev *dev, struct uio *uio, int flag)
}
static int
tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m)
tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m,
struct virtio_net_hdr_mrg_rxbuf *vhdr)
{
struct ether_header *eh;
struct ifnet *ifp;
@ -1651,6 +1775,11 @@ tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m)
return (0);
}
if (vhdr != NULL && virtio_net_rx_csum(m, &vhdr->hdr)) {
m_freem(m);
return (0);
}
/* Pass packet up to parent. */
CURVNET_SET(ifp->if_vnet);
(*ifp->if_input)(ifp, m);
@ -1717,11 +1846,12 @@ tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
static int
tunwrite(struct cdev *dev, struct uio *uio, int flag)
{
struct virtio_net_hdr_mrg_rxbuf vhdr;
struct tuntap_softc *tp;
struct ifnet *ifp;
struct mbuf *m;
uint32_t mru;
int align;
int align, vhdrlen, error;
bool l2tun;
tp = dev->si_drv1;
@ -1735,17 +1865,30 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag)
return (0);
l2tun = (tp->tun_flags & TUN_L2) != 0;
align = 0;
mru = l2tun ? TAPMRU : TUNMRU;
if (l2tun)
vhdrlen = tp->tun_vhdrlen;
align = 0;
if (l2tun) {
align = ETHER_ALIGN;
else if ((tp->tun_flags & TUN_IFHEAD) != 0)
mru += vhdrlen;
} else if ((tp->tun_flags & TUN_IFHEAD) != 0)
mru += sizeof(uint32_t); /* family */
if (uio->uio_resid < 0 || uio->uio_resid > mru) {
TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
return (EIO);
}
if (vhdrlen > 0) {
error = uiomove(&vhdr, vhdrlen, uio);
if (error != 0)
return (error);
TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
"gs %u, cs %u, co %u\n", vhdr.hdr.flags,
vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
vhdr.hdr.gso_size, vhdr.hdr.csum_start,
vhdr.hdr.csum_offset);
}
if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) {
if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
return (ENOBUFS);
@ -1757,7 +1900,7 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag)
#endif
if (l2tun)
return (tunwrite_l2(tp, m));
return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL));
return (tunwrite_l3(tp, m));
}