tap: add support for virtio-net offloads

This patch is part of an effort to make bhyve networking (in particular TCP) faster. The key strategy to enhance TCP throughput is to let the whole packet datapath work with TSO/LRO packets (up to 64KB each), so that the per-packet overhead is amortized over a large number of bytes. This capability is supported in the guest by means of the vtnet(4) driver, which is able to handle TSO/LRO packets leveraging the virtio-net header (see struct virtio_net_hdr and struct virtio_net_hdr_mrg_rxbuf).

A bhyve VM exchanges packets with the host through a network backend, which can be vale(4) or if_tap(4). While vale(4) supports TSO/LRO packets, if_tap(4) does not. This patch extends if_tap(4) with the ability to understand the virtio-net header, so that a tapX interface can process TSO/LRO packets. Two ioctl commands have been added to configure and probe the virtio-net header: TAPSVNETHDR sets the virtio-net header length and TAPGVNETHDR reads it back. Once a non-zero virtio-net header length is set, the tapX interface acquires all the IFCAP capabilities necessary for TSO/LRO.

Reviewed by: kevans
Differential Revision: https://reviews.freebsd.org/D21263
2019-10-18 21:53:27 +00:00 · 2019-10-18 21:53:27 +00:00 · f8bc74e2f4
commit f8bc74e2f4
parent 43e4b6ca7f
4 changed files with 459 additions and 319 deletions
--- a/sys/dev/netmap/if_ptnet.c
+++ b/sys/dev/netmap/if_ptnet.c
@ -1335,150 +1335,6 @@ ptnet_rx_intr(void *opaque)
 	ptnet_rx_eof(pq, PTNET_RX_BUDGET, true);
 }

-/* The following offloadings-related functions are taken from the vtnet
- * driver, but the same functionality is required for the ptnet driver.
- * As a temporary solution, I copied this code from vtnet and I started
- * to generalize it (taking away driver-specific statistic accounting),
- * making as little modifications as possible.
- * In the future we need to share these functions between vtnet and ptnet.
- */
-static int
-ptnet_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start)
-{
-	struct ether_vlan_header *evh;
-	int offset;
-
-	evh = mtod(m, struct ether_vlan_header *);
-	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
-		/* BMV: We should handle nested VLAN tags too. */
-		*etype = ntohs(evh->evl_proto);
-		offset = sizeof(struct ether_vlan_header);
-	} else {
-		*etype = ntohs(evh->evl_encap_proto);
-		offset = sizeof(struct ether_header);
-	}
-
-	switch (*etype) {
-#if defined(INET)
-	case ETHERTYPE_IP: {
-		struct ip *ip, iphdr;
-		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
-			m_copydata(m, offset, sizeof(struct ip),
-			    (caddr_t) &iphdr);
-			ip = &iphdr;
-		} else
-			ip = (struct ip *)(m->m_data + offset);
-		*proto = ip->ip_p;
-		*start = offset + (ip->ip_hl << 2);
-		break;
-	}
-#endif
-#if defined(INET6)
-	case ETHERTYPE_IPV6:
-		*proto = -1;
-		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
-		/* Assert the network stack sent us a valid packet. */
-		KASSERT(*start > offset,
-		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
-		    *start, offset, *proto));
-		break;
-#endif
-	default:
-		/* Here we should increment the tx_csum_bad_ethtype counter. */
-		return (EINVAL);
-	}
-
-	return (0);
-}
-
-static int
-ptnet_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type,
-		     int offset, bool allow_ecn, struct virtio_net_hdr *hdr)
-{
-	static struct timeval lastecn;
-	static int curecn;
-	struct tcphdr *tcp, tcphdr;
-
-	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
-		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
-		tcp = &tcphdr;
-	} else
-		tcp = (struct tcphdr *)(m->m_data + offset);
-
-	hdr->hdr_len = offset + (tcp->th_off << 2);
-	hdr->gso_size = m->m_pkthdr.tso_segsz;
-	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
-	    VIRTIO_NET_HDR_GSO_TCPV6;
-
-	if (tcp->th_flags & TH_CWR) {
-		/*
-		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
-		 * ECN support is not on a per-interface basis, but globally via
-		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
-		 */
-		if (!allow_ecn) {
-			if (ppsratecheck(&lastecn, &curecn, 1))
-				if_printf(ifp,
-				    "TSO with ECN not negotiated with host\n");
-			return (ENOTSUP);
-		}
-		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
-	}
-
-	/* Here we should increment tx_tso counter. */
-
-	return (0);
-}
-
-static struct mbuf *
-ptnet_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn,
-		 struct virtio_net_hdr *hdr)
-{
-	int flags, etype, csum_start, proto, error;
-
-	flags = m->m_pkthdr.csum_flags;
-
-	error = ptnet_tx_offload_ctx(m, &etype, &proto, &csum_start);
-	if (error)
-		goto drop;
-
-	if ((etype == ETHERTYPE_IP && flags & PTNET_CSUM_OFFLOAD) ||
-	    (etype == ETHERTYPE_IPV6 && flags & PTNET_CSUM_OFFLOAD_IPV6)) {
-		/*
-		 * We could compare the IP protocol vs the CSUM_ flag too,
-		 * but that really should not be necessary.
-		 */
-		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		hdr->csum_start = csum_start;
-		hdr->csum_offset = m->m_pkthdr.csum_data;
-		/* Here we should increment the tx_csum counter. */
-	}
-
-	if (flags & CSUM_TSO) {
-		if (__predict_false(proto != IPPROTO_TCP)) {
-			/* Likely failed to correctly parse the mbuf.
-			 * Here we should increment the tx_tso_not_tcp
-			 * counter. */
-			goto drop;
-		}
-
-		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
-		    ("%s: mbuf %p TSO without checksum offload %#x",
-		    __func__, m, flags));
-
-		error = ptnet_tx_offload_tso(ifp, m, etype, csum_start,
-					     allow_ecn, hdr);
-		if (error)
-			goto drop;
-	}
-
-	return (m);
-
-drop:
-	m_freem(m);
-	return (NULL);
-}
-
 static void
 ptnet_vlan_tag_remove(struct mbuf *m)
 {
@ -1494,157 +1350,6 @@ ptnet_vlan_tag_remove(struct mbuf *m)
 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
 }

-/*
- * Use the checksum offset in the VirtIO header to set the
- * correct CSUM_* flags.
- */
-static int
-ptnet_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start,
-			struct virtio_net_hdr *hdr)
-{
-#if defined(INET) || defined(INET6)
-	int offset = hdr->csum_start + hdr->csum_offset;
-#endif
-
-	/* Only do a basic sanity check on the offset. */
-	switch (eth_type) {
-#if defined(INET)
-	case ETHERTYPE_IP:
-		if (__predict_false(offset < ip_start + sizeof(struct ip)))
-			return (1);
-		break;
-#endif
-#if defined(INET6)
-	case ETHERTYPE_IPV6:
-		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
-			return (1);
-		break;
-#endif
-	default:
-		/* Here we should increment the rx_csum_bad_ethtype counter. */
-		return (1);
-	}
-
-	/*
-	 * Use the offset to determine the appropriate CSUM_* flags. This is
-	 * a bit dirty, but we can get by with it since the checksum offsets
-	 * happen to be different. We assume the host host does not do IPv4
-	 * header checksum offloading.
-	 */
-	switch (hdr->csum_offset) {
-	case offsetof(struct udphdr, uh_sum):
-	case offsetof(struct tcphdr, th_sum):
-		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
-		m->m_pkthdr.csum_data = 0xFFFF;
-		break;
-	default:
-		/* Here we should increment the rx_csum_bad_offset counter. */
-		return (1);
-	}
-
-	return (0);
-}
-
-static int
-ptnet_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start,
-		       struct virtio_net_hdr *hdr)
-{
-	int offset, proto;
-
-	switch (eth_type) {
-#if defined(INET)
-	case ETHERTYPE_IP: {
-		struct ip *ip;
-		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
-			return (1);
-		ip = (struct ip *)(m->m_data + ip_start);
-		proto = ip->ip_p;
-		offset = ip_start + (ip->ip_hl << 2);
-		break;
-	}
-#endif
-#if defined(INET6)
-	case ETHERTYPE_IPV6:
-		if (__predict_false(m->m_len < ip_start +
-		    sizeof(struct ip6_hdr)))
-			return (1);
-		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
-		if (__predict_false(offset < 0))
-			return (1);
-		break;
-#endif
-	default:
-		/* Here we should increment the rx_csum_bad_ethtype counter. */
-		return (1);
-	}
-
-	switch (proto) {
-	case IPPROTO_TCP:
-		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
-			return (1);
-		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
-		m->m_pkthdr.csum_data = 0xFFFF;
-		break;
-	case IPPROTO_UDP:
-		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
-			return (1);
-		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
-		m->m_pkthdr.csum_data = 0xFFFF;
-		break;
-	default:
-		/*
-		 * For the remaining protocols, FreeBSD does not support
-		 * checksum offloading, so the checksum will be recomputed.
-		 */
-#if 0
-		if_printf(ifp, "cksum offload of unsupported "
-		    "protocol eth_type=%#x proto=%d csum_start=%d "
-		    "csum_offset=%d\n", __func__, eth_type, proto,
-		    hdr->csum_start, hdr->csum_offset);
-#endif
-		break;
-	}
-
-	return (0);
-}
-
-/*
- * Set the appropriate CSUM_* flags. Unfortunately, the information
- * provided is not directly useful to us. The VirtIO header gives the
- * offset of the checksum, which is all Linux needs, but this is not
- * how FreeBSD does things. We are forced to peek inside the packet
- * a bit.
- *
- * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
- * could accept the offsets and let the stack figure it out.
- */
-static int
-ptnet_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
-{
-	struct ether_header *eh;
-	struct ether_vlan_header *evh;
-	uint16_t eth_type;
-	int offset, error;
-
-	eh = mtod(m, struct ether_header *);
-	eth_type = ntohs(eh->ether_type);
-	if (eth_type == ETHERTYPE_VLAN) {
-		/* BMV: We should handle nested VLAN tags too. */
-		evh = mtod(m, struct ether_vlan_header *);
-		eth_type = ntohs(evh->evl_proto);
-		offset = sizeof(struct ether_vlan_header);
-	} else
-		offset = sizeof(struct ether_header);
-
-	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
-		error = ptnet_rx_csum_by_offset(m, eth_type, offset, hdr);
-	else
-		error = ptnet_rx_csum_by_parse(m, eth_type, offset, hdr);
-
-	return (error);
-}
-/* End of offloading-related functions to be shared with vtnet. */
-
 static void
 ptnet_ring_update(struct ptnet_queue *pq, struct netmap_kring *kring,
 		  unsigned int head, unsigned int sync_flags)
@ -1776,7 +1481,7 @@ ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget,
 			 * two 8-bytes-wide writes. */
 			memset(nmbuf, 0, PTNET_HDR_SIZE);
 			if (mhead->m_pkthdr.csum_flags & PTNET_ALL_OFFLOAD) {
-				mhead = ptnet_tx_offload(ifp, mhead, false,
+				mhead = virtio_net_tx_offload(ifp, mhead, false,
 							 vh);
 				if (unlikely(!mhead)) {
 					/* Packet dropped because errors
@ -2154,15 +1859,12 @@ host_sync:
 			}
 		}

-		if (have_vnet_hdr && (vh->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM
-					| VIRTIO_NET_HDR_F_DATA_VALID))) {
-			if (unlikely(ptnet_rx_csum(mhead, vh))) {
+		if (unlikely(have_vnet_hdr && virtio_net_rx_csum(mhead, vh))) {
 			m_freem(mhead);
 			nm_prlim(1, "Csum offload error: dropping");
 			pq->stats.iqdrops ++;
 			deliver = 0;
 		}
-		}

 skip:
 		count ++;
--- a/sys/dev/virtio/network/virtio_net.h
+++ b/sys/dev/virtio/network/virtio_net.h
@ -201,4 +201,297 @@ struct virtio_net_ctrl_mq {
 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN		1
 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX		0x8000

+/*
+ * Use the checksum offset in the VirtIO header to set the
+ * correct CSUM_* flags.
+ */
+static inline int
+virtio_net_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start,
+			struct virtio_net_hdr *hdr)
+{
+#if defined(INET) || defined(INET6)
+	int offset = hdr->csum_start + hdr->csum_offset;
+#endif
+
+	/* Only do a basic sanity check on the offset. */
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP:
+		if (__predict_false(offset < ip_start + sizeof(struct ip)))
+			return (1);
+		break;
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
+			return (1);
+		break;
+#endif
+	default:
+		/* Here we should increment the rx_csum_bad_ethtype counter. */
+		return (1);
+	}
+
+	/*
+	 * Use the offset to determine the appropriate CSUM_* flags. This is
+	 * a bit dirty, but we can get by with it since the checksum offsets
+	 * happen to be different. We assume the host does not do IPv4
+	 * header checksum offloading.
+	 */
+	switch (hdr->csum_offset) {
+	case offsetof(struct udphdr, uh_sum):
+	case offsetof(struct tcphdr, th_sum):
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	default:
+		/* Here we should increment the rx_csum_bad_offset counter. */
+		return (1);
+	}
+
+	return (0);
+}
+
+static inline int
+virtio_net_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start,
+		       struct virtio_net_hdr *hdr)
+{
+	int offset, proto;
+
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip;
+		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
+			return (1);
+		ip = (struct ip *)(m->m_data + ip_start);
+		proto = ip->ip_p;
+		offset = ip_start + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(m->m_len < ip_start +
+		    sizeof(struct ip6_hdr)))
+			return (1);
+		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
+		if (__predict_false(offset < 0))
+			return (1);
+		break;
+#endif
+	default:
+		/* Here we should increment the rx_csum_bad_ethtype counter. */
+		return (1);
+	}
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_UDP:
+		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	default:
+		/*
+		 * For the remaining protocols, FreeBSD does not support
+		 * checksum offloading, so the checksum will be recomputed.
+		 */
+#if 0
+		if_printf(ifp, "cksum offload of unsupported "
+		    "protocol eth_type=%#x proto=%d csum_start=%d "
+		    "csum_offset=%d\n", __func__, eth_type, proto,
+		    hdr->csum_start, hdr->csum_offset);
+#endif
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Set the appropriate CSUM_* flags. Unfortunately, the information
+ * provided is not directly useful to us. The VirtIO header gives the
+ * offset of the checksum, which is all Linux needs, but this is not
+ * how FreeBSD does things. We are forced to peek inside the packet
+ * a bit.
+ *
+ * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
+ * could accept the offsets and let the stack figure it out.
+ */
+static inline int
+virtio_net_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
+{
+	struct ether_header *eh;
+	struct ether_vlan_header *evh;
+	uint16_t eth_type;
+	int offset, error;
+
+	if ((hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
+	    VIRTIO_NET_HDR_F_DATA_VALID)) == 0) {
+		return (0);
+	}
+
+	eh = mtod(m, struct ether_header *);
+	eth_type = ntohs(eh->ether_type);
+	if (eth_type == ETHERTYPE_VLAN) {
+		/* BMV: We should handle nested VLAN tags too. */
+		evh = mtod(m, struct ether_vlan_header *);
+		eth_type = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else
+		offset = sizeof(struct ether_header);
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+		error = virtio_net_rx_csum_by_offset(m, eth_type, offset, hdr);
+	else
+		error = virtio_net_rx_csum_by_parse(m, eth_type, offset, hdr);
+
+	return (error);
+}
+
+static inline int
+virtio_net_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start)
+{
+	struct ether_vlan_header *evh;
+	int offset;
+
+	evh = mtod(m, struct ether_vlan_header *);
+	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+		/* BMV: We should handle nested VLAN tags too. */
+		*etype = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else {
+		*etype = ntohs(evh->evl_encap_proto);
+		offset = sizeof(struct ether_header);
+	}
+
+	switch (*etype) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip, iphdr;
+		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
+			m_copydata(m, offset, sizeof(struct ip),
+			    (caddr_t) &iphdr);
+			ip = &iphdr;
+		} else
+			ip = (struct ip *)(m->m_data + offset);
+		*proto = ip->ip_p;
+		*start = offset + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		*proto = -1;
+		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
+		/* Assert the network stack sent us a valid packet. */
+		KASSERT(*start > offset,
+		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
+		    *start, offset, *proto));
+		break;
+#endif
+	default:
+		/* Here we should increment the tx_csum_bad_ethtype counter. */
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+static inline int
+virtio_net_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type,
+		     int offset, bool allow_ecn, struct virtio_net_hdr *hdr)
+{
+	static struct timeval lastecn;
+	static int curecn;
+	struct tcphdr *tcp, tcphdr;
+
+	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
+		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
+		tcp = &tcphdr;
+	} else
+		tcp = (struct tcphdr *)(m->m_data + offset);
+
+	hdr->hdr_len = offset + (tcp->th_off << 2);
+	hdr->gso_size = m->m_pkthdr.tso_segsz;
+	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
+	    VIRTIO_NET_HDR_GSO_TCPV6;
+
+	if (tcp->th_flags & TH_CWR) {
+		/*
+		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
+		 * ECN support is not on a per-interface basis, but globally via
+		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
+		 */
+		if (!allow_ecn) {
+			if (ppsratecheck(&lastecn, &curecn, 1))
+				if_printf(ifp,
+				    "TSO with ECN not negotiated with host\n");
+			return (ENOTSUP);
+		}
+		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+	}
+
+	/* Here we should increment tx_tso counter. */
+
+	return (0);
+}
+
+static inline struct mbuf *
+virtio_net_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn,
+		 struct virtio_net_hdr *hdr)
+{
+	int flags, etype, csum_start, proto, error;
+
+	flags = m->m_pkthdr.csum_flags;
+
+	error = virtio_net_tx_offload_ctx(m, &etype, &proto, &csum_start);
+	if (error)
+		goto drop;
+
+	if ((etype == ETHERTYPE_IP && (flags & (CSUM_TCP | CSUM_UDP))) ||
+	    (etype == ETHERTYPE_IPV6 &&
+	        (flags & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6)))) {
+		/*
+		 * We could compare the IP protocol vs the CSUM_ flag too,
+		 * but that really should not be necessary.
+		 */
+		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		hdr->csum_start = csum_start;
+		hdr->csum_offset = m->m_pkthdr.csum_data;
+		/* Here we should increment the tx_csum counter. */
+	}
+
+	if (flags & CSUM_TSO) {
+		if (__predict_false(proto != IPPROTO_TCP)) {
+			/* Likely failed to correctly parse the mbuf.
+			 * Here we should increment the tx_tso_not_tcp
+			 * counter. */
+			goto drop;
+		}
+
+		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
+		    ("%s: mbuf %p TSO without checksum offload %#x",
+		    __func__, m, flags));
+
+		error = virtio_net_tx_offload_tso(ifp, m, etype, csum_start,
+					     allow_ecn, hdr);
+		if (error)
+			goto drop;
+	}
+
+	return (m);
+
+drop:
+	m_freem(m);
+	return (NULL);
+}
+
 #endif /* _VIRTIO_NET_H */
--- a/sys/net/if_tap.h
+++ b/sys/net/if_tap.h
@ -43,7 +43,7 @@
 #include <net/if_tun.h>

 /* maximum receive packet size (hard limit) */
-#define	TAPMRU		16384
+#define	TAPMRU		65535

 #define	tapinfo		tuninfo

@ -56,6 +56,8 @@
 #define	TAPSIFINFO		TUNSIFINFO
 #define	TAPGIFINFO		TUNGIFINFO
 #define	TAPGIFNAME		TUNGIFNAME
+#define	TAPSVNETHDR		_IOW('t', 91, int)
+#define	TAPGVNETHDR		_IOR('t', 94, int)

 /* VMware ioctl's */
 #define VMIO_SIOCSIFFLAGS	_IOWINT('V', 0)
--- a/sys/net/if_tuntap.c
+++ b/sys/net/if_tuntap.c
@ -84,16 +84,24 @@
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
+#include <net/if_vlan_var.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #ifdef INET
 #include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
 #endif
 #include <net/bpf.h>
 #include <net/if_tap.h>
 #include <net/if_tun.h>

+#include <dev/virtio/network/virtio_net.h>
+
 #include <sys/queue.h>
 #include <sys/condvar.h>
 #include <security/mac/mac_framework.h>
@ -134,6 +142,7 @@ struct tuntap_softc {
 	struct cv		 tun_cv;	/* for ref'd dev destroy */
 	struct ether_addr	 tun_ether;	/* remote address */
 	int			 tun_busy;	/* busy count */
+	int			 tun_vhdrlen;	/* virtio-net header length */
 };
 #define	TUN2IFP(sc)	((sc)->tun_ifp)

@ -145,6 +154,19 @@ struct tuntap_softc {

 #define	TUN_VMIO_FLAG_MASK	0x0fff

+/*
+ * Interface capabilities of a tap device that supports the virtio-net
+ * header.
+ */
+#define TAP_VNET_HDR_CAPS	(IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6	\
+				| IFCAP_VLAN_HWCSUM			\
+				| IFCAP_TSO | IFCAP_LRO			\
+				| IFCAP_VLAN_HWTSO)
+
+#define TAP_ALL_OFFLOAD		(CSUM_TSO | CSUM_TCP | CSUM_UDP |\
+				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6)
+
+
 /*
 * All mutable global variables in if_tun are locked using tunmtx, with
 * the exception of tundebug, which is used unlocked, and the drivers' *clones,
@ -211,6 +233,7 @@ static int	tap_clone_match(struct if_clone *ifc, const char *name);
 static int	vmnet_clone_match(struct if_clone *ifc, const char *name);
 static int	tun_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static int	tun_clone_destroy(struct if_clone *, struct ifnet *);
+static void	tun_vnethdr_set(struct ifnet *ifp, int vhdrlen);

 static d_open_t		tunopen;
 static d_close_t	tunclose;
@ -1140,6 +1163,7 @@ out:
 	TUNDEBUG (ifp, "closed\n");
 	tp->tun_flags &= ~TUN_OPEN;
 	tp->tun_pid = 0;
+	tun_vnethdr_set(ifp, 0);

 	tun_unbusy_locked(tp);
 	TUN_UNLOCK(tp);
@ -1201,6 +1225,65 @@ tunifinit(void *xtp)
 	tuninit(tp->tun_ifp);
 }

+/*
+ * To be called under TUN_LOCK. Update ifp->if_hwassist according to the
+ * current value of ifp->if_capenable.
+ */
+static void
+tun_caps_changed(struct ifnet *ifp)
+{
+	uint64_t hwassist = 0;
+
+	TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc);
+	if (ifp->if_capenable & IFCAP_TXCSUM)
+		hwassist |= CSUM_TCP | CSUM_UDP;
+	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
+		hwassist |= CSUM_TCP_IPV6
+		    | CSUM_UDP_IPV6;
+	if (ifp->if_capenable & IFCAP_TSO4)
+		hwassist |= CSUM_IP_TSO;
+	if (ifp->if_capenable & IFCAP_TSO6)
+		hwassist |= CSUM_IP6_TSO;
+	ifp->if_hwassist = hwassist;
+}
+
+/*
+ * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust
+ * if_capabilities and if_capenable as needed.
+ */
+static void
+tun_vnethdr_set(struct ifnet *ifp, int vhdrlen)
+{
+	struct tuntap_softc *tp = ifp->if_softc;
+
+	TUN_LOCK_ASSERT(tp);
+
+	if (tp->tun_vhdrlen == vhdrlen)
+		return;
+
+	/*
+	 * Update if_capabilities to reflect the
+	 * functionalities offered by the virtio-net
+	 * header.
+	 */
+	if (vhdrlen != 0)
+		ifp->if_capabilities |=
+			TAP_VNET_HDR_CAPS;
+	else
+		ifp->if_capabilities &=
+			~TAP_VNET_HDR_CAPS;
+	/*
+	 * Disable any capabilities that we don't
+	 * support anymore.
+	 */
+	ifp->if_capenable &= ifp->if_capabilities;
+	tun_caps_changed(ifp);
+	tp->tun_vhdrlen = vhdrlen;
+
+	TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n",
+	    vhdrlen, ifp->if_capabilities);
+}
+
 /*
 * Process an ioctl request.
 */
@ -1268,6 +1351,13 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 			error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
 		}
 		break;
+	case SIOCSIFCAP:
+		TUN_LOCK(tp);
+		ifp->if_capenable = ifr->ifr_reqcap;
+		tun_caps_changed(ifp);
+		TUN_UNLOCK(tp);
+		VLAN_CAPABILITIES(ifp);
+		break;
 	default:
 		if (l2tun) {
 			error = ether_ioctl(ifp, cmd, data);
@ -1378,12 +1468,9 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
 {
 	struct ifreq ifr, *ifrp;
 	struct tuntap_softc *tp = dev->si_drv1;
+	struct ifnet *ifp = TUN2IFP(tp);
 	struct tuninfo *tunp;
-	int error, iflags;
-#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD4)
-	int	ival;
-#endif
+	int error, iflags, ival;
 	bool	l2tun;

 	l2tun = (tp->tun_flags & TUN_L2) != 0;
@ -1405,8 +1492,8 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
 			iflags |= IFF_UP;

 			TUN_LOCK(tp);
-			TUN2IFP(tp)->if_flags = iflags |
-			    (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE);
+			ifp->if_flags = iflags |
+			    (ifp->if_flags & IFF_CANTCHANGE);
 			TUN_UNLOCK(tp);

 			return (0);
@ -1423,6 +1510,24 @@ tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);

+			return (0);
+		case TAPSVNETHDR:
+			ival = *(int *)data;
+			if (ival != 0 &&
+			    ival != sizeof(struct virtio_net_hdr) &&
+			    ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
+				return (EINVAL);
+			}
+			TUN_LOCK(tp);
+			tun_vnethdr_set(ifp, ival);
+			TUN_UNLOCK(tp);
+
+			return (0);
+		case TAPGVNETHDR:
+			TUN_LOCK(tp);
+			*(int *)data = tp->tun_vhdrlen;
+			TUN_UNLOCK(tp);
+
 			return (0);
 		}

@ -1578,7 +1683,8 @@ tunread(struct cdev *dev, struct uio *uio, int flag)
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(tp);
 	struct mbuf	*m;
-	int		error=0, len;
+	size_t		len;
+	int		error = 0;

 	TUNDEBUG (ifp, "read\n");
 	TUN_LOCK(tp);
@ -1611,6 +1717,23 @@ tunread(struct cdev *dev, struct uio *uio, int flag)
 	if ((tp->tun_flags & TUN_L2) != 0)
 		BPF_MTAP(ifp, m);

+	len = min(tp->tun_vhdrlen, uio->uio_resid);
+	if (len > 0) {
+		struct virtio_net_hdr_mrg_rxbuf vhdr;
+
+		bzero(&vhdr, sizeof(vhdr));
+		if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) {
+			m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr);
+		}
+
+		TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
+		    "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
+		    vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
+		    vhdr.hdr.gso_size, vhdr.hdr.csum_start,
+		    vhdr.hdr.csum_offset);
+		error = uiomove(&vhdr, len, uio);
+	}
+
 	while (m && uio->uio_resid > 0 && error == 0) {
 		len = min(uio->uio_resid, m->m_len);
 		if (len != 0)
@ -1626,7 +1749,8 @@ tunread(struct cdev *dev, struct uio *uio, int flag)
 }

 static int
-tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m)
+tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m,
+	    struct virtio_net_hdr_mrg_rxbuf *vhdr)
 {
 	struct ether_header *eh;
 	struct ifnet *ifp;
@ -1651,6 +1775,11 @@ tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m)
 		return (0);
 	}

+	if (vhdr != NULL && virtio_net_rx_csum(m, &vhdr->hdr)) {
+		m_freem(m);
+		return (0);
+	}
+
 	/* Pass packet up to parent. */
 	CURVNET_SET(ifp->if_vnet);
 	(*ifp->if_input)(ifp, m);
@ -1717,11 +1846,12 @@ tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
 static	int
 tunwrite(struct cdev *dev, struct uio *uio, int flag)
 {
+	struct virtio_net_hdr_mrg_rxbuf vhdr;
 	struct tuntap_softc *tp;
 	struct ifnet	*ifp;
 	struct mbuf	*m;
 	uint32_t	mru;
-	int		align;
+	int		align, vhdrlen, error;
 	bool		l2tun;

 	tp = dev->si_drv1;
@ -1735,17 +1865,30 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag)
 		return (0);

 	l2tun = (tp->tun_flags & TUN_L2) != 0;
-	align = 0;
 	mru = l2tun ? TAPMRU : TUNMRU;
-	if (l2tun)
+	vhdrlen = tp->tun_vhdrlen;
+	align = 0;
+	if (l2tun) {
 		align = ETHER_ALIGN;
-	else if ((tp->tun_flags & TUN_IFHEAD) != 0)
+		mru += vhdrlen;
+	} else if ((tp->tun_flags & TUN_IFHEAD) != 0)
 		mru += sizeof(uint32_t);	/* family */
 	if (uio->uio_resid < 0 || uio->uio_resid > mru) {
 		TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
 		return (EIO);
 	}

+	if (vhdrlen > 0) {
+		error = uiomove(&vhdr, vhdrlen, uio);
+		if (error != 0)
+			return (error);
+		TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
+		    "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
+		    vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
+		    vhdr.hdr.gso_size, vhdr.hdr.csum_start,
+		    vhdr.hdr.csum_offset);
+	}
+
 	if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		return (ENOBUFS);
@ -1757,7 +1900,7 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag)
 #endif

 	if (l2tun)
-		return (tunwrite_l2(tp, m));
+		return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL));

 	return (tunwrite_l3(tp, m));
 }