bhyve: add support for virtio-net mergeable rx buffers

Mergeable rx buffers is a virtio-net feature that allows the hypervisor
to use multiple RX descriptor chains to receive a single packet.
Without this feature, a TSO-enabled guest is compelled to publish only
chains that are 64K (or 32K) long, and each of these large buffers is
consumed to receive a single packet, even a very short one. This wastes
memory, as an RX queue has room for 256 chains, which means up to 16MB
of buffer memory for each (single-queue) vtnet device.
With the feature on, the guest can publish 2K-long chains, and the
hypervisor merges them as needed.

This change also enables the feature in the netmap backend, which
supports virtio-net offloads. We plan to add support for the
tap backend too.
Note that, unlike QEMU/KVM, we implement one-copy receive here, whereas
QEMU uses two copies.

Reviewed by:    jhb
MFC after:      3 weeks
Differential Revision:	https://reviews.freebsd.org/D21007
Author:   Vincenzo Maffione
Date:     2019-11-08 17:57:03 +00:00
Commit:   d55e0373f1 (parent 1c37b63fb6)
Notes:    svn2git 2020-12-20 02:59:44 +00:00; svn path=/head/; revision=354552
Diffstat: 5 changed files with 143 additions and 64 deletions

File: usr.sbin/bhyve/net_backends.c

@@ -328,7 +328,8 @@ DATA_SET(net_backend_set, vmnet_backend);
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
VIRTIO_NET_F_MRG_RXBUF)
struct netmap_priv {
char ifname[IFNAMSIZ];

File: usr.sbin/bhyve/pci_virtio_console.c

@@ -423,7 +423,7 @@ pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg)
len = readv(sock->vss_conn_fd, &iov, n);
if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) {
vq_retchain(vq);
vq_retchains(vq, 1);
vq_endchains(vq, 0);
if (len == 0)
goto close;

File: usr.sbin/bhyve/pci_virtio_net.c

@@ -58,11 +58,14 @@ __FBSDID("$FreeBSD$");
#include "virtio.h"
#include "net_utils.h"
#include "net_backends.h"
#include "iov.h"
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
#define VTNET_MAX_PKT_LEN (65536 + 64)
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
@@ -170,44 +173,79 @@ pci_vtnet_reset(void *vsc)
pthread_mutex_unlock(&sc->rx_mtx);
}
struct virtio_mrg_rxbuf_info {
uint16_t idx;
uint16_t pad;
uint32_t len;
};
static void
pci_vtnet_rx(struct pci_vtnet_softc *sc)
{
struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
int len, n;
uint16_t idx;
uint32_t cur_iov_bytes;
struct iovec *cur_iov;
uint16_t cur_iov_len;
uint32_t ulen;
int n_chains;
int len;
vq = &sc->vsc_queues[VTNET_RXQ];
for (;;) {
/*
* Check for available rx buffers.
* Get a descriptor chain to store the next ingress
* packet. In case of mergeable rx buffers, get as
* many chains as necessary in order to make room
* for a maximum sized LRO packet.
*/
if (!vq_has_descs(vq)) {
/* No rx buffers. Enable RX kicks and double check. */
vq_kick_enable(vq);
if (!vq_has_descs(vq)) {
cur_iov_bytes = 0;
cur_iov_len = 0;
cur_iov = iov;
n_chains = 0;
do {
int n = vq_getchain(vq, &info[n_chains].idx, cur_iov,
VTNET_MAXSEGS - cur_iov_len, NULL);
if (n == 0) {
/*
* Still no buffers. Interrupt if needed
* (including for NOTIFY_ON_EMPTY), and
* disable the backend until the next kick.
* No rx buffers. Enable RX kicks and double
* check.
*/
vq_endchains(vq, /*used_all_avail=*/1);
netbe_rx_disable(sc->vsc_be);
return;
vq_kick_enable(vq);
if (!vq_has_descs(vq)) {
/*
* Still no buffers. Return the unused
* chains (if any), interrupt if needed
* (including for NOTIFY_ON_EMPTY), and
* disable the backend until the next
* kick.
*/
vq_retchains(vq, n_chains);
vq_endchains(vq, /*used_all_avail=*/1);
netbe_rx_disable(sc->vsc_be);
return;
}
/* More rx buffers found, so keep going. */
vq_kick_disable(vq);
continue;
}
assert(n >= 1 && cur_iov_len + n <= VTNET_MAXSEGS);
cur_iov_len += n;
if (!sc->rx_merge) {
n_chains = 1;
break;
}
info[n_chains].len = (uint32_t)count_iov(cur_iov, n);
cur_iov_bytes += info[n_chains].len;
cur_iov += n;
n_chains++;
} while (cur_iov_bytes < VTNET_MAX_PKT_LEN &&
cur_iov_len < VTNET_MAXSEGS);
/* More rx buffers found, so keep going. */
vq_kick_disable(vq);
}
/*
* Get descriptor chain.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
len = netbe_recv(sc->vsc_be, iov, n);
len = netbe_recv(sc->vsc_be, iov, cur_iov_len);
if (len <= 0) {
/*
@@ -215,14 +253,39 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
* (err < 0). Return unused available buffers
* and stop.
*/
vq_retchain(vq);
vq_retchains(vq, n_chains);
/* Interrupt if needed/appropriate and stop. */
vq_endchains(vq, /*used_all_avail=*/0);
return;
}
/* Publish the info to the guest */
vq_relchain(vq, idx, (uint32_t)len);
ulen = (uint32_t)len; /* avoid too many casts below */
/* Publish the used buffers to the guest. */
if (!sc->rx_merge) {
vq_relchain(vq, info[0].idx, ulen);
} else {
struct virtio_net_rxhdr *hdr = iov[0].iov_base;
uint32_t iolen;
int i = 0;
assert(iov[0].iov_len >= sizeof(*hdr));
do {
iolen = info[i].len;
if (iolen > ulen) {
iolen = ulen;
}
vq_relchain_prepare(vq, info[i].idx, iolen);
ulen -= iolen;
i++;
assert(i <= n_chains);
} while (ulen > 0);
hdr->vrh_bufs = i;
vq_relchain_publish(vq);
vq_retchains(vq, n_chains - i);
}
}
}
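
The receive loop above sizes each collected chain with count_iov() from
iov.h. A minimal sketch of the assumed semantics (the total number of
bytes covered by the first n iovec entries):

#include <stddef.h>
#include <sys/uio.h>

static size_t
count_iov(const struct iovec *iov, int n)
{
	size_t total = 0;
	int i;

	for (i = 0; i < n; i++)
		total += iov[i].iov_len;
	return (total);
}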

File: usr.sbin/bhyve/virtio.c

@@ -102,6 +102,7 @@ vi_reset_dev(struct virtio_softc *vs)
for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
vq->vq_flags = 0;
vq->vq_last_avail = 0;
vq->vq_next_used = 0;
vq->vq_save_used = 0;
vq->vq_pfn = 0;
vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
@@ -199,6 +200,7 @@ vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
/* Mark queue as allocated, and start at 0 when we use it. */
vq->vq_flags = VQ_ALLOC;
vq->vq_last_avail = 0;
vq->vq_next_used = 0;
vq->vq_save_used = 0;
}
@@ -279,7 +281,7 @@ vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
* the guest has written are valid (including all their
* vd_next fields and vd_flags).
*
* Compute (last_avail - va_idx) in integers mod 2**16. This is
* Compute (va_idx - last_avail) in integers mod 2**16. This is
* the number of descriptors the device has made available
* since the last time we updated vq->vq_last_avail.
*
@@ -382,16 +384,52 @@ vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
}
/*
* Return the currently-first request chain back to the available queue.
* Return the first n_chain request chains back to the available queue.
*
* (This chain is the one you handled when you called vq_getchain()
* (These chains are the ones you handled when you called vq_getchain()
* and used its positive return value.)
*/
void
vq_retchain(struct vqueue_info *vq)
vq_retchains(struct vqueue_info *vq, uint16_t n_chains)
{
vq->vq_last_avail--;
vq->vq_last_avail -= n_chains;
}
void
vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
{
volatile struct vring_used *vuh;
volatile struct virtio_used *vue;
uint16_t mask;
/*
* Notes:
* - mask is N-1 where N is a power of 2 so computes x % N
* - vuh points to the "used" data shared with guest
* - vue points to the "used" ring entry we want to update
*
* (I apologize for the two fields named vu_idx; the
* virtio spec calls the one that vue points to, "id"...)
*/
mask = vq->vq_qsize - 1;
vuh = vq->vq_used;
vue = &vuh->vu_ring[vq->vq_next_used++ & mask];
vue->vu_idx = idx;
vue->vu_tlen = iolen;
}
void
vq_relchain_publish(struct vqueue_info *vq)
{
/*
* Ensure the used descriptor is visible before updating the index.
* This is necessary on ISAs with memory ordering less strict than x86
* (and even on x86 to act as a compiler barrier).
*/
atomic_thread_fence_rel();
vq->vq_used->vu_idx = vq->vq_next_used;
}
/*
@@ -404,35 +442,8 @@ vq_retchain(vq)
void
vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
{
uint16_t uidx, mask;
volatile struct vring_used *vuh;
volatile struct virtio_used *vue;
/*
* Notes:
* - mask is N-1 where N is a power of 2 so computes x % N
* - vuh points to the "used" data shared with guest
* - vue points to the "used" ring entry we want to update
* - head is the same value we compute in vq_iovecs().
*
* (I apologize for the two fields named vu_idx; the
* virtio spec calls the one that vue points to, "id"...)
*/
mask = vq->vq_qsize - 1;
vuh = vq->vq_used;
uidx = vuh->vu_idx;
vue = &vuh->vu_ring[uidx++ & mask];
vue->vu_idx = idx;
vue->vu_tlen = iolen;
/*
* Ensure the used descriptor is visible before updating the index.
* This is necessary on ISAs with memory ordering less strict than x86
* (and even on x86 to act as a compiler barrier).
*/
atomic_thread_fence_rel();
vuh->vu_idx = uidx;
vq_relchain_prepare(vq, idx, iolen);
vq_relchain_publish(vq);
}
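
Splitting vq_relchain() into vq_relchain_prepare() and
vq_relchain_publish() lets a device model queue several used-ring
entries and expose them to the guest in one step, which is exactly what
the mergeable-rx path needs: the guest must never see a partially
returned multi-chain packet. An illustrative caller (a sketch mirroring
pci_vtnet_rx() above; struct virtio_mrg_rxbuf_info comes from this
patch, and the real caller caps the last length at the bytes actually
received):

static void
publish_merged_packet(struct vqueue_info *vq,
    const struct virtio_mrg_rxbuf_info *info, int n_used)
{
	int i;

	for (i = 0; i < n_used; i++)
		vq_relchain_prepare(vq, info[i].idx, info[i].len);
	/* One release fence and one used-index update for the whole packet. */
	vq_relchain_publish(vq);
}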
/*

File: usr.sbin/bhyve/virtio.h

@@ -392,6 +392,7 @@ struct vqueue_info {
uint16_t vq_flags; /* flags (see above) */
uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */
uint16_t vq_next_used; /* index of the next used slot to be filled */
uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */
uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
@@ -479,7 +480,10 @@ void vi_set_io_bar(struct virtio_softc *, int);
int vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
struct iovec *iov, int n_iov, uint16_t *flags);
void vq_retchain(struct vqueue_info *vq);
void vq_retchains(struct vqueue_info *vq, uint16_t n_chains);
void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx,
uint32_t iolen);
void vq_relchain_publish(struct vqueue_info *vq);
void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
void vq_endchains(struct vqueue_info *vq, int used_all_avail);