bhyve: enable virtio-net mergeable rx buffers for tap(4)

This patch adds a new netbe_peek_recvlen() function to the net
backend API. The new function lets the virtio-net receive code
know in advance how many virtio descriptor chains will be needed
to receive the next packet. As a result, the virtio-net mergeable
rx buffers feature can be implemented efficiently, so it is now
enabled for the tap(4) backend as well. For the tap(4) backend, a
bounce buffer is introduced to implement the peek_recvlen()
callback, which implies an additional packet copy on the receive
datapath. In the future, it should be possible to remove the
bounce buffer (and thus the extra copy) by obtaining the length
of the next packet from the kevent data.

Reviewed by:    grehan, aleksandr.fedorov@itglobal.com
MFC after:      1 week
Differential Revision:	https://reviews.freebsd.org/D23472
Author:  Vincenzo Maffione
Date:    2020-02-20 21:07:23 +00:00
Parent:  55cd93249b
Commit:  f92bb8c19a
Notes:   svn2git 2020-12-20 02:59:44 +00:00
         svn path=/head/; revision=358180
5 changed files with 138 additions and 38 deletions
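
For orientation, here is a minimal sketch (not part of this commit) of the receive pattern the new callback enables: ask the backend for the length of the pending packet first, then size the receive buffers and pull the packet in. The rx_sketch() function and its single local buffer are invented for illustration; the real logic lives in pci_vtnet_rx() in the last file of this diff.

/*
 * Illustrative sketch only: peek before receiving. Names other than
 * netbe_peek_recvlen()/netbe_recv() are made up for the example.
 */
#include <sys/types.h>
#include <sys/uio.h>

#include "net_backends.h"

static void
rx_sketch(net_backend_t *be)
{
	static char rxbuf[65536];	/* stand-in for guest descriptor chains */
	struct iovec iov = { .iov_base = rxbuf, .iov_len = sizeof(rxbuf) };

	for (;;) {
		ssize_t plen, rlen;

		plen = netbe_peek_recvlen(be);
		if (plen <= 0)
			break;		/* nothing pending, or backend error */

		/*
		 * A real frontend would now gather just enough virtio
		 * descriptor chains to hold 'plen' bytes; here we simply
		 * read the packet into one large local buffer.
		 */
		rlen = netbe_recv(be, &iov, 1);
		if (rlen <= 0)
			break;
		/* ... consume 'rlen' bytes from rxbuf ... */
	}
}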

usr.sbin/bhyve/iov.c

@@ -119,24 +119,25 @@ iov_to_buf(const struct iovec *iov, int niov, void **buf)
}
ssize_t
buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
size_t seek)
{
struct iovec *diov;
int ndiov, i;
size_t off = 0, len;
int i;
if (seek > 0) {
int ndiov;
diov = malloc(sizeof(struct iovec) * niov);
seek_iov(iov, niov, diov, &ndiov, seek);
} else {
diov = iov;
ndiov = niov;
iov = diov;
niov = ndiov;
}
for (i = 0; i < ndiov && off < buflen; i++) {
len = MIN(diov[i].iov_len, buflen - off);
memcpy(diov[i].iov_base, buf + off, len);
for (i = 0; i < niov && off < buflen; i++) {
len = MIN(iov[i].iov_len, buflen - off);
memcpy(iov[i].iov_base, buf + off, len);
off += len;
}
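
As a quick illustration of the revised, const-correct helper: the hypothetical snippet below scatters a contiguous buffer across a two-segment iovec, which is exactly what the tap backend does further down when it drains its bounce buffer. All names here are invented for the example.

/* Hypothetical usage of buf_to_iov(); not part of the patch. */
#include <sys/types.h>
#include <sys/uio.h>
#include <string.h>

#include "iov.h"

static void
scatter_example(void)
{
	char pkt[64];			/* contiguous source, e.g. a bounce buffer */
	char seg0[16], seg1[48];	/* two receive segments */
	struct iovec iov[2] = {
		{ .iov_base = seg0, .iov_len = sizeof(seg0) },
		{ .iov_base = seg1, .iov_len = sizeof(seg1) },
	};
	ssize_t copied;

	memset(pkt, 0xab, sizeof(pkt));
	/* Copy all of pkt into iov[], starting at offset 0 within the iovec. */
	copied = buf_to_iov(pkt, sizeof(pkt), iov, 2, 0);
	(void)copied;			/* 64 bytes copied: 16 into seg0, 48 into seg1 */
}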

usr.sbin/bhyve/iov.h

@@ -38,7 +38,7 @@ void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2,
void truncate_iov(struct iovec *iov, int *niov, size_t length);
size_t count_iov(const struct iovec *iov, int niov);
ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
size_t seek);
ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
int niov, size_t seek);
#endif /* _IOV_H_ */

usr.sbin/bhyve/net_backends.c

@@ -102,6 +102,13 @@ struct net_backend {
ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
int iovcnt);
/*
* Get the length of the next packet that can be received from
* the backend. If no packets are currently available, this
* function returns 0.
*/
ssize_t (*peek_recvlen)(struct net_backend *be);
/*
* Called to receive a packet from the backend. When the function
* returns a positive value 'len', the scatter-gather vector
@@ -167,6 +174,13 @@ SET_DECLARE(net_backend_set, struct net_backend);
struct tap_priv {
struct mevent *mevp;
/*
* A bounce buffer that allows us to implement the peek_recvlen
* callback. In the future we may get the same information from
* the kevent data.
*/
char bbuf[1 << 16];
ssize_t bbuflen;
};
static void
@@ -223,6 +237,9 @@ tap_init(struct net_backend *be, const char *devname,
errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif
memset(priv->bbuf, 0, sizeof(priv->bbuf));
priv->bbuflen = 0;
priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
if (priv->mevp == NULL) {
WPRINTF(("Could not register event"));
@@ -246,15 +263,56 @@ tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
}
static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
tap_peek_recvlen(struct net_backend *be)
{
struct tap_priv *priv = (struct tap_priv *)be->opaque;
ssize_t ret;
/* Should never be called without a valid tap fd */
assert(be->fd != -1);
if (priv->bbuflen > 0) {
/*
* We already have a packet in the bounce buffer.
* Just return its length.
*/
return priv->bbuflen;
}
/*
* Read the next packet (if any) into the bounce buffer, so
* that we get to know its length and we can return that
* to the caller.
*/
ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
if (ret < 0 && errno == EWOULDBLOCK) {
return (0);
}
if (ret > 0)
priv->bbuflen = ret;
return (ret);
}
static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
struct tap_priv *priv = (struct tap_priv *)be->opaque;
ssize_t ret;
if (priv->bbuflen > 0) {
/*
* A packet is available in the bounce buffer, so
* we read it from there.
*/
ret = buf_to_iov(priv->bbuf, priv->bbuflen,
iov, iovcnt, 0);
/* Mark the bounce buffer as empty. */
priv->bbuflen = 0;
return (ret);
}
ret = readv(be->fd, iov, iovcnt);
if (ret < 0 && errno == EWOULDBLOCK) {
return (0);
}
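
The commit message hints that the bounce buffer (and its extra copy) could eventually be dropped by taking the pending packet length from kevent data. A rough, untested sketch of what that could look like follows; it assumes, without verification, that EVFILT_READ on a tap(4) descriptor reports the next packet's length in kevent.data, and the tap_peek_recvlen_kq() helper plus its standalone kqueue handling are purely hypothetical (bhyve's event loop is actually managed by mevent).

/*
 * Hypothetical sketch only: a peek_recvlen variant without a bounce
 * buffer. Assumes the read filter for the tap fd is already registered
 * on 'kq' and that kevent.data carries the next packet's length.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static ssize_t
tap_peek_recvlen_kq(int kq, int tapfd)
{
	struct kevent kev;
	const struct timespec zero = { 0, 0 };
	int n;

	/* Non-blocking poll: is a packet queued on the tap fd? */
	n = kevent(kq, NULL, 0, &kev, 1, &zero);
	if (n <= 0 || (int)kev.ident != tapfd || kev.filter != EVFILT_READ)
		return (0);
	return ((ssize_t)kev.data);
}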
@@ -299,6 +357,7 @@ static struct net_backend tap_backend = {
.init = tap_init,
.cleanup = tap_cleanup,
.send = tap_send,
.peek_recvlen = tap_peek_recvlen,
.recv = tap_recv,
.recv_enable = tap_recv_enable,
.recv_disable = tap_recv_disable,
@@ -313,6 +372,7 @@ static struct net_backend vmnet_backend = {
.init = tap_init,
.cleanup = tap_cleanup,
.send = tap_send,
.peek_recvlen = tap_peek_recvlen,
.recv = tap_recv,
.recv_enable = tap_recv_enable,
.recv_disable = tap_recv_disable,
@@ -331,8 +391,7 @@ DATA_SET(net_backend_set, vmnet_backend);
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
VIRTIO_NET_F_MRG_RXBUF)
VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
struct netmap_priv {
char ifname[IFNAMSIZ];
@@ -539,6 +598,26 @@ netmap_send(struct net_backend *be, const struct iovec *iov,
return (totlen);
}
static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
struct netmap_ring *ring = priv->rx;
uint32_t head = ring->head;
ssize_t totlen = 0;
while (head != ring->tail) {
struct netmap_slot *slot = ring->slot + head;
totlen += slot->len;
if ((slot->flags & NS_MOREFRAG) == 0)
break;
head = nm_ring_next(ring, head);
}
return (totlen);
}
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
@@ -628,6 +707,7 @@ static struct net_backend netmap_backend = {
.init = netmap_init,
.cleanup = netmap_cleanup,
.send = netmap_send,
.peek_recvlen = netmap_peek_recvlen,
.recv = netmap_recv,
.recv_enable = netmap_recv_enable,
.recv_disable = netmap_recv_disable,
@@ -642,6 +722,7 @@ static struct net_backend vale_backend = {
.init = netmap_init,
.cleanup = netmap_cleanup,
.send = netmap_send,
.peek_recvlen = netmap_peek_recvlen,
.recv = netmap_recv,
.recv_enable = netmap_recv_enable,
.recv_disable = netmap_recv_disable,
@@ -758,6 +839,13 @@ netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
return (be->send(be, iov, iovcnt));
}
ssize_t
netbe_peek_recvlen(struct net_backend *be)
{
return (be->peek_recvlen(be));
}
/*
* Try to read a packet from the backend, without blocking.
* If no packets are available, return 0. In case of success, return

usr.sbin/bhyve/net_backends.h

@@ -45,6 +45,7 @@ int netbe_set_cap(net_backend_t *be, uint64_t cap,
unsigned vnet_hdr_len);
size_t netbe_get_vnet_hdr_len(net_backend_t *be);
ssize_t netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
ssize_t netbe_peek_recvlen(net_backend_t *be);
ssize_t netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
ssize_t netbe_rx_discard(net_backend_t *be);
void netbe_rx_disable(net_backend_t *be);

usr.sbin/bhyve/pci_virtio_net.c

@@ -228,22 +228,34 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
uint32_t riov_bytes;
struct iovec *riov;
int riov_len;
uint32_t ulen;
int n_chains;
int len;
vq = &sc->vsc_queues[VTNET_RXQ];
for (;;) {
struct virtio_net_rxhdr *hdr;
uint32_t riov_bytes;
struct iovec *riov;
uint32_t ulen;
int riov_len;
int n_chains;
ssize_t rlen;
ssize_t plen;
plen = netbe_peek_recvlen(sc->vsc_be);
if (plen <= 0) {
/*
* No more packets (plen == 0), or backend errored
* (plen < 0). Interrupt if needed and stop.
*/
vq_endchains(vq, /*used_all_avail=*/0);
return;
}
plen += prepend_hdr_len;
/*
* Get a descriptor chain to store the next ingress
* packet. In case of mergeable rx buffers, get as
* many chains as necessary in order to make room
* for a maximum sized LRO packet.
* for plen bytes.
*/
riov_bytes = 0;
riov_len = 0;
@@ -287,8 +299,7 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
riov_bytes += info[n_chains].len;
riov += n;
n_chains++;
} while (riov_bytes < VTNET_MAX_PKT_LEN &&
riov_len < VTNET_MAXSEGS);
} while (riov_bytes < plen && riov_len < VTNET_MAXSEGS);
riov = iov;
hdr = riov[0].iov_base;
@@ -312,21 +323,20 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
memset(hdr, 0, prepend_hdr_len);
}
len = netbe_recv(sc->vsc_be, riov, riov_len);
if (len <= 0) {
rlen = netbe_recv(sc->vsc_be, riov, riov_len);
if (rlen != plen - prepend_hdr_len) {
/*
* No more packets (len == 0), or backend errored
* (err < 0). Return unused available buffers
* and stop.
* If this happens it means there is something
* wrong with the backend (e.g., some other
* process is stealing our packets).
*/
WPRINTF(("netbe_recv: expected %zd bytes, "
"got %zd", plen - prepend_hdr_len, rlen));
vq_retchains(vq, n_chains);
/* Interrupt if needed/appropriate and stop. */
vq_endchains(vq, /*used_all_avail=*/0);
return;
continue;
}
ulen = (uint32_t)(len + prepend_hdr_len);
ulen = (uint32_t)plen;
/*
* Publish the used buffers to the guest, reporting the
@@ -346,12 +356,11 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
vq_relchain_prepare(vq, info[i].idx, iolen);
ulen -= iolen;
i++;
assert(i <= n_chains);
} while (ulen > 0);
hdr->vrh_bufs = i;
vq_relchain_publish(vq);
vq_retchains(vq, n_chains - i);
assert(i == n_chains);
}
}
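
To make the accounting in the loop above concrete: the simplified standalone model below (not the bhyve code itself) spreads 'plen' bytes over the collected descriptor chains in order and returns the count that ends up in the header's vrh_bufs (the virtio num_buffers field). The function name and parameters are invented for illustration.

/*
 * Simplified model of the mergeable rx-buffer accounting above;
 * all names are made up for the example.
 */
#include <stdint.h>

static unsigned
split_over_chains(uint32_t plen, const uint32_t *chain_cap, unsigned n_chains,
    uint32_t *used)
{
	uint32_t ulen = plen;
	unsigned i = 0;

	while (ulen > 0 && i < n_chains) {
		uint32_t iolen = (chain_cap[i] < ulen) ? chain_cap[i] : ulen;

		used[i] = iolen;	/* mirrors vq_relchain_prepare(vq, idx, iolen) */
		ulen -= iolen;
		i++;
	}
	/* 'i' is what pci_vtnet_rx() stores in hdr->vrh_bufs. */
	return (i);
}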
@@ -592,7 +601,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
free(sc);
return (err);
}
sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
netbe_get_cap(sc->vsc_be);
}
if (!mac_provided) {