bhyve: enable virtio-net mergeable rx buffers for tap(4)

This patch adds a new netbe_peek_recvlen() function to the net
backend API. The new function lets the virtio-net receive code
know in advance how many virtio descriptor chains will be needed
to receive the next packet. As a result, the virtio-net mergeable
rx buffers feature can be implemented efficiently, so it is now
enabled for the tap(4) backend as well. For the tap(4) backend, a
bounce buffer is introduced to implement the peek_recvlen()
callback, which implies an additional packet copy on the receive
datapath. In the future, it should be possible to remove the
bounce buffer (and thus the extra copy) by obtaining the length
of the next packet from the kevent data.

Reviewed by:    grehan, aleksandr.fedorov@itglobal.com
MFC after:      1 week
Differential Revision:	https://reviews.freebsd.org/D23472
Author:  Vincenzo Maffione
Date:    2020-02-20 21:07:23 +00:00
Parent:  55cd93249b
Commit:  f92bb8c19a
Notes:   svn2git 2020-12-20 02:59:44 +00:00
         svn path=/head/; revision=358180
5 changed files with 138 additions and 38 deletions
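
For orientation, here is a minimal sketch (not part of this commit) of the receive pattern the new callback enables: ask the backend for the length of the pending packet first, then size the receive buffers and pull the packet in. The rx_sketch() function and its single local buffer are invented for illustration; the real logic lives in pci_vtnet_rx() in the last file of this diff.

/*
 * Illustrative sketch only: peek before receiving. Names other than
 * netbe_peek_recvlen()/netbe_recv() are made up for the example.
 */
#include <sys/types.h>
#include <sys/uio.h>

#include "net_backends.h"

static void
rx_sketch(net_backend_t *be)
{
	static char rxbuf[65536];	/* stand-in for guest descriptor chains */
	struct iovec iov = { .iov_base = rxbuf, .iov_len = sizeof(rxbuf) };

	for (;;) {
		ssize_t plen, rlen;

		plen = netbe_peek_recvlen(be);
		if (plen <= 0)
			break;		/* nothing pending, or backend error */

		/*
		 * A real frontend would now gather just enough virtio
		 * descriptor chains to hold 'plen' bytes; here we simply
		 * read the packet into one large local buffer.
		 */
		rlen = netbe_recv(be, &iov, 1);
		if (rlen <= 0)
			break;
		/* ... consume 'rlen' bytes from rxbuf ... */
	}
}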

usr.sbin/bhyve/iov.c

@@ -119,24 +119,25 @@ iov_to_buf(const struct iovec *iov, int niov, void **buf)
}
ssize_t
buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
size_t seek)
{
struct iovec *diov;
int ndiov, i;
size_t off = 0, len;
int i;
if (seek > 0) {
int ndiov;
diov = malloc(sizeof(struct iovec) * niov);
seek_iov(iov, niov, diov, &ndiov, seek);
} else {
diov = iov;
ndiov = niov;
iov = diov;
niov = ndiov;
}
for (i = 0; i < ndiov && off < buflen; i++) {
len = MIN(diov[i].iov_len, buflen - off);
memcpy(diov[i].iov_base, buf + off, len);
for (i = 0; i < niov && off < buflen; i++) {
len = MIN(iov[i].iov_len, buflen - off);
memcpy(iov[i].iov_base, buf + off, len);
off += len;
}
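
As a quick illustration of the revised, const-correct helper: the hypothetical snippet below scatters a contiguous buffer across a two-segment iovec, which is exactly what the tap backend does further down when it drains its bounce buffer. All names here are invented for the example.

/* Hypothetical usage of buf_to_iov(); not part of the patch. */
#include <sys/types.h>
#include <sys/uio.h>
#include <string.h>

#include "iov.h"

static void
scatter_example(void)
{
	char pkt[64];			/* contiguous source, e.g. a bounce buffer */
	char seg0[16], seg1[48];	/* two receive segments */
	struct iovec iov[2] = {
		{ .iov_base = seg0, .iov_len = sizeof(seg0) },
		{ .iov_base = seg1, .iov_len = sizeof(seg1) },
	};
	ssize_t copied;

	memset(pkt, 0xab, sizeof(pkt));
	/* Copy all of pkt into iov[], starting at offset 0 within the iovec. */
	copied = buf_to_iov(pkt, sizeof(pkt), iov, 2, 0);
	(void)copied;			/* 64 bytes copied: 16 into seg0, 48 into seg1 */
}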

usr.sbin/bhyve/iov.h

@@ -38,7 +38,7 @@ void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2,
void truncate_iov(struct iovec *iov, int *niov, size_t length);
size_t count_iov(const struct iovec *iov, int niov);
ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
size_t seek);
ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
int niov, size_t seek);
#endif /* _IOV_H_ */

usr.sbin/bhyve/net_backends.c

@@ -102,6 +102,13 @@ struct net_backend {
ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
int iovcnt);
/*
* Get the length of the next packet that can be received from
* the backend. If no packets are currently available, this
* function returns 0.
*/
ssize_t (*peek_recvlen)(struct net_backend *be);
/*
* Called to receive a packet from the backend. When the function
* returns a positive value 'len', the scatter-gather vector
@@ -167,6 +174,13 @@ SET_DECLARE(net_backend_set, struct net_backend);
struct tap_priv {
struct mevent *mevp;
/*
* A bounce buffer that allows us to implement the peek_recvlen
* callback. In the future we may get the same information from
* the kevent data.
*/
char bbuf[1 << 16];
ssize_t bbuflen;
};
static void
@@ -223,6 +237,9 @@ tap_init(struct net_backend *be, const char *devname,
errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif
memset(priv->bbuf, 0, sizeof(priv->bbuf));
priv->bbuflen = 0;
priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
if (priv->mevp == NULL) {
WPRINTF(("Could not register event"));
@@ -246,15 +263,56 @@ tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
}
static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
tap_peek_recvlen(struct net_backend *be)
{
struct tap_priv *priv = (struct tap_priv *)be->opaque;
ssize_t ret;
/* Should never be called without a valid tap fd */
assert(be->fd != -1);
if (priv->bbuflen > 0) {
/*
* We already have a packet in the bounce buffer.
* Just return its length.
*/
return priv->bbuflen;
}
/*
* Read the next packet (if any) into the bounce buffer, so
* that we get to know its length and we can return that
* to the caller.
*/
ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
if (ret < 0 && errno == EWOULDBLOCK) {
return (0);
}
if (ret > 0)
priv->bbuflen = ret;
return (ret);
}
static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
struct tap_priv *priv = (struct tap_priv *)be->opaque;
ssize_t ret;
if (priv->bbuflen > 0) {
/*
* A packet is available in the bounce buffer, so
* we read it from there.
*/
ret = buf_to_iov(priv->bbuf, priv->bbuflen,
iov, iovcnt, 0);
/* Mark the bounce buffer as empty. */
priv->bbuflen = 0;
return (ret);
}
ret = readv(be->fd, iov, iovcnt);
if (ret < 0 && errno == EWOULDBLOCK) {
return (0);
}
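
The commit message hints that the bounce buffer (and its extra copy) could eventually be dropped by taking the pending packet length from kevent data. A rough, untested sketch of what that could look like follows; it assumes, without verification, that EVFILT_READ on a tap(4) descriptor reports the next packet's length in kevent.data, and the tap_peek_recvlen_kq() helper plus its standalone kqueue handling are purely hypothetical (bhyve's event loop is actually managed by mevent).

/*
 * Hypothetical sketch only: a peek_recvlen variant without a bounce
 * buffer. Assumes the read filter for the tap fd is already registered
 * on 'kq' and that kevent.data carries the next packet's length.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static ssize_t
tap_peek_recvlen_kq(int kq, int tapfd)
{
	struct kevent kev;
	const struct timespec zero = { 0, 0 };
	int n;

	/* Non-blocking poll: is a packet queued on the tap fd? */
	n = kevent(kq, NULL, 0, &kev, 1, &zero);
	if (n <= 0 || (int)kev.ident != tapfd || kev.filter != EVFILT_READ)
		return (0);
	return ((ssize_t)kev.data);
}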
@@ -299,6 +357,7 @@ static struct net_backend tap_backend = {
.init = tap_init,
.cleanup = tap_cleanup,
.send = tap_send,
.peek_recvlen = tap_peek_recvlen,
.recv = tap_recv,
.recv_enable = tap_recv_enable,
.recv_disable = tap_recv_disable,
@@ -313,6 +372,7 @@ static struct net_backend vmnet_backend = {
.init = tap_init,
.cleanup = tap_cleanup,
.send = tap_send,
.peek_recvlen = tap_peek_recvlen,
.recv = tap_recv,
.recv_enable = tap_recv_enable,
.recv_disable = tap_recv_disable,
@@ -331,8 +391,7 @@ DATA_SET(net_backend_set, vmnet_backend);
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
VIRTIO_NET_F_MRG_RXBUF)
VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
struct netmap_priv {
char ifname[IFNAMSIZ];
@@ -539,6 +598,26 @@ netmap_send(struct net_backend *be, const struct iovec *iov,
return (totlen);
}
static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
struct netmap_ring *ring = priv->rx;
uint32_t head = ring->head;
ssize_t totlen = 0;
while (head != ring->tail) {
struct netmap_slot *slot = ring->slot + head;
totlen += slot->len;
if ((slot->flags & NS_MOREFRAG) == 0)
break;
head = nm_ring_next(ring, head);
}
return (totlen);
}
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
@@ -628,6 +707,7 @@ static struct net_backend netmap_backend = {
.init = netmap_init,
.cleanup = netmap_cleanup,
.send = netmap_send,
.peek_recvlen = netmap_peek_recvlen,
.recv = netmap_recv,
.recv_enable = netmap_recv_enable,
.recv_disable = netmap_recv_disable,
@@ -642,6 +722,7 @@ static struct net_backend vale_backend = {
.init = netmap_init,
.cleanup = netmap_cleanup,
.send = netmap_send,
.peek_recvlen = netmap_peek_recvlen,
.recv = netmap_recv,
.recv_enable = netmap_recv_enable,
.recv_disable = netmap_recv_disable,
@@ -758,6 +839,13 @@ netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
return (be->send(be, iov, iovcnt));
}
ssize_t
netbe_peek_recvlen(struct net_backend *be)
{
return (be->peek_recvlen(be));
}
/*
* Try to read a packet from the backend, without blocking.
* If no packets are available, return 0. In case of success, return

usr.sbin/bhyve/net_backends.h

@@ -45,6 +45,7 @@ int netbe_set_cap(net_backend_t *be, uint64_t cap,
unsigned vnet_hdr_len);
size_t netbe_get_vnet_hdr_len(net_backend_t *be);
ssize_t netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
ssize_t netbe_peek_recvlen(net_backend_t *be);
ssize_t netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
ssize_t netbe_rx_discard(net_backend_t *be);
void netbe_rx_disable(net_backend_t *be);

usr.sbin/bhyve/pci_virtio_net.c

@@ -228,22 +228,34 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
uint32_t riov_bytes;
struct iovec *riov;
int riov_len;
uint32_t ulen;
int n_chains;
int len;
vq = &sc->vsc_queues[VTNET_RXQ];
for (;;) {
struct virtio_net_rxhdr *hdr;
uint32_t riov_bytes;
struct iovec *riov;
uint32_t ulen;
int riov_len;
int n_chains;
ssize_t rlen;
ssize_t plen;
plen = netbe_peek_recvlen(sc->vsc_be);
if (plen <= 0) {
/*
* No more packets (plen == 0), or backend errored
* (plen < 0). Interrupt if needed and stop.
*/
vq_endchains(vq, /*used_all_avail=*/0);
return;
}
plen += prepend_hdr_len;
/*
* Get a descriptor chain to store the next ingress
* packet. In case of mergeable rx buffers, get as
* many chains as necessary in order to make room
* for a maximum sized LRO packet.
* for plen bytes.
*/
riov_bytes = 0;
riov_len = 0;
@@ -287,8 +299,7 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
riov_bytes += info[n_chains].len;
riov += n;
n_chains++;
} while (riov_bytes < VTNET_MAX_PKT_LEN &&
riov_len < VTNET_MAXSEGS);
} while (riov_bytes < plen && riov_len < VTNET_MAXSEGS);
riov = iov;
hdr = riov[0].iov_base;
@@ -312,21 +323,20 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
memset(hdr, 0, prepend_hdr_len);
}
len = netbe_recv(sc->vsc_be, riov, riov_len);
if (len <= 0) {
rlen = netbe_recv(sc->vsc_be, riov, riov_len);
if (rlen != plen - prepend_hdr_len) {
/*
* No more packets (len == 0), or backend errored
* (err < 0). Return unused available buffers
* and stop.
* If this happens it means there is something
* wrong with the backend (e.g., some other
* process is stealing our packets).
*/
WPRINTF(("netbe_recv: expected %zd bytes, "
"got %zd", plen - prepend_hdr_len, rlen));
vq_retchains(vq, n_chains);
/* Interrupt if needed/appropriate and stop. */
vq_endchains(vq, /*used_all_avail=*/0);
return;
continue;
}
ulen = (uint32_t)(len + prepend_hdr_len);
ulen = (uint32_t)plen;
/*
* Publish the used buffers to the guest, reporting the
@@ -346,12 +356,11 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
vq_relchain_prepare(vq, info[i].idx, iolen);
ulen -= iolen;
i++;
assert(i <= n_chains);
} while (ulen > 0);
hdr->vrh_bufs = i;
vq_relchain_publish(vq);
vq_retchains(vq, n_chains - i);
assert(i == n_chains);
}
}
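
To make the accounting in the loop above concrete: the simplified standalone model below (not the bhyve code itself) spreads 'plen' bytes over the collected descriptor chains in order and returns the count that ends up in the header's vrh_bufs (the virtio num_buffers field). The function name and parameters are invented for illustration.

/*
 * Simplified model of the mergeable rx-buffer accounting above;
 * all names are made up for the example.
 */
#include <stdint.h>

static unsigned
split_over_chains(uint32_t plen, const uint32_t *chain_cap, unsigned n_chains,
    uint32_t *used)
{
	uint32_t ulen = plen;
	unsigned i = 0;

	while (ulen > 0 && i < n_chains) {
		uint32_t iolen = (chain_cap[i] < ulen) ? chain_cap[i] : ulen;

		used[i] = iolen;	/* mirrors vq_relchain_prepare(vq, idx, iolen) */
		ulen -= iolen;
		i++;
	}
	/* 'i' is what pci_vtnet_rx() stores in hdr->vrh_bufs. */
	return (i);
}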
@@ -592,7 +601,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
free(sc);
return (err);
}
sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
netbe_get_cap(sc->vsc_be);
}
if (!mac_provided) {