freebsd-dev/usr.sbin/bhyve/pci_virtio_net.c
Vincenzo Maffione 0ff7076bdb bhyve: abstraction for network backends
Bhyve can currently emulate two virtual NICs, namely virtio-net and e1000,
and connect to the host network through two backends, namely tap and netmap.
However, there is no interface between virtual NIC functionalities and
backend functionalities. As a result, the backend code is duplicated between
the two virtual NIC implementations and also within the same virtual NIC.
Also, e1000 cannot currently use netmap as a backend.
This patch introduces a network backend API between virtio-net/e1000 and
tap/netmap, to improve code reuse and add missing functionalities.
Virtual NICs and backends can negotiate virtio-net features, such as checksum
offload and TSO. If the backend supports the features, it will propagate this
information to the guest, so that the latter can make use of them. Currently,
only netmap VALE ports support the features, but support should be added to
tap in the future.

Reviewed by:	jhb, bryanv
MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D20659
2019-07-07 12:15:24 +00:00

530 lines
13 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
#include <net/if.h> /* IFNAMSIZ */
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#include "net_utils.h"
#include "net_backends.h"
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
/*
* PCI config-space "registers"
*/
struct virtio_net_config {
uint8_t mac[6];
uint16_t status;
} __packed;
/*
* Queue definitions.
*/
#define VTNET_RXQ 0
#define VTNET_TXQ 1
#define VTNET_CTLQ 2 /* NB: not yet supported */
#define VTNET_MAXQ 3
/*
* Debug printf
*/
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtnet_softc {
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
net_backend_t *vsc_be;
int vsc_rx_ready;
int resetting; /* protected by tx_mtx */
uint64_t vsc_features; /* negotiated features */
pthread_mutex_t rx_mtx;
unsigned int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
pthread_mutex_t tx_mtx;
pthread_cond_t tx_cond;
int tx_in_progress;
struct virtio_net_config vsc_config;
struct virtio_consts vsc_consts;
};
static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);
static struct virtio_consts vtnet_vi_consts = {
"vtnet", /* our name */
VTNET_MAXQ - 1, /* we currently support 2 virtqueues */
sizeof(struct virtio_net_config), /* config reg size */
pci_vtnet_reset, /* reset */
NULL, /* device-wide qnotify -- not used */
pci_vtnet_cfgread, /* read PCI config */
pci_vtnet_cfgwrite, /* write PCI config */
pci_vtnet_neg_features, /* apply negotiated features */
VTNET_S_HOSTCAPS, /* our capabilities */
};
static void
pci_vtnet_reset(void *vsc)
{
struct pci_vtnet_softc *sc = vsc;
DPRINTF(("vtnet: device reset requested !\n"));
/* Acquire the RX lock to block RX processing. */
pthread_mutex_lock(&sc->rx_mtx);
/* Set sc->resetting and give a chance to the TX thread to stop. */
pthread_mutex_lock(&sc->tx_mtx);
sc->resetting = 1;
while (sc->tx_in_progress) {
pthread_mutex_unlock(&sc->tx_mtx);
usleep(10000);
pthread_mutex_lock(&sc->tx_mtx);
}
sc->vsc_rx_ready = 0;
sc->rx_merge = 1;
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
/*
* Now reset rings, MSI-X vectors, and negotiated capabilities.
* Do that with the TX lock held, since we need to reset
* sc->resetting.
*/
vi_reset_dev(&sc->vsc_vs);
sc->resetting = 0;
pthread_mutex_unlock(&sc->tx_mtx);
pthread_mutex_unlock(&sc->rx_mtx);
}
static void
pci_vtnet_rx(struct pci_vtnet_softc *sc)
{
struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
int len, n;
uint16_t idx;
if (!sc->vsc_rx_ready) {
/*
* The rx ring has not yet been set up.
* Drop the packet and try later.
*/
netbe_rx_discard(sc->vsc_be);
return;
}
/*
* Check for available rx buffers
*/
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
* No available rx buffers. Drop the packet and try later.
* Interrupt on empty, if that's negotiated.
*/
netbe_rx_discard(sc->vsc_be);
vq_endchains(vq, /*used_all_avail=*/1);
return;
}
do {
/*
* Get descriptor chain.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
len = netbe_recv(sc->vsc_be, iov, n);
if (len <= 0) {
/*
* No more packets (len == 0), or backend errored
* (err < 0). Return unused available buffers.
*/
vq_retchain(vq);
/* Interrupt if needed/appropriate and stop. */
vq_endchains(vq, /*used_all_avail=*/0);
return;
}
/* Publish the info to the guest */
vq_relchain(vq, idx, (uint32_t)len);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
vq_endchains(vq, /*used_all_avail=*/1);
}
/*
* Called when there is read activity on the backend file descriptor.
* Each buffer posted by the guest is assumed to be able to contain
* an entire ethernet frame + rx header.
*/
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
pci_vtnet_rx(sc);
pthread_mutex_unlock(&sc->rx_mtx);
}
/* Called on RX kick. */
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
struct pci_vtnet_softc *sc = vsc;
/*
* A qnotify means that the rx process can now begin
*/
pthread_mutex_lock(&sc->rx_mtx);
if (sc->vsc_rx_ready == 0) {
sc->vsc_rx_ready = 1;
vq_kick_disable(vq);
}
pthread_mutex_unlock(&sc->rx_mtx);
}
/* TX virtqueue processing, called by the TX thread. */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
uint16_t idx;
ssize_t len;
int n;
/*
* Obtain chain of descriptors. The first descriptor also
* contains the virtio-net header.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
len = netbe_send(sc->vsc_be, iov, n);
/* chain is processed, release it and set len */
vq_relchain(vq, idx, len > 0 ? len : 0);
}
/* Called on TX kick. */
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
struct pci_vtnet_softc *sc = vsc;
/*
* Any ring entries to process?
*/
if (!vq_has_descs(vq))
return;
/* Signal the tx thread for processing */
pthread_mutex_lock(&sc->tx_mtx);
vq_kick_disable(vq);
if (sc->tx_in_progress == 0)
pthread_cond_signal(&sc->tx_cond);
pthread_mutex_unlock(&sc->tx_mtx);
}
/*
* Thread which will handle processing of TX desc
*/
static void *
pci_vtnet_tx_thread(void *param)
{
struct pci_vtnet_softc *sc = param;
struct vqueue_info *vq;
int error;
vq = &sc->vsc_queues[VTNET_TXQ];
/*
* Let us wait till the tx queue pointers get initialised &
* first tx signaled
*/
pthread_mutex_lock(&sc->tx_mtx);
error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
assert(error == 0);
for (;;) {
/* note - tx mutex is locked here */
while (sc->resetting || !vq_has_descs(vq)) {
vq_kick_enable(vq);
if (!sc->resetting && vq_has_descs(vq))
break;
sc->tx_in_progress = 0;
error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
assert(error == 0);
}
vq_kick_disable(vq);
sc->tx_in_progress = 1;
pthread_mutex_unlock(&sc->tx_mtx);
do {
/*
* Run through entries, placing them into
* iovecs and sending when an end-of-packet
* is found
*/
pci_vtnet_proctx(sc, vq);
} while (vq_has_descs(vq));
/*
* Generate an interrupt if needed.
*/
vq_endchains(vq, /*used_all_avail=*/1);
pthread_mutex_lock(&sc->tx_mtx);
}
}
#ifdef notyet
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{
DPRINTF(("vtnet: control qnotify!\n\r"));
}
#endif
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct pci_vtnet_softc *sc;
char tname[MAXCOMLEN + 1];
int mac_provided;
/*
* Allocate data structures for further virtio initializations.
* sc also contains a copy of vtnet_vi_consts, since capabilities
* change depending on the backend.
*/
sc = calloc(1, sizeof(struct pci_vtnet_softc));
sc->vsc_consts = vtnet_vi_consts;
pthread_mutex_init(&sc->vsc_mtx, NULL);
sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef notyet
sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif
/*
* Attempt to open the backend device and read the MAC address
* if specified.
*/
mac_provided = 0;
if (opts != NULL) {
char *devname;
char *vtopts;
int err;
devname = vtopts = strdup(opts);
(void) strsep(&vtopts, ",");
if (vtopts != NULL) {
err = net_parsemac(vtopts, sc->vsc_config.mac);
if (err != 0) {
free(devname);
return (err);
}
mac_provided = 1;
}
err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback,
sc);
free(devname);
if (err)
return (err);
sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
}
if (!mac_provided) {
net_genmac(pi, sc->vsc_config.mac);
}
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
/* Link is up if we managed to open backend device. */
sc->vsc_config.status = (opts == NULL || sc->vsc_be);
vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues);
sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
/* use BAR 0 to map config regs in IO space */
vi_set_io_bar(&sc->vsc_vs, 0);
sc->resetting = 0;
sc->rx_merge = 1;
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
pthread_mutex_init(&sc->rx_mtx, NULL);
/*
* Initialize tx semaphore & spawn TX processing thread.
* As of now, only one thread for TX desc processing is
* spawned.
*/
sc->tx_in_progress = 0;
pthread_mutex_init(&sc->tx_mtx, NULL);
pthread_cond_init(&sc->tx_cond, NULL);
pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
pi->pi_func);
pthread_set_name_np(sc->tx_tid, tname);
return (0);
}
static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
struct pci_vtnet_softc *sc = vsc;
void *ptr;
if (offset < (int)sizeof(sc->vsc_config.mac)) {
assert(offset + size <= (int)sizeof(sc->vsc_config.mac));
/*
* The driver is allowed to change the MAC address
*/
ptr = &sc->vsc_config.mac[offset];
memcpy(ptr, &value, size);
} else {
/* silently ignore other writes */
DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
}
return (0);
}
static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
struct pci_vtnet_softc *sc = vsc;
void *ptr;
ptr = (uint8_t *)&sc->vsc_config + offset;
memcpy(retval, ptr, size);
return (0);
}
static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
struct pci_vtnet_softc *sc = vsc;
sc->vsc_features = negotiated_features;
if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) {
sc->rx_merge = 0;
/* Without mergeable rx buffers, virtio-net header is 2
* bytes shorter than sizeof(struct virtio_net_rxhdr). */
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
}
/* Tell the backend to enable some capabilities it has advertised. */
netbe_set_cap(sc->vsc_be, negotiated_features, sc->rx_vhdrlen);
}
static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);