4bf50f18eb
Mostly bugfixes or features developed in the past 6 months, so this is a 10.1 candidate. Basically no user API changes (some bugfixes in sys/net/netmap_user.h). In detail: 1. netmap support for virtio-net, including in netmap mode. Under bhyve and with a netmap backend [2] we reach over 1Mpps with standard APIs (e.g. libpcap), and 5-8 Mpps in netmap mode. 2. (kernel) add support for multiple memory allocators, so we can better partition physical and virtual interfaces giving access to separate users. The most visible effect is one additional argument to the various kernel functions to compute buffer addresses. All netmap-supported drivers are affected, but changes are mechanical and trivial. 3. (kernel) simplify the prototype for *txsync() and *rxsync() driver methods. All netmap drivers affected, changes mostly mechanical. 4. add support for netmap-monitor ports. Think of it as a mirroring port on a physical switch: a netmap monitor port replicates traffic present on the main port. Restrictions apply. Drive carefully. 5. if_lem.c: support for various paravirtualization features, experimental and disabled by default. Most of these are described in our ANCS'13 paper [1]. Paravirtualized support in netmap mode is new, and beats the numbers in the paper by a large factor (under qemu-kvm, we measured guest-host throughput up to 10-12 Mpps). A lot of refactoring and additional documentation in the files in sys/dev/netmap, but apart from #2 and #3 above, almost nothing of this stuff is visible to other kernel parts. Example programs in tools/tools/netmap have been updated with bugfixes and to support more of the existing features. This is meant to go into 10.1 so we plan an MFC before the Aug.22 deadline. A lot of this code has been contributed by my colleagues at UNIPI, including Giuseppe Lettieri, Vincenzo Maffione, Stefano Garzarella. MFC after: 3 days.
158 lines
6.9 KiB
C
158 lines
6.9 KiB
C
/*
|
|
* Copyright (C) 2013 Luigi Rizzo. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef NET_PARAVIRT_H
|
|
#define NET_PARAVIRT_H
|
|
|
|
/*
|
|
* $FreeBSD$
|
|
*
|
|
Support for virtio-like communication between host (H) and guest (G) NICs.
|
|
|
|
THIS IS EXPERIMENTAL CODE AND SUBJECT TO CHANGE.
|
|
|
|
The guest allocates the shared Communication Status Block (csb) and
|
|
writes its physical address at CSBAL and CSBAH (data is little endian).
|
|
csb->guest_csb_on enables the mode. If disabled, the device acts as a regular one.
|
|
|
|
Notifications for tx and rx are exchanged without vm exits
|
|
if possible. In particular (only mentioning csb mode below),
|
|
the following actions are performed. In the description below,
|
|
"double check" means verifying again the condition that caused
|
|
the previous action, and reverting the action if the condition has
|
|
changed. The condition typically depends on a variable set by the
|
|
other party, and the double check is done to avoid races. E.g.
|
|
|
|
// start with A=0
|
|
again:
|
|
// do something
|
|
if ( cond(C) ) { // C is written by the other side
|
|
A = 1;
|
|
// barrier
|
|
if ( !cond(C) ) {
|
|
A = 0;
|
|
goto again;
|
|
}
|
|
}
|
|
|
|
TX: start from idle:
|
|
H starts with host_need_txkick=1 when the I/O thread bh is idle. Upon new
|
|
transmissions, G always updates guest_tdt. If host_need_txkick == 1,
|
|
G also writes to the TDT, which acts as a kick to H (so pending
|
|
writes are always dispatched to H as soon as possible.)
|
|
|
|
TX: active state:
|
|
On the kick (TDT write) H sets host_need_txkick=0 (if not
|
|
done already by G), and starts an I/O thread trying to consume
|
|
packets from TDH to guest_tdt, periodically refreshing host_tdh
|
|
and TDH. When host_tdh == guest_tdt, H sets host_need_txkick=1,
|
|
and then does the "double check" for race avoidance.
|
|
|
|
TX: G runs out of buffers
|
|
XXX there are two mechanisms, one boolean (using guest_need_txkick)
|
|
and one with a threshold (using guest_txkick_at). They are mutually
|
|
exclusive.
|
|
BOOLEAN: when G has no space, it sets guest_need_txkick=1 and does
|
|
the double check. If H finds guest_need_txkick == 1 on a write
|
|
to TDH, it also generates an interrupt.
|
|
THRESHOLD: G sets guest_txkick_at to the TDH value for which it
|
|
wants to receive an interrupt. When H detects that TDH moves
|
|
across guest_txkick_at, it generates an interrupt.
|
|
This second mechanism reduces the number of interrupts and
|
|
TDT writes on the transmit side when the host is too slow.
|
|
|
|
RX: start from idle
|
|
G starts with guest_need_rxkick = 1 when the receive ring is empty.
|
|
As packets arrive, H updates host_rdh (and RDH) and also generates an
|
|
interrupt when guest_need_rxkick == 1 (so incoming packets are
|
|
always reported to G as soon as possible, apart from interrupt
|
|
moderation delays). It also tracks guest_rdt for new buffers.
|
|
|
|
RX: active state
|
|
As the interrupt arrives, G sets guest_need_rxkick = 0 and starts
|
|
draining packets from the receive ring, while updating guest_rdt.
|
|
When G runs out of packets it sets guest_need_rxkick=1 and does the
|
|
double check.
|
|
|
|
RX: H runs out of buffers
|
|
XXX there are two mechanisms, one boolean (using host_need_rxkick)
|
|
and one with a threshold (using host_rxkick_at). They are mutually
|
|
exclusive.
|
|
BOOLEAN: when H has no space, it sets host_need_rxkick=1 and does the
|
|
double check. If G finds host_need_rxkick==1 on updating guest_rdt,
|
|
it also writes to RDT causing a kick to H.
|
|
THRESHOLD: H sets host_rxkick_at to the RDT value for which it wants
|
|
to receive a kick. When G detects that guest_rdt moves across
|
|
host_rxkick_at, it writes to RDT, thus generating a kick.
|
|
This second mechanism reduces the number of kicks and
|
|
RDT writes on the receive side when the guest is too slow and
|
|
would free only a few buffers at a time.
|
|
|
|
*/
|
|
/*
 * Communication Status Block (CSB) shared between the guest (G) and the
 * host (H); see the protocol description at the top of this header.
 *
 * Every member is a 32-bit word. The first 16 words (seven guest fields
 * followed by pad[9]) are the section mostly written by the guest; the
 * following 10 words are the section mostly written by the host.
 * Member names, order and sizes define the layout seen by both sides and
 * must not be changed independently on one side.
 */
struct paravirt_csb {
	/* XXX revise the layout to minimize cache bounces.
	 * Usage is described as follows:
	 *	[GH][RW][+-0]	guest/host reads/writes frequently/rarely/almost never
	 */

	/* these are (mostly) written by the guest */

	/* Updated by G on every new transmission. */
	uint32_t guest_tdt;		/* GW+ HR+ pkt to transmit */
	/* Set to 1 by G when it has no tx space; H then raises an
	 * interrupt when it writes TDH (BOOLEAN tx mechanism). */
	uint32_t guest_need_txkick;	/* GW- HR+ G ran out of tx bufs, request kick */
	/* While 1, H generates an interrupt as rx packets arrive. */
	uint32_t guest_need_rxkick;	/* GW- HR+ G ran out of rx pkts, request kick */
	uint32_t guest_csb_on;		/* GW- HR+ enable paravirtual mode */
	/* Updated by G while draining the rx ring; tracked by H. */
	uint32_t guest_rdt;		/* GW+ HR+ rx buffers available */
	/* TDH value for which G wants an interrupt (THRESHOLD tx mechanism). */
	uint32_t guest_txkick_at;	/* GW- HR+ tx ring pos. where G expects an intr */
	uint32_t guest_use_msix;	/* GW0 HR0 guest uses MSI-X interrupts. */
	/* Rounds the guest section up to 16 words (64 bytes).
	 * NOTE(review): presumably keeps the host-written words on a
	 * separate cache line -- confirm. */
	uint32_t pad[9];

	/* these are (mostly) written by the host */

	uint32_t host_tdh;		/* GR0 HW- shadow register, mostly unused */
	/* While 1, G also writes TDT on transmit, which kicks H. */
	uint32_t host_need_txkick;	/* GR+ HW- start the iothread */
	uint32_t host_txcycles_lim;	/* GW- HR- how much to spin before sleep.
					 * set by the guest */
	uint32_t host_txcycles;		/* GR0 HW- counter, but no need to be exported */
	uint32_t host_rdh;		/* GR0 HW- shadow register, mostly unused */
	/* Set to 1 by H when it has no rx space; G then writes RDT when
	 * updating guest_rdt (BOOLEAN rx mechanism). */
	uint32_t host_need_rxkick;	/* GR+ HW- flush rx queued packets */
	uint32_t host_isr;		/* GR* HW* shadow copy of ISR */
	/* RDT value for which H wants a kick (THRESHOLD rx mechanism). */
	uint32_t host_rxkick_at;	/* GR+ HW- rx ring pos where H expects a kick */
	/* NOTE(review): presumably the upper/lower 32 bits of one 64-bit
	 * physical address -- confirm against the users of these fields. */
	uint32_t vnet_ring_high;	/* Vnet ring physical address high. */
	uint32_t vnet_ring_low;		/* Vnet ring physical address low. */
};
|
|
|
|
/*
 * NOTE(review): presumably the size in bytes of the memory region reserved
 * for the CSB -- confirm against the allocation sites. struct paravirt_csb
 * itself is much smaller than this.
 */
#define NET_PARAVIRT_CSB_SIZE	4096
/*
 * All-ones 32-bit value. NOTE(review): looks like a "no value" sentinel for
 * 32-bit CSB fields -- confirm against the users of this macro.
 */
#define NET_PARAVIRT_NONE	(~((uint32_t)0))
|
|
|
|
#ifdef QEMU_PCI_H

/*
 * API functions only available within QEMU
 * (this section is compiled only when QEMU_PCI_H is defined).
 */

/*
 * Configure the CSB on the host side. The implementation is not part of
 * this header, so the notes below are assumptions to be confirmed:
 *
 *   csb     in/out pointer to the host's CSB pointer.
 *   csbbal  NOTE(review): presumably the low 32 bits of the CSB physical
 *           address written by the guest -- confirm.
 *   csbbah  NOTE(review): presumably the high 32 bits of the same
 *           address -- confirm.
 *   tx_bh   QEMU bottom half for transmission (cf. "I/O thread bh" in the
 *           protocol description).
 *   as      QEMU address space; presumably used to map the guest
 *           physical address -- confirm.
 */
void paravirt_configure_csb(struct paravirt_csb **csb, uint32_t csbbal,
    uint32_t csbbah, QEMUBH *tx_bh, AddressSpace *as);

#endif /* QEMU_PCI_H */
|
|
|
|
#endif /* NET_PARAVIRT_H */
|