4bf50f18eb
Mostly bugfixes or features developed in the past 6 months, so this is a 10.1 candidate. Basically no user API changes (some bugfixes in sys/net/netmap_user.h). In detail: 1. netmap support for virtio-net, including in netmap mode. Under bhyve and with a netmap backend [2] we reach over 1Mpps with standard APIs (e.g. libpcap), and 5-8 Mpps in netmap mode. 2. (kernel) add support for multiple memory allocators, so we can better partition physical and virtual interfaces giving access to separate users. The most visible effect is one additional argument to the various kernel functions to compute buffer addresses. All netmap-supported drivers are affected, but changes are mechanical and trivial. 3. (kernel) simplify the prototype for *txsync() and *rxsync() driver methods. All netmap drivers affected, changes mostly mechanical. 4. add support for netmap-monitor ports. Think of it as a mirroring port on a physical switch: a netmap monitor port replicates traffic present on the main port. Restrictions apply. Drive carefully. 5. if_lem.c: support for various paravirtualization features, experimental and disabled by default. Most of these are described in our ANCS'13 paper [1]. Paravirtualized support in netmap mode is new, and beats the numbers in the paper by a large factor (under qemu-kvm, we measured guest-host throughput up to 10-12 Mpps). A lot of refactoring and additional documentation in the files in sys/dev/netmap, but apart from #2 and #3 above, almost nothing of this stuff is visible to other kernel parts. Example programs in tools/tools/netmap have been updated with bugfixes and to support more of the existing features. This is meant to go into 10.1 so we plan an MFC before the Aug.22 deadline. A lot of this code has been contributed by my colleagues at UNIPI, including Giuseppe Lettieri, Vincenzo Maffione, Stefano Garzarella. MFC after: 3 days.
315 lines
9.1 KiB
C
315 lines
9.1 KiB
C
/*
|
|
* Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* $FreeBSD$
|
|
*
|
|
* Netmap support for igb, partly contributed by Ahmed Kooli
|
|
* For details on netmap support please see ixgbe_netmap.h
|
|
*/
|
|
|
|
|
|
#include <net/netmap.h>
|
|
#include <sys/selinfo.h>
|
|
#include <vm/vm.h>
|
|
#include <vm/pmap.h> /* vtophys ? */
|
|
#include <dev/netmap/netmap_kern.h>
|
|
|
|
/*
|
|
* Adaptation to different versions of the driver.
|
|
*/
|
|
|
|
#if !defined(IGB_MEDIA_RESET)
/*
 * The tx buffer descriptor was renamed at the same time as
 * IGB_MEDIA_RESET was defined.  When that macro is absent we are
 * building against the older driver, so map the new name used in this
 * file back to the old one.
 */
#define igb_tx_buf	igb_tx_buffer
#endif /* !IGB_MEDIA_RESET */
|
|
|
|
|
|
/*
|
|
* Register/unregister. We are already under netmap lock.
|
|
*/
|
|
static int
|
|
igb_netmap_reg(struct netmap_adapter *na, int onoff)
|
|
{
|
|
struct ifnet *ifp = na->ifp;
|
|
struct adapter *adapter = ifp->if_softc;
|
|
|
|
IGB_CORE_LOCK(adapter);
|
|
igb_disable_intr(adapter);
|
|
|
|
/* Tell the stack that the interface is no longer active */
|
|
ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
|
|
|
|
/* enable or disable flags and callbacks in na and ifp */
|
|
if (onoff) {
|
|
nm_set_native_flags(na);
|
|
} else {
|
|
nm_clear_native_flags(na);
|
|
}
|
|
igb_init_locked(adapter); /* also enable intr */
|
|
IGB_CORE_UNLOCK(adapter);
|
|
return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
|
|
}
|
|
|
|
|
|
/*
|
|
* Reconcile kernel and user view of the transmit ring.
|
|
*/
|
|
static int
|
|
igb_netmap_txsync(struct netmap_kring *kring, int flags)
|
|
{
|
|
struct netmap_adapter *na = kring->na;
|
|
struct ifnet *ifp = na->ifp;
|
|
struct netmap_ring *ring = kring->ring;
|
|
u_int nm_i; /* index into the netmap ring */
|
|
u_int nic_i; /* index into the NIC ring */
|
|
u_int n;
|
|
u_int const lim = kring->nkr_num_slots - 1;
|
|
u_int const head = kring->rhead;
|
|
/* generate an interrupt approximately every half ring */
|
|
u_int report_frequency = kring->nkr_num_slots >> 1;
|
|
|
|
/* device-specific */
|
|
struct adapter *adapter = ifp->if_softc;
|
|
struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
|
|
/* 82575 needs the queue index added */
|
|
u32 olinfo_status =
|
|
(adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
|
|
|
|
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
|
|
BUS_DMASYNC_POSTREAD);
|
|
|
|
/*
|
|
* First part: process new packets to send.
|
|
*/
|
|
|
|
nm_i = kring->nr_hwcur;
|
|
if (nm_i != head) { /* we have new packets to send */
|
|
nic_i = netmap_idx_k2n(kring, nm_i);
|
|
for (n = 0; nm_i != head; n++) {
|
|
struct netmap_slot *slot = &ring->slot[nm_i];
|
|
u_int len = slot->len;
|
|
uint64_t paddr;
|
|
void *addr = PNMB(na, slot, &paddr);
|
|
|
|
/* device-specific */
|
|
union e1000_adv_tx_desc *curr =
|
|
(union e1000_adv_tx_desc *)&txr->tx_base[nic_i];
|
|
struct igb_tx_buf *txbuf = &txr->tx_buffers[nic_i];
|
|
int flags = (slot->flags & NS_REPORT ||
|
|
nic_i == 0 || nic_i == report_frequency) ?
|
|
E1000_ADVTXD_DCMD_RS : 0;
|
|
|
|
NM_CHECK_ADDR_LEN(na, addr, len);
|
|
|
|
if (slot->flags & NS_BUF_CHANGED) {
|
|
/* buffer has changed, reload map */
|
|
netmap_reload_map(na, txr->txtag, txbuf->map, addr);
|
|
}
|
|
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
|
|
|
|
/* Fill the slot in the NIC ring. */
|
|
curr->read.buffer_addr = htole64(paddr);
|
|
// XXX check olinfo and cmd_type_len
|
|
curr->read.olinfo_status =
|
|
htole32(olinfo_status |
|
|
(len<< E1000_ADVTXD_PAYLEN_SHIFT));
|
|
curr->read.cmd_type_len =
|
|
htole32(len | E1000_ADVTXD_DTYP_DATA |
|
|
E1000_ADVTXD_DCMD_IFCS |
|
|
E1000_ADVTXD_DCMD_DEXT |
|
|
E1000_ADVTXD_DCMD_EOP | flags);
|
|
|
|
/* make sure changes to the buffer are synced */
|
|
bus_dmamap_sync(txr->txtag, txbuf->map,
|
|
BUS_DMASYNC_PREWRITE);
|
|
|
|
nm_i = nm_next(nm_i, lim);
|
|
nic_i = nm_next(nic_i, lim);
|
|
}
|
|
kring->nr_hwcur = head;
|
|
|
|
/* Set the watchdog XXX ? */
|
|
txr->queue_status = IGB_QUEUE_WORKING;
|
|
txr->watchdog_time = ticks;
|
|
|
|
/* synchronize the NIC ring */
|
|
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
|
|
|
/* (re)start the tx unit up to slot nic_i (excluded) */
|
|
E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i);
|
|
}
|
|
|
|
/*
|
|
* Second part: reclaim buffers for completed transmissions.
|
|
*/
|
|
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
|
|
/* record completed transmissions using TDH */
|
|
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
|
|
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
|
|
D("TDH wrap %d", nic_i);
|
|
nic_i -= kring->nkr_num_slots;
|
|
}
|
|
txr->next_to_clean = nic_i;
|
|
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
|
|
}
|
|
|
|
nm_txsync_finalize(kring);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
* Reconcile kernel and user view of the receive ring.
|
|
*/
|
|
static int
|
|
igb_netmap_rxsync(struct netmap_kring *kring, int flags)
|
|
{
|
|
struct netmap_adapter *na = kring->na;
|
|
struct ifnet *ifp = na->ifp;
|
|
struct netmap_ring *ring = kring->ring;
|
|
u_int nm_i; /* index into the netmap ring */
|
|
u_int nic_i; /* index into the NIC ring */
|
|
u_int n;
|
|
u_int const lim = kring->nkr_num_slots - 1;
|
|
u_int const head = nm_rxsync_prologue(kring);
|
|
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
|
|
|
|
/* device-specific */
|
|
struct adapter *adapter = ifp->if_softc;
|
|
struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
|
|
|
|
if (head > lim)
|
|
return netmap_ring_reinit(kring);
|
|
|
|
/* XXX check sync modes */
|
|
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
|
|
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
|
|
|
|
/*
|
|
* First part: import newly received packets.
|
|
*/
|
|
if (netmap_no_pendintr || force_update) {
|
|
uint16_t slot_flags = kring->nkr_slot_flags;
|
|
|
|
nic_i = rxr->next_to_check;
|
|
nm_i = netmap_idx_n2k(kring, nic_i);
|
|
|
|
for (n = 0; ; n++) {
|
|
union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
|
|
uint32_t staterr = le32toh(curr->wb.upper.status_error);
|
|
|
|
if ((staterr & E1000_RXD_STAT_DD) == 0)
|
|
break;
|
|
ring->slot[nm_i].len = le16toh(curr->wb.upper.length);
|
|
ring->slot[nm_i].flags = slot_flags;
|
|
bus_dmamap_sync(rxr->ptag,
|
|
rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD);
|
|
nm_i = nm_next(nm_i, lim);
|
|
nic_i = nm_next(nic_i, lim);
|
|
}
|
|
if (n) { /* update the state variables */
|
|
rxr->next_to_check = nic_i;
|
|
kring->nr_hwtail = nm_i;
|
|
}
|
|
kring->nr_kflags &= ~NKR_PENDINTR;
|
|
}
|
|
|
|
/*
|
|
* Second part: skip past packets that userspace has released.
|
|
*/
|
|
nm_i = kring->nr_hwcur;
|
|
if (nm_i != head) {
|
|
nic_i = netmap_idx_k2n(kring, nm_i);
|
|
for (n = 0; nm_i != head; n++) {
|
|
struct netmap_slot *slot = &ring->slot[nm_i];
|
|
uint64_t paddr;
|
|
void *addr = PNMB(na, slot, &paddr);
|
|
|
|
union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
|
|
struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
|
|
|
|
if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
|
|
goto ring_reset;
|
|
|
|
if (slot->flags & NS_BUF_CHANGED) {
|
|
/* buffer has changed, reload map */
|
|
netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
|
|
slot->flags &= ~NS_BUF_CHANGED;
|
|
}
|
|
curr->wb.upper.status_error = 0;
|
|
curr->read.pkt_addr = htole64(paddr);
|
|
bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
|
|
BUS_DMASYNC_PREREAD);
|
|
nm_i = nm_next(nm_i, lim);
|
|
nic_i = nm_next(nic_i, lim);
|
|
}
|
|
kring->nr_hwcur = head;
|
|
|
|
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
|
/*
|
|
* IMPORTANT: we must leave one free slot in the ring,
|
|
* so move nic_i back by one unit
|
|
*/
|
|
nic_i = nm_prev(nic_i, lim);
|
|
E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i);
|
|
}
|
|
|
|
/* tell userspace that there might be new packets */
|
|
nm_rxsync_finalize(kring);
|
|
|
|
return 0;
|
|
|
|
ring_reset:
|
|
return netmap_ring_reinit(kring);
|
|
}
|
|
|
|
|
|
static void
|
|
igb_netmap_attach(struct adapter *adapter)
|
|
{
|
|
struct netmap_adapter na;
|
|
|
|
bzero(&na, sizeof(na));
|
|
|
|
na.ifp = adapter->ifp;
|
|
na.na_flags = NAF_BDG_MAYSLEEP;
|
|
na.num_tx_desc = adapter->num_tx_desc;
|
|
na.num_rx_desc = adapter->num_rx_desc;
|
|
na.nm_txsync = igb_netmap_txsync;
|
|
na.nm_rxsync = igb_netmap_rxsync;
|
|
na.nm_register = igb_netmap_reg;
|
|
na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
|
|
netmap_attach(&na);
|
|
}
|
|
|
|
/* end of file */
|