It is 2014 and we have a new version of netmap.

Most relevant features:

- netmap emulation on any NIC, even those without native netmap support.

  On the ixgbe we have measured about 4Mpps/core/queue in this mode,
  which is still a lot more than with sockets/bpf.

- seamless interconnection of VALE switch, NICs and host stack.

  If you disable accelerations on your NIC (say em0)

        ifconfig em0 -txcsum -rxcsum

  you can use the VALE switch to connect the NIC and the host stack:

        vale-ctl -h valeXX:em0

  allowing the NIC to be shared with other netmap clients.

- THE USER API HAS SLIGHTLY CHANGED (head/cur/tail pointers
  instead of the cur/avail pointers/count used before). This was
  unavoidable in order to support, in the future, multiple threads
  operating on the same rings. Netmap clients need only very small
  source code changes to compile again.
  On the plus side, the new API should be easier to understand
  and the internals are a lot simpler; a minimal transmit loop
  using the new pointers is sketched below.
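
For illustration, here is a minimal (untested) sketch of a transmit
loop using the new head/cur/tail pointers. It attaches to the
"valeXX:em0" port created in the example above, omits all error
checking, and uses a zeroed 60-byte frame as a placeholder for real
packet construction:

        #include <sys/ioctl.h>
        #include <sys/mman.h>
        #include <fcntl.h>
        #include <string.h>
        #include <net/netmap.h>
        #include <net/netmap_user.h>

        void
        tx_loop(void)
        {
            struct nmreq req;
            struct netmap_if *nifp;
            struct netmap_ring *ring;
            void *mem;
            int fd = open("/dev/netmap", O_RDWR);

            memset(&req, 0, sizeof(req));
            strncpy(req.nr_name, "valeXX:em0", sizeof(req.nr_name));
            req.nr_version = NETMAP_API;
            ioctl(fd, NIOCREGIF, &req);          /* attach to the port */
            mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
                MAP_SHARED, fd, 0);
            nifp = NETMAP_IF(mem, req.nr_offset);
            ring = NETMAP_TXRING(nifp, 0);       /* first tx ring */

            for (;;) {
                /* slots from head up to tail (excluded) are free */
                while (ring->head != ring->tail) {
                    struct netmap_slot *slot = &ring->slot[ring->head];
                    char *buf = NETMAP_BUF(ring, slot->buf_idx);

                    slot->len = 60;
                    memset(buf, 0, slot->len);   /* build the frame here */
                    ring->head = ring->cur =
                        (ring->head + 1 == ring->num_slots) ?
                        0 : ring->head + 1;
                }
                ioctl(fd, NIOCTXSYNC, NULL);     /* push frames to the port */
            }
        }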

The manual page has been updated extensively to reflect the current
features and give some examples.

This is the result of the work of several people, including Giuseppe
Lettieri, Vincenzo Maffione, Michio Honda and myself, and has been
financially supported by the EU projects CHANGE and OPENLAB, by the
NetApp University Research Fund, by NEC, and of course by the
Universita` di Pisa.
Luigi Rizzo 2014-01-06 12:53:15 +00:00
parent 0979970a1d
commit 17885a7bfd
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=260368
27 changed files with 3099 additions and 2194 deletions

(file diff suppressed because it is too large)

@ -4352,7 +4352,7 @@ em_initialize_receive_unit(struct adapter *adapter)
* preserve the rx buffers passed to userspace.
*/
if (ifp->if_capenable & IFCAP_NETMAP)
rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail;
rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]);
#endif /* DEV_NETMAP */
E1000_WRITE_REG(hw, E1000_RDT(i), rdt);
}

@ -4630,13 +4630,13 @@ igb_initialize_receive_units(struct adapter *adapter)
* an init() while a netmap client is active must
* preserve the rx buffers passed to userspace.
* In this driver it means we adjust RDT to
* somthing different from next_to_refresh
* something different from next_to_refresh
* (which is not used in netmap mode).
*/
if (ifp->if_capenable & IFCAP_NETMAP) {
struct netmap_adapter *na = NA(adapter->ifp);
struct netmap_kring *kring = &na->rx_rings[i];
int t = rxr->next_to_refresh - kring->nr_hwavail;
int t = rxr->next_to_refresh - nm_kr_rxspace(kring);
if (t >= adapter->num_rx_desc)
t -= adapter->num_rx_desc;

@ -3367,7 +3367,7 @@ lem_initialize_receive_unit(struct adapter *adapter)
#ifdef DEV_NETMAP
/* preserve buffers already made available to clients */
if (ifp->if_capenable & IFCAP_NETMAP)
rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail;
rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]);
#endif /* DEV_NETMAP */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl);

@ -1245,7 +1245,7 @@ ixgbe_init_locked(struct adapter *adapter)
if (ifp->if_capenable & IFCAP_NETMAP) {
struct netmap_adapter *na = NA(adapter->ifp);
struct netmap_kring *kring = &na->rx_rings[i];
int t = na->num_rx_desc - 1 - kring->nr_hwavail;
int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring);
IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t);
} else

@ -1,5 +1,5 @@
/*
* Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -120,9 +120,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, new_slots;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_txsync_prologue(kring, &new_slots);
u_int const head = kring->rhead;
/* generate an interrupt approximately every half ring */
u_int report_frequency = kring->nkr_num_slots >> 1;
@ -130,9 +130,6 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct adapter *adapter = ifp->if_softc;
struct tx_ring *txr = &adapter->tx_rings[ring_nr];
if (cur > lim) /* error checking in nm_txsync_prologue() */
return netmap_ring_reinit(kring);
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@ -141,9 +138,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) { /* we have new packets to send */
if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@ -175,9 +172,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwcur = cur; /* the saved ring->cur */
/* decrease avail by # of packets sent minus previous ones */
kring->nr_hwavail -= new_slots;
kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
@ -190,26 +185,20 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
int delta;
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
}
delta = nic_i - txr->next_to_clean;
if (delta) {
/* some completed, increment hwavail. */
if (delta < 0)
delta += kring->nkr_num_slots;
if (nic_i != txr->next_to_clean) {
txr->next_to_clean = nic_i;
kring->nr_hwavail += delta;
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
nm_txsync_finalize(kring, cur);
nm_txsync_finalize(kring);
return 0;
}
@ -226,16 +215,16 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, resvd;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
if (cur > lim)
if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@ -251,7 +240,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = rxr->next_to_check;
nm_i = netmap_idx_n2k(kring, nic_i);
for (n = 0; ; n++) {
for (n = 0; ; n++) { // XXX no need to count
struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
uint32_t staterr = le32toh(curr->status);
@ -268,7 +257,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
if (n) { /* update the state variables */
rxr->next_to_check = nic_i;
kring->nr_hwavail += n;
kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
@ -277,9 +266,9 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) {
if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@ -302,8 +291,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwavail -= n;
kring->nr_hwcur = cur;
kring->nr_hwcur = head;
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@ -311,12 +299,12 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
nic_i = (nic_i == 0) ? lim : nic_i - 1;
nic_i = nm_prev(nic_i, lim);
E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i);
}
/* tell userspace that there might be new packets */
ring->avail = kring->nr_hwavail - resvd;
nm_rxsync_finalize(kring);
return 0;

@ -1,5 +1,5 @@
/*
* Copyright (C) 2011 Universita` di Pisa. All rights reserved.
* Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -88,9 +88,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, new_slots;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_txsync_prologue(kring, &new_slots);
u_int const head = kring->rhead;
/* generate an interrupt approximately every half ring */
u_int report_frequency = kring->nkr_num_slots >> 1;
@ -101,9 +101,6 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
u32 olinfo_status =
(adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
if (cur > lim) /* error checking in nm_txsync_prologue() */
return netmap_ring_reinit(kring);
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@ -112,9 +109,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) { /* we have new packets to send */
if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@ -155,9 +152,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwcur = cur; /* the saved ring->cur */
/* decrease avail by # of packets sent minus previous ones */
kring->nr_hwavail -= new_slots;
kring->nr_hwcur = head;
/* Set the watchdog XXX ? */
txr->queue_status = IGB_QUEUE_WORKING;
@ -174,26 +169,18 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
int delta;
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
}
delta = nic_i - txr->next_to_clean;
if (delta) {
/* some completed, increment hwavail. */
if (delta < 0)
delta += kring->nkr_num_slots;
txr->next_to_clean = nic_i;
kring->nr_hwavail += delta;
}
txr->next_to_clean = nic_i;
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
nm_txsync_finalize(kring, cur);
nm_txsync_finalize(kring);
return 0;
}
@ -210,16 +197,16 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, resvd;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
if (cur > lim)
if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@ -250,7 +237,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
if (n) { /* update the state variables */
rxr->next_to_check = nic_i;
kring->nr_hwavail += n;
kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
@ -259,9 +246,9 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) {
if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@ -284,8 +271,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwavail -= n;
kring->nr_hwcur = cur;
kring->nr_hwcur = head;
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@ -293,12 +279,12 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
nic_i = (nic_i == 0) ? lim : nic_i - 1;
nic_i = nm_prev(nic_i, lim);
E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i);
}
/* tell userspace that there might be new packets */
ring->avail = kring->nr_hwavail - resvd;
nm_rxsync_finalize(kring);
return 0;

@ -1,5 +1,5 @@
/*
* Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -91,18 +91,14 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, new_slots;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_txsync_prologue(kring, &new_slots);
u_int const head = kring->rhead;
/* generate an interrupt approximately every half ring */
u_int report_frequency = kring->nkr_num_slots >> 1;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
if (cur > lim) /* error checking in nm_txsync_prologue() */
return netmap_ring_reinit(kring);
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@ -111,9 +107,9 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) { /* we have new packets to send */
if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@ -145,9 +141,7 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwcur = cur; /* the saved ring->cur */
/* decrease avail by # of packets sent minus previous ones */
kring->nr_hwavail -= new_slots;
kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
@ -160,26 +154,19 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
int delta;
if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
kring->last_reclaim = ticks;
/* record completed transmissions using TDH */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
}
delta = nic_i - adapter->next_tx_to_clean;
if (delta) {
/* some completed, increment hwavail. */
if (delta < 0)
delta += kring->nkr_num_slots;
adapter->next_tx_to_clean = nic_i;
kring->nr_hwavail += delta;
}
adapter->next_tx_to_clean = nic_i;
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
nm_txsync_finalize(kring, cur);
nm_txsync_finalize(kring);
return 0;
}
@ -196,15 +183,15 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, resvd;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
if (cur > lim)
if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@ -241,9 +228,14 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = nm_next(nic_i, lim);
}
if (n) { /* update the state variables */
ND("%d new packets at nic %d nm %d tail %d",
n,
adapter->next_rx_desc_to_check,
netmap_idx_n2k(kring, adapter->next_rx_desc_to_check),
kring->nr_hwtail);
adapter->next_rx_desc_to_check = nic_i;
// ifp->if_ipackets += n;
kring->nr_hwavail += n;
kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
@ -252,9 +244,9 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) {
if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@ -277,20 +269,19 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwavail -= n;
kring->nr_hwcur = cur;
kring->nr_hwcur = head;
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
/*
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
nic_i = (nic_i == 0) ? lim : nic_i - 1;
nic_i = nm_prev(nic_i, lim);
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i);
}
/* tell userspace that there might be new packets */
ring->avail = kring->nr_hwavail - resvd;
nm_rxsync_finalize(kring);
return 0;

@ -1,5 +1,5 @@
/*
* Copyright (C) 2011 Luigi Rizzo. All rights reserved.
* Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -72,17 +72,14 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, new_slots;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_txsync_prologue(kring, &new_slots);
u_int const head = kring->rhead;
/* device-specific */
struct rl_softc *sc = ifp->if_softc;
struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc;
if (cur > lim) /* error checking in nm_txsync_prologue() */
return netmap_ring_reinit(kring);
bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
sc->rl_ldata.rl_tx_list_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ?
@ -91,11 +88,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* First part: process new packets to send.
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) { /* we have new packets to send */
if (nm_i != head) { /* we have new packets to send */
nic_i = sc->rl_ldata.rl_tx_prodidx;
// XXX or netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@ -132,9 +129,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = nm_next(nic_i, lim);
}
sc->rl_ldata.rl_tx_prodidx = nic_i;
/* decrease avail by # of packets sent minus previous ones */
kring->nr_hwcur = cur; /* the saved ring->cur */
kring->nr_hwavail -= new_slots;
kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
@ -148,7 +143,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
nic_i = sc->rl_ldata.rl_tx_considx;
for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx;
n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) {
@ -160,11 +155,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
if (n > 0) {
sc->rl_ldata.rl_tx_considx = nic_i;
sc->rl_ldata.rl_tx_free += n;
kring->nr_hwavail += n;
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
nm_txsync_finalize(kring, cur);
nm_txsync_finalize(kring);
return 0;
}
@ -181,16 +176,16 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, resvd;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct rl_softc *sc = ifp->if_softc;
struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc;
if (cur > lim)
if (head > lim)
return netmap_ring_reinit(kring);
bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag,
@ -202,16 +197,17 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*
* This device uses all the buffers in the ring, so we need
* another termination condition in addition to RL_RDESC_STAT_OWN
* cleared (all buffers could have it cleared. The easiest one
* is to limit the amount of data reported up to 'lim'
* cleared (all buffers could have it cleared). The easiest one
* is to stop right before nm_hwcur.
*/
if (netmap_no_pendintr || force_update) {
uint16_t slot_flags = kring->nkr_slot_flags;
uint32_t stop_i = nm_prev(kring->nr_hwcur, lim);
nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */
nm_i = netmap_idx_n2k(kring, nic_i);
for (n = kring->nr_hwavail; n < lim ; n++) {
while (nm_i != stop_i) {
struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i];
uint32_t rxstat = le32toh(cur_rx->rl_cmdstat);
uint32_t total_len;
@ -226,14 +222,12 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* sync was in re_newbuf() */
bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD);
// sc->rl_ifp->if_ipackets++;
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
if (n != kring->nr_hwavail) {
sc->rl_ldata.rl_rx_prodidx = nic_i;
sc->rl_ifp->if_ipackets += n - kring->nr_hwavail;
kring->nr_hwavail = n;
}
sc->rl_ldata.rl_rx_prodidx = nic_i;
kring->nr_hwtail = nm_i;
kring->nr_kflags &= ~NKR_PENDINTR;
}
@ -241,9 +235,9 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) {
if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@ -272,8 +266,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwavail -= n;
kring->nr_hwcur = cur;
kring->nr_hwcur = head;
bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag,
sc->rl_ldata.rl_rx_list_map,
@ -281,7 +274,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
/* tell userspace that there might be new packets */
ring->avail = kring->nr_hwavail - resvd;
nm_rxsync_finalize(kring);
return 0;
@ -336,36 +329,35 @@ re_netmap_rx_init(struct rl_softc *sc)
struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0);
struct rl_desc *desc = sc->rl_ldata.rl_rx_list;
uint32_t cmdstat;
int i, n, max_avail;
uint32_t nic_i, max_avail;
uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt;
if (!slot)
return;
n = sc->rl_ldata.rl_rx_desc_cnt;
/*
* Userspace owned hwavail packets before the reset,
* so the NIC that last hwavail descriptors of the ring
* are still owned by the driver (and keep one empty).
* Do not release the slots owned by userspace,
* and also keep one empty.
*/
max_avail = n - 1 - na->rx_rings[0].nr_hwavail;
for (i = 0; i < n; i++) {
max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]);
for (nic_i = 0; nic_i < n; nic_i++) {
void *addr;
uint64_t paddr;
int l = netmap_idx_n2k(&na->rx_rings[0], i);
uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i);
addr = PNMB(slot + l, &paddr);
addr = PNMB(slot + nm_i, &paddr);
netmap_reload_map(sc->rl_ldata.rl_rx_mtag,
sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr);
sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr);
bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD);
desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD);
desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
cmdstat = NETMAP_BUF_SIZE;
if (i == n - 1) /* mark the end of ring */
if (nic_i == n - 1) /* mark the end of ring */
cmdstat |= RL_RDESC_CMD_EOR;
if (i < max_avail)
if (nic_i < max_avail)
cmdstat |= RL_RDESC_CMD_OWN;
desc[i].rl_cmdstat = htole32(cmdstat);
desc[nic_i].rl_cmdstat = htole32(cmdstat);
}
}

@ -1,5 +1,5 @@
/*
* Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -141,14 +141,13 @@ ixgbe_netmap_reg(struct netmap_adapter *na, int onoff)
/*
* Reconcile kernel and user view of the transmit ring.
*
* Userspace wants to send packets up to the one before ring->cur,
* All information is in the kring.
* Userspace wants to send packets up to the one before kring->rhead,
* kernel knows kring->nr_hwcur is the first unsent packet.
*
* Here we push packets out (as many as possible), and possibly
* reclaim buffers from previously completed transmission.
*
* ring->avail is not used on input, but it is updated on return.
*
* The caller (netmap) guarantees that there is only one instance
* running at any time. Any interference with other driver
* methods should be handled by the individual drivers.
@ -161,9 +160,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, new_slots;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_txsync_prologue(kring, &new_slots);
u_int const head = kring->rhead;
/*
* interrupts on every tx packet are expensive so request
* them every half ring, or where NS_REPORT is set
@ -175,9 +174,6 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct tx_ring *txr = &adapter->tx_rings[ring_nr];
int reclaim_tx;
if (cur > lim) /* error checking in nm_txsync_prologue() */
return netmap_ring_reinit(kring);
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@ -199,7 +195,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
/*
* If we have packets to send (kring->nr_hwcur != ring->cur)
* If we have packets to send (kring->nr_hwcur != kring->rhead)
* iterate over the netmap ring, fetch length and update
* the corresponding slot in the NIC ring. Some drivers also
* need to update the buffer's physical address in the NIC slot
@ -217,13 +213,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) { /* we have new packets to send */
if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
__builtin_prefetch(&ring->slot[nm_i]);
__builtin_prefetch(&txr->tx_buffers[nic_i]);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@ -262,9 +258,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwcur = cur; /* the saved ring->cur */
/* decrease avail by # of packets sent minus previous ones */
kring->nr_hwavail -= new_slots;
kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
@ -281,7 +275,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
if (flags & NAF_FORCE_RECLAIM) {
reclaim_tx = 1; /* forced reclaim */
} else if (kring->nr_hwavail > 0) {
} else if (!nm_kr_txempty(kring)) {
reclaim_tx = 0; /* have buffers, no reclaim */
} else {
/*
@ -321,21 +315,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i -= kring->nkr_num_slots;
}
if (nic_i != txr->next_to_clean) {
n = (nic_i + lim + 1) - txr->next_to_clean;
if (n > lim)
n -= lim + 1;
/* some tx completed, increment avail */
txr->next_to_clean = nic_i;
kring->nr_hwavail += n;
if (kring->nr_hwavail > lim) {
RD(5, "bad hwavail %d",
kring->nr_hwavail);
return netmap_ring_reinit(kring);
}
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
nm_txsync_finalize(kring, cur);
nm_txsync_finalize(kring);
return 0;
}
@ -347,14 +333,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* The caller guarantees a single invocations, but races against
* the rest of the driver should be handled here.
*
* When called, userspace has released buffers up to
* ring->cur - ring->reserved (last one excluded).
*
* The last interrupt reported kring->nr_hwavail slots available
* after kring->nr_hwcur.
* We must subtract the newly consumed slots (cur - nr_hwcur)
* from nr_hwavail, make the descriptors available for the next reads,
* and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail.
* On call, kring->rhead is the first packet that userspace wants
* to keep, and kring->rcur is the wakeup point.
* The kernel has previously reported packets up to kring->rtail.
*
* If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
* of whether or not we received an interrupt.
@ -367,16 +348,16 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
u_int n, resvd;
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
if (cur > lim)
if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@ -391,8 +372,8 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* and they may differ in case if_init() has been called while
* in netmap mode. For the receive ring we have
*
* nm_i = (kring->nr_hwcur + kring->nr_hwavail) % ring_size
* nic_i = rxr->next_to_check;
* nm_i = kring->nr_hwtail (previous)
* and
* nm_i == (nic_i + kring->nkr_hwofs) % ring_size
*
@ -402,7 +383,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
int crclen = ix_crcstrip ? 0 : 4;
uint16_t slot_flags = kring->nkr_slot_flags;
nic_i = rxr->next_to_check;
nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail)
nm_i = netmap_idx_n2k(kring, nic_i);
for (n = 0; ; n++) {
@ -425,23 +406,23 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
ix_rx_miss_bufs += n;
}
rxr->next_to_check = nic_i;
kring->nr_hwavail += n;
kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
/*
* Second part: skip past packets that userspace has released.
* (kring->nr_hwcur to ring->cur - ring->reserved excluded),
* (kring->nr_hwcur to kring->rhead excluded),
* and make the buffers available for reception.
* As usual nm_i is the index in the netmap ring,
* nic_i is the index in the NIC ring, and
* nm_i == (nic_i + kring->nkr_hwofs) % ring_size
*/
nm_i = kring->nr_hwcur;
if (nm_i != cur) {
if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != cur; n++) {
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@ -464,8 +445,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwavail -= n;
kring->nr_hwcur = cur;
kring->nr_hwcur = head;
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@ -473,12 +453,12 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
nic_i = (nic_i == 0) ? lim : nic_i - 1;
nic_i = nm_prev(nic_i, lim);
IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i);
}
/* tell userspace that there might be new packets */
ring->avail = kring->nr_hwavail - resvd;
nm_rxsync_finalize(kring);
return 0;

(file diff suppressed because it is too large)

@ -1,5 +1,5 @@
/*
* Copyright (C) 2013 Universita` di Pisa. All rights reserved.
* Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -86,21 +86,31 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept)
return 0;
}
/*
* Intercept the packet steering routine in the tx path,
* so that we can decide which queue is used for an mbuf.
* Second argument is non-zero to intercept, 0 to restore.
*
* actually we also need to redirect the if_transmit ?
*
* XXX see if FreeBSD has such a mechanism
*/
void
netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable)
netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = na->ifp;
if (enable) {
na->if_transmit = ifp->if_transmit;
ifp->if_transmit = netmap_transmit;
} else {
ifp->if_transmit = na->if_transmit;
}
}
/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success
* and non-zero on error (which may be packet drops or other errors).
* addr and len identify the netmap buffer, m is the (preallocated)
@ -126,16 +136,16 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
// copy data to the mbuf
m_copyback(m, 0, len, addr);
// inc refcount. We are alone, so we can skip the atomic
atomic_fetchadd_int(m->m_ext.ref_cnt, 1);
m->m_flags |= M_FLOWID;
m->m_pkthdr.flowid = ring_nr;
m->m_pkthdr.rcvif = ifp; /* used for tx notification */
ret = ifp->if_transmit(ifp, m);
ret = NA(ifp)->if_transmit(ifp, m);
return ret;
}
/*
* The following two functions are empty until we have a generic
* way to extract the info from the ifp
@ -147,6 +157,7 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
return 0;
}
void
generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
{
@ -155,6 +166,7 @@ generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
*rxq = 1;
}
void netmap_mitigation_init(struct netmap_generic_adapter *na)
{
ND("called");
@ -167,22 +179,26 @@ void netmap_mitigation_start(struct netmap_generic_adapter *na)
ND("called");
}
void netmap_mitigation_restart(struct netmap_generic_adapter *na)
{
ND("called");
}
int netmap_mitigation_active(struct netmap_generic_adapter *na)
{
ND("called");
return 0;
}
void netmap_mitigation_cleanup(struct netmap_generic_adapter *na)
{
ND("called");
}
/*
* In order to track whether pages are still mapped, we hook into
* the standard cdev_pager and intercept the constructor and
@ -194,6 +210,7 @@ struct netmap_vm_handle_t {
struct netmap_priv_d *priv;
};
static int
netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
@ -218,6 +235,7 @@ netmap_dev_pager_dtor(void *handle)
dev_rel(dev);
}
static int
netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
int prot, vm_page_t *mres)

(file diff suppressed because it is too large)

@ -1,6 +1,6 @@
/*
* Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2013 Universita` di Pisa. All rights reserved.
* Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -53,7 +53,7 @@
#define NM_SELINFO_T struct selinfo
#define MBUF_LEN(m) ((m)->m_pkthdr.len)
#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif)
#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m)
#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m)
#define NM_ATOMIC_T volatile int // XXX ?
/* atomic operations */
@ -76,7 +76,11 @@ struct hrtimer {
#define NM_SELINFO_T wait_queue_head_t
#define MBUF_LEN(m) ((m)->len)
#define MBUF_IFP(m) ((m)->dev)
#define NM_SEND_UP(ifp, m) netif_rx(m)
#define NM_SEND_UP(ifp, m) \
do { \
m->priority = NM_MAGIC_PRIORITY; \
netif_rx(m); \
} while (0)
#define NM_ATOMIC_T volatile long unsigned int
@ -125,9 +129,9 @@ struct hrtimer {
do { \
struct timeval __xxts; \
microtime(&__xxts); \
printf("%03d.%06d %s [%d] " format "\n", \
printf("%03d.%06d [%4d] %-25s " format "\n", \
(int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
__FUNCTION__, __LINE__, ##__VA_ARGS__); \
__LINE__, __FUNCTION__, ##__VA_ARGS__); \
} while (0)
/* rate limited, lps indicates how many per second */
@ -158,15 +162,23 @@ extern NMG_LOCK_T netmap_global_lock;
* a ring across system calls.
*
* nr_hwcur index of the next buffer to refill.
* It corresponds to ring->cur - ring->reserved
* It corresponds to ring->head
* at the time the system call returns.
*
* nr_hwavail the number of slots "owned" by userspace.
* nr_hwavail =:= ring->avail + ring->reserved
* nr_hwtail index of the first buffer owned by the kernel.
* On RX, hwcur->hwtail are receive buffers
* not yet released. hwcur is advanced following
* ring->head, hwtail is advanced on incoming packets,
* and a wakeup is generated when hwtail passes ring->cur
* On TX, hwcur->rcur have been filled by the sender
* but not sent yet to the NIC; rcur->hwtail are available
* for new transmissions, and hwtail->hwcur-1 are pending
* transmissions not yet acknowledged.
*
* The indexes in the NIC and netmap rings are offset by nkr_hwofs slots.
* This is so that, on a reset, buffers owned by userspace are not
* modified by the kernel. In particular:
* RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with
* RX rings: the next empty buffer (hwtail + hwofs) coincides with
* the next empty buffer as known by the hardware (next_to_check or so).
* TX rings: hwcur + hwofs coincides with next_to_send
*
@ -184,44 +196,76 @@ extern NMG_LOCK_T netmap_global_lock;
* from nr_hwlease, advances it, then does the
* copy outside the lock.
* In RX rings (used for VALE ports),
* nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1
* nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1
* In TX rings (used for NIC or host stack ports)
* nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail
* nkr_hwcur <= nkr_hwlease < nkr_hwtail
* nkr_leases array of nkr_num_slots where writers can report
* completion of their block. NR_NOSLOT (~0) indicates
* that the writer has not finished yet
* nkr_lease_idx index of next free slot in nr_leases, to be assigned
*
* The kring is manipulated by txsync/rxsync and generic netmap function.
* q_lock is used to arbitrate access to the kring from within the netmap
* code, and this and other protections guarantee that there is never
* more than 1 concurrent call to txsync or rxsync. So we are free
* to manipulate the kring from within txsync/rxsync without any extra
* locks.
*
* Concurrent rxsync or txsync on the same ring are prevented through
* by nm_kr_lock() which in turn uses nr_busy. This is all we need
* for NIC rings, and for TX rings attached to the host stack.
*
* RX rings attached to the host stack use an mbq (rx_queue) on both
* rxsync_from_host() and netmap_transmit(). The mbq is protected
* by its internal lock.
*
* RX rings attached to the VALE switch are accessed by both sender
* and receiver. They are protected through the q_lock on the RX ring.
*/
struct netmap_kring {
struct netmap_ring *ring;
uint32_t nr_hwcur;
uint32_t nr_hwavail;
uint32_t nr_kflags; /* private driver flags */
int32_t nr_hwreserved;
#define NKR_PENDINTR 0x1 // Pending interrupt.
uint32_t nkr_num_slots;
int32_t nkr_hwofs; /* offset between NIC and netmap ring */
struct netmap_ring *ring;
uint32_t nr_hwcur;
uint32_t nr_hwtail;
/*
* Copies of values in user rings, so we do not need to look
* at the ring (which could be modified). These are set in the
* *sync_prologue()/finalize() routines.
*/
uint32_t rhead;
uint32_t rcur;
uint32_t rtail;
uint32_t nr_kflags; /* private driver flags */
#define NKR_PENDINTR 0x1 // Pending interrupt.
uint32_t nkr_num_slots;
/*
* On a NIC reset, the NIC ring indexes may be reset but the
* indexes in the netmap rings remain the same. nkr_hwofs
* keeps track of the offset between the two.
*/
int32_t nkr_hwofs;
uint16_t nkr_slot_flags; /* initial value for flags */
/* last_reclaim is opaque marker to help reduce the frequency
* of operations such as reclaiming tx buffers. A possible use
* is set it to ticks and do the reclaim only once per tick.
*/
uint64_t last_reclaim;
NM_SELINFO_T si; /* poll/select wait queue */
NM_LOCK_T q_lock; /* protects kring and ring. */
NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */
struct netmap_adapter *na;
/* The following fields are for VALE switch support */
struct nm_bdg_fwd *nkr_ft;
uint32_t *nkr_leases;
#define NR_NOSLOT ((uint32_t)~0)
uint32_t nkr_hwlease;
uint32_t nkr_lease_idx;
uint32_t *nkr_leases;
#define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */
uint32_t nkr_hwlease;
uint32_t nkr_lease_idx;
NM_SELINFO_T si; /* poll/select wait queue */
NM_LOCK_T q_lock; /* protects kring and ring. */
NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */
volatile int nkr_stopped;
volatile int nkr_stopped; // XXX what for ?
/* support for adapters without native netmap support.
* On tx rings we preallocate an array of tx buffers
@ -230,8 +274,11 @@ struct netmap_kring {
* XXX who writes to the rx queue ?
*/
struct mbuf **tx_pool;
u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */
struct mbq rx_queue; /* A queue for intercepted rx mbufs. */
// u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */
struct mbq rx_queue; /* intercepted rx mbufs. */
uint32_t ring_id; /* debugging */
char name[64]; /* diagnostic */
} __attribute__((__aligned__(64)));
@ -243,6 +290,15 @@ nm_next(uint32_t i, uint32_t lim)
return unlikely (i == lim) ? 0 : i + 1;
}
/* return the previous index, with wraparound */
static inline uint32_t
nm_prev(uint32_t i, uint32_t lim)
{
return unlikely (i == 0) ? lim : i - 1;
}
/*
*
* Here is the layout for the Rx and Tx rings.
@ -253,36 +309,36 @@ nm_next(uint32_t i, uint32_t lim)
| | | |
|XXX free slot XXX| |XXX free slot XXX|
+-----------------+ +-----------------+
| |<-hwcur | |<-hwcur
| reserved h | | (ready |
+----------- w -+ | to be |
cur->| a | | sent) h |
| v | +---------- w |
| a | cur->| (being a |
| i | | prepared) v |
| avail l | | a |
+-----------------+ + a ------ i +
| | ... | v l |<-hwlease
| (being | ... | a | ...
| prepared) | ... | i | ...
+-----------------+ ... | l | ...
| |<-hwlease +-----------------+
head->| owned by user |<-hwcur | not sent to nic |<-hwcur
| | | yet |
+-----------------+ | |
cur->| available to | | |
| user, not read | +-----------------+
| yet | cur->| (being |
| | | prepared) |
| | | |
+-----------------+ + ------ +
tail->| |<-hwtail | |<-hwlease
| (being | ... | | ...
| prepared) | ... | | ...
+-----------------+ ... | | ...
| |<-hwlease +-----------------+
| | tail->| |<-hwtail
| | | |
| | | |
| | | |
+-----------------+ +-----------------+
* The cur/avail (user view) and hwcur/hwavail (kernel view)
* The cur/tail (user view) and hwcur/hwtail (kernel view)
* are used in the normal operation of the card.
*
* When a ring is the output of a switch port (Rx ring for
* a VALE port, Tx ring for the host stack or NIC), slots
* are reserved in blocks through 'hwlease' which points
* to the next unused slot.
* On an Rx ring, hwlease is always after hwavail,
* and completions cause avail to advance.
* On a Tx ring, hwlease is always between cur and hwavail,
* On an Rx ring, hwlease is always after hwtail,
* and completions cause hwtail to advance.
* On a Tx ring, hwlease is always between cur and hwtail,
* and completions cause cur to advance.
*
* nm_kr_space() returns the maximum number of slots that
@ -294,7 +350,6 @@ nm_next(uint32_t i, uint32_t lim)
enum txrx { NR_RX = 0, NR_TX = 1 };
/*
@ -349,6 +404,7 @@ struct netmap_adapter {
*/
struct netmap_kring *tx_rings; /* array of TX rings. */
struct netmap_kring *rx_rings; /* array of RX rings. */
void *tailroom; /* space below the rings array */
/* (used for leases) */
@ -360,11 +416,38 @@ struct netmap_adapter {
*/
int (*if_transmit)(struct ifnet *, struct mbuf *);
/* copy of if_input for netmap_send_up() */
void (*if_input)(struct ifnet *, struct mbuf *);
/* references to the ifnet and device routines, used by
* the generic netmap functions.
*/
struct ifnet *ifp; /* adapter is ifp->if_softc */
/*---- callbacks for this netmap adapter -----*/
/*
* nm_dtor() is the cleanup routine called when destroying
* the adapter.
*
* nm_register() is called on NIOCREGIF and close() to enter
* or exit netmap mode on the NIC
*
* nm_txsync() pushes packets to the underlying hw/switch
*
* nm_rxsync() collects packets from the underlying hw/switch
*
* nm_config() returns configuration information from the OS
*
* nm_krings_create() XXX
*
* nm_krings_delete() XXX
*
* nm_notify() is used to act after data have become available.
* For hw devices this is typically a selwakeup(),
* but for NIC/host ports attached to a switch (or vice-versa)
* we also need to invoke the 'txsync' code downstream.
*/
/* private cleanup */
void (*nm_dtor)(struct netmap_adapter *);
@ -403,6 +486,7 @@ struct netmap_adapter {
void *na_private;
};
/*
* If the NIC is owned by the kernel
* (i.e., bridge), neither another bridge nor user can use it;
@ -433,13 +517,15 @@ struct netmap_vp_adapter { /* VALE software port */
u_int offset; /* Offset of ethernet header for each packet. */
};
struct netmap_hw_adapter { /* physical device */
struct netmap_adapter up;
struct net_device_ops nm_ndo; // XXX linux only
};
struct netmap_generic_adapter { /* non-native device */
struct netmap_generic_adapter { /* emulated device */
struct netmap_hw_adapter up;
/* Pointer to a previously used netmap adapter. */
@ -455,16 +541,20 @@ struct netmap_generic_adapter { /* non-native device */
struct hrtimer mit_timer;
int mit_pending;
#ifdef linux
netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
#endif
};
#ifdef WITH_VALE
/* bridge wrapper for non VALE ports. It is used to connect real devices to the bridge.
/*
* Bridge wrapper for non VALE ports attached to a VALE switch.
*
* The real device must already have its own netmap adapter (hwna). The
* bridge wrapper and the hwna adapter share the same set of netmap rings and
* buffers, but they have two separate sets of krings descriptors, with tx/rx
* meanings swapped:
* The real device must already have its own netmap adapter (hwna).
* The bridge wrapper and the hwna adapter share the same set of
* netmap rings and buffers, but they have two separate sets of
* krings descriptors, with tx/rx meanings swapped:
*
* netmap
* bwrap krings rings krings hwna
@ -478,23 +568,28 @@ struct netmap_generic_adapter { /* non-native device */
* | | +------+ +-----+ +------+ | |
* +------+ +------+
*
* - packets coming from the bridge go to the brwap rx rings, which are also the
* hwna tx rings. The bwrap notify callback will then complete the hwna tx
* (see netmap_bwrap_notify).
* - packets coming from the outside go to the hwna rx rings, which are also the
* bwrap tx rings. The (overwritten) hwna notify method will then complete
* the bridge tx (see netmap_bwrap_intr_notify).
* - packets coming from the bridge go to the bwrap rx rings,
* which are also the hwna tx rings. The bwrap notify callback
* will then complete the hwna tx (see netmap_bwrap_notify).
*
* The bridge wrapper may optionally connect the hwna 'host' rings to the
* bridge. This is done by using a second port in the bridge and connecting it
* to the 'host' netmap_vp_adapter contained in the netmap_bwrap_adapter.
* The brwap host adapter cross-links the hwna host rings in the same way as shown above.
* - packets coming from the outside go to the hwna rx rings,
* which are also the bwrap tx rings. The (overwritten) hwna
* notify method will then complete the bridge tx
* (see netmap_bwrap_intr_notify).
*
* - packets coming from the bridge and directed to host stack are handled by the
* bwrap host notify callback (see netmap_bwrap_host_notify)
* - packets coming from the host stack are still handled by the overwritten
* hwna notify callback (netmap_bwrap_intr_notify), but are diverted to the
* host adapter depending on the ring number.
* The bridge wrapper may optionally connect the hwna 'host' rings
* to the bridge. This is done by using a second port in the
* bridge and connecting it to the 'host' netmap_vp_adapter
* contained in the netmap_bwrap_adapter. The bwrap host adapter
* cross-links the hwna host rings in the same way as shown above.
*
* - packets coming from the bridge and directed to the host stack
* are handled by the bwrap host notify callback
* (see netmap_bwrap_host_notify)
*
* - packets coming from the host stack are still handled by the
* overwritten hwna notify callback (netmap_bwrap_intr_notify),
* but are diverted to the host adapter depending on the ring number.
*
*/
struct netmap_bwrap_adapter {
@ -505,103 +600,39 @@ struct netmap_bwrap_adapter {
/* backup of the hwna notify callback */
int (*save_notify)(struct netmap_adapter *,
u_int ring, enum txrx, int flags);
/* When we attach a physical interface to the bridge, we
/*
* When we attach a physical interface to the bridge, we
* allow the controlling process to terminate, so we need
* a place to store the netmap_priv_d data structure.
* This is only done when physical interfaces are attached to a bridge.
* This is only done when physical interfaces
* are attached to a bridge.
*/
struct netmap_priv_d *na_kpriv;
};
/*
* Available space in the ring. Only used in VALE code
*/
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
int space;
#endif /* WITH_VALE */
/* return slots reserved to rx clients; used in drivers */
static inline uint32_t
nm_kr_rxspace(struct netmap_kring *k)
{
int space = k->nr_hwtail - k->nr_hwcur;
if (space < 0)
space += k->nkr_num_slots;
ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail);
if (is_rx) {
int busy = k->nkr_hwlease - k->nr_hwcur + k->nr_hwreserved;
if (busy < 0)
busy += k->nkr_num_slots;
space = k->nkr_num_slots - 1 - busy;
} else {
space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease;
if (space < 0)
space += k->nkr_num_slots;
}
#if 0
// sanity check
if (k->nkr_hwlease >= k->nkr_num_slots ||
k->nr_hwcur >= k->nkr_num_slots ||
k->nr_hwavail >= k->nkr_num_slots ||
busy < 0 ||
busy >= k->nkr_num_slots) {
D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease,
k->nkr_lease_idx, k->nkr_num_slots);
}
#endif
return space;
}
/* make a lease on the kring for N positions. return the
* lease index
*/
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
/* True if no space in the tx ring. only valid after txsync_prologue */
static inline int
nm_kr_txempty(struct netmap_kring *kring)
{
uint32_t lim = k->nkr_num_slots - 1;
uint32_t lease_idx = k->nkr_lease_idx;
k->nkr_leases[lease_idx] = NR_NOSLOT;
k->nkr_lease_idx = nm_next(lease_idx, lim);
if (n > nm_kr_space(k, is_rx)) {
D("invalid request for %d slots", n);
panic("x");
}
/* XXX verify that there are n slots */
k->nkr_hwlease += n;
if (k->nkr_hwlease > lim)
k->nkr_hwlease -= lim + 1;
if (k->nkr_hwlease >= k->nkr_num_slots ||
k->nr_hwcur >= k->nkr_num_slots ||
k->nr_hwavail >= k->nkr_num_slots ||
k->nkr_lease_idx >= k->nkr_num_slots) {
D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d",
k->na->ifp->if_xname,
k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease,
k->nkr_lease_idx, k->nkr_num_slots);
}
return lease_idx;
}
#endif /* WITH_VALE */
/* return update position */
static inline uint32_t
nm_kr_rxpos(struct netmap_kring *k)
{
uint32_t pos = k->nr_hwcur + k->nr_hwavail;
if (pos >= k->nkr_num_slots)
pos -= k->nkr_num_slots;
#if 0
if (pos >= k->nkr_num_slots ||
k->nkr_hwlease >= k->nkr_num_slots ||
k->nr_hwcur >= k->nkr_num_slots ||
k->nr_hwavail >= k->nkr_num_slots ||
k->nkr_lease_idx >= k->nkr_num_slots) {
D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease,
k->nkr_lease_idx, k->nkr_num_slots);
}
#endif
return pos;
return kring->rcur == kring->nr_hwtail;
}
@ -613,11 +644,13 @@ nm_kr_rxpos(struct netmap_kring *k)
#define NM_KR_BUSY 1
#define NM_KR_STOPPED 2
static __inline void nm_kr_put(struct netmap_kring *kr)
{
NM_ATOMIC_CLEAR(&kr->nr_busy);
}
static __inline int nm_kr_tryget(struct netmap_kring *kr)
{
/* check a first time without taking the lock
@ -640,7 +673,7 @@ static __inline int nm_kr_tryget(struct netmap_kring *kr)
/*
* The following are support routines used by individual drivers to
* The following functions are used by individual drivers to
* support netmap operation.
*
* netmap_attach() initializes a struct netmap_adapter, allocating the
@ -666,7 +699,17 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na,
enum txrx tx, u_int n, u_int new_cur);
int netmap_ring_reinit(struct netmap_kring *);
/* set/clear native flags. XXX maybe also if_transmit ? */
/* default functions to handle rx/tx interrupts */
int netmap_rx_irq(struct ifnet *, u_int, u_int *);
#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
void netmap_disable_all_rings(struct ifnet *);
void netmap_enable_all_rings(struct ifnet *);
void netmap_disable_ring(struct netmap_kring *kr);
/* set/clear native flags and if_transmit/netdev_ops */
static inline void
nm_set_native_flags(struct netmap_adapter *na)
{
@ -685,6 +728,7 @@ nm_set_native_flags(struct netmap_adapter *na)
#endif
}
static inline void
nm_clear_native_flags(struct netmap_adapter *na)
{
@ -701,36 +745,58 @@ nm_clear_native_flags(struct netmap_adapter *na)
#endif
}
/*
* validates parameters in the ring/kring, returns a value for cur,
* and the 'new_slots' value in the argument.
* If any error, returns cur > lim to force a reinit.
*/
u_int nm_txsync_prologue(struct netmap_kring *, u_int *);
/*
* validates parameters in the ring/kring, returns a value for cur,
* validates parameters in the ring/kring, returns a value for head
* If any error, returns ring_size to force a reinit.
*/
uint32_t nm_txsync_prologue(struct netmap_kring *);
/*
* validates parameters in the ring/kring, returns a value for head,
* and the 'reserved' value in the argument.
* If any error, returns cur > lim to force a reinit.
* If any error, returns ring_size lim to force a reinit.
*/
u_int nm_rxsync_prologue(struct netmap_kring *, u_int *);
uint32_t nm_rxsync_prologue(struct netmap_kring *);
/*
* update kring and ring at the end of txsync
* update kring and ring at the end of txsync.
*/
static inline void
nm_txsync_finalize(struct netmap_kring *kring, u_int cur)
nm_txsync_finalize(struct netmap_kring *kring)
{
/* recompute hwreserved */
kring->nr_hwreserved = cur - kring->nr_hwcur;
if (kring->nr_hwreserved < 0)
kring->nr_hwreserved += kring->nkr_num_slots;
/* update avail and reserved to what the kernel knows */
kring->ring->avail = kring->nr_hwavail;
kring->ring->reserved = kring->nr_hwreserved;
/* update ring head/tail to what the kernel knows */
kring->ring->tail = kring->rtail = kring->nr_hwtail;
kring->ring->head = kring->rhead = kring->nr_hwcur;
/* note, head/rhead/hwcur might be behind cur/rcur
* if no carrier
*/
ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
kring->name, kring->nr_hwcur, kring->nr_hwtail,
kring->rhead, kring->rcur, kring->rtail);
}
/*
* update kring and ring at the end of rxsync
*/
static inline void
nm_rxsync_finalize(struct netmap_kring *kring)
{
/* tell userspace that there might be new packets */
//struct netmap_ring *ring = kring->ring;
ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail,
kring->nr_hwtail);
kring->ring->tail = kring->rtail = kring->nr_hwtail;
/* make a copy of the state for next round */
kring->rhead = kring->ring->head;
kring->rcur = kring->ring->cur;
}
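For reference, a minimal sketch (not part of this commit; the driver name and the
hardware access are hypothetical placeholders) of how a native driver txsync is
expected to combine the prologue/finalize helpers under the head/cur/tail API:

        static int
        foo_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
        {
                struct netmap_kring *kring = &na->tx_rings[ring_nr];
                u_int nm_i;     /* scan index in the netmap ring */
                u_int const lim = kring->nkr_num_slots - 1;
                u_int const head = nm_txsync_prologue(kring);

                if (head > lim)         /* inconsistent ring, force a reinit */
                        return netmap_ring_reinit(kring);

                /* first part: push out slots from hwcur up to (excluded) head */
                for (nm_i = kring->nr_hwcur; nm_i != head;
                    nm_i = nm_next(nm_i, lim)) {
                        struct netmap_slot *slot = &kring->ring->slot[nm_i];
                        /* ... program a NIC tx descriptor from
                         * slot->buf_idx / slot->len ... */
                        slot->flags &= ~NS_BUF_CHANGED;
                }
                kring->nr_hwcur = head;

                /* second part: reclaim completed buffers and advance
                 * kring->nr_hwtail accordingly (hardware specific).
                 */

                nm_txsync_finalize(kring); /* export hwcur/hwtail to the ring */
                return 0;
        }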
/* check/fix address and len in tx rings */
#if 1 /* debug version */
#define NM_CHECK_ADDR_LEN(_a, _l) do { \
@ -755,6 +821,8 @@ nm_txsync_finalize(struct netmap_kring *kring, u_int cur)
int netmap_update_config(struct netmap_adapter *na);
int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom);
void netmap_krings_delete(struct netmap_adapter *na);
int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
@ -766,10 +834,13 @@ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg);
int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na);
#ifdef WITH_VALE
/*
* The following bridge-related interfaces are used by other kernel modules
* In the version that only supports unicast or broadcast, the lookup
* The following bridge-related functions are used by other
* kernel modules.
*
* VALE only supports unicast or broadcast. The lookup
* function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports,
* NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown.
* XXX in practice "unknown" might be handled same as broadcast.
@ -799,8 +870,6 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
/* Various prototypes */
int netmap_poll(struct cdev *dev, int events, struct thread *td);
int netmap_init(void);
void netmap_fini(void);
int netmap_get_memory(struct netmap_priv_d* p);
@ -811,7 +880,8 @@ int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct t
/* netmap_adapter creation/destruction */
#define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie")
#define NM_DEBUG_PUTGET 1
// #define NM_DEBUG_PUTGET 1
#ifdef NM_DEBUG_PUTGET
@ -844,12 +914,15 @@ int netmap_adapter_put(struct netmap_adapter *na);
#endif /* !NM_DEBUG_PUTGET */
/*
* module variables
*/
extern u_int netmap_buf_size;
#define NETMAP_BUF_SIZE netmap_buf_size // XXX remove
extern int netmap_mitigate;
extern int netmap_mitigate; // XXX not really used
extern int netmap_no_pendintr;
extern u_int netmap_total_buffers;
extern char *netmap_buffer_base;
extern u_int netmap_total_buffers; // global allocator
extern char *netmap_buffer_base; // global allocator
extern int netmap_verbose; // XXX debugging
enum { /* verbose flags */
NM_VERB_ON = 1, /* generic verbose */
@ -908,7 +981,7 @@ extern int netmap_generic_ringsize;
#ifdef __FreeBSD__
/* Callback invoked by the dma machinery after a successfull dmamap_load */
/* Callback invoked by the dma machinery after a successful dmamap_load */
static void netmap_dmamap_cb(__unused void *arg,
__unused bus_dma_segment_t * segs, __unused int nseg, __unused int error)
{
@ -1053,31 +1126,27 @@ BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot)
lut[0].vaddr : lut[i].vaddr;
}
/* default functions to handle rx/tx interrupts */
int netmap_rx_irq(struct ifnet *, u_int, u_int *);
#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
void netmap_txsync_to_host(struct netmap_adapter *na);
void netmap_disable_all_rings(struct ifnet *);
void netmap_enable_all_rings(struct ifnet *);
void netmap_disable_ring(struct netmap_kring *kr);
/* Structure associated to each thread which registered an interface.
/*
* Structure associated to each thread which registered an interface.
*
* The first 4 fields of this structure are written by NIOCREGIF and
* read by poll() and NIOC?XSYNC.
* There is low contention among writers (actually, a correct user program
* should have no contention among writers) and among writers and readers,
* so we use a single global lock to protect the structure initialization.
* Since initialization involves the allocation of memory, we reuse the memory
* allocator lock.
*
* There is low contention among writers (a correct user program
* should have none) and among writers and readers, so we use a
* single global lock to protect the structure initialization;
* since initialization involves the allocation of memory,
* we reuse the memory allocator lock.
*
* Read access to the structure is lock free. Readers must check that
* np_nifp is not NULL before using the other fields.
* If np_nifp is NULL initialization has not been performed, so they should
* return an error to userlevel.
* If np_nifp is NULL initialization has not been performed,
* so they should return an error to userspace.
*
* The ref_done field is used to regulate access to the refcount in the
* memory allocator. The refcount must be incremented at most once for
@ -1091,38 +1160,29 @@ struct netmap_priv_d {
struct netmap_if * volatile np_nifp; /* netmap if descriptor. */
struct netmap_adapter *np_na;
int np_ringid; /* from the ioctl */
u_int np_qfirst, np_qlast; /* range of rings to scan */
uint16_t np_txpoll;
int np_ringid; /* from the ioctl */
u_int np_qfirst, np_qlast; /* range of rings to scan */
uint16_t np_txpoll;
struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */
/* np_refcount is only used on FreeBSD */
int np_refcount; /* use with NMG_LOCK held */
int np_refcount; /* use with NMG_LOCK held */
};
/*
* generic netmap emulation for devices that do not have
* native netmap support.
* XXX generic_netmap_register() is only exported to implement
* nma_is_generic().
*/
int generic_netmap_register(struct netmap_adapter *na, int enable);
int generic_netmap_attach(struct ifnet *ifp);
int netmap_catch_rx(struct netmap_adapter *na, int intercept);
void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
void netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable);
void netmap_catch_tx(struct netmap_generic_adapter *na, int enable);
int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
static __inline int
nma_is_generic(struct netmap_adapter *na)
{
return na->nm_register == generic_netmap_register;
}
/*
* netmap_mitigation API. This is used by the generic adapter
* to reduce the number of interrupt requests/selwakeup
@ -1134,6 +1194,4 @@ void netmap_mitigation_restart(struct netmap_generic_adapter *na);
int netmap_mitigation_active(struct netmap_generic_adapter *na);
void netmap_mitigation_cleanup(struct netmap_generic_adapter *na);
// int generic_timer_handler(struct hrtimer *t);
#endif /* _NET_NETMAP_KERN_H_ */

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2013 Vincenzo Maffione. All rights reserved.
* Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -47,17 +47,20 @@ static inline void __mbq_init(struct mbq *q)
q->count = 0;
}
void mbq_safe_init(struct mbq *q)
{
mtx_init(&q->lock, "mbq", NULL, MTX_SPIN);
__mbq_init(q);
}
void mbq_init(struct mbq *q)
{
__mbq_init(q);
}
static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m)
{
m->m_nextpkt = NULL;
@ -70,6 +73,7 @@ static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m)
q->count++;
}
void mbq_safe_enqueue(struct mbq *q, struct mbuf *m)
{
mtx_lock(&q->lock);
@ -77,11 +81,13 @@ void mbq_safe_enqueue(struct mbq *q, struct mbuf *m)
mtx_unlock(&q->lock);
}
void mbq_enqueue(struct mbq *q, struct mbuf *m)
{
__mbq_enqueue(q, m);
}
static inline struct mbuf *__mbq_dequeue(struct mbq *q)
{
struct mbuf *ret = NULL;
@ -99,6 +105,7 @@ static inline struct mbuf *__mbq_dequeue(struct mbq *q)
return ret;
}
struct mbuf *mbq_safe_dequeue(struct mbq *q)
{
struct mbuf *ret;
@ -110,11 +117,13 @@ struct mbuf *mbq_safe_dequeue(struct mbq *q)
return ret;
}
struct mbuf *mbq_dequeue(struct mbq *q)
{
return __mbq_dequeue(q);
}
/* XXX seems pointless to have a generic purge */
static void __mbq_purge(struct mbq *q, int safe)
{
@ -130,16 +139,19 @@ static void __mbq_purge(struct mbq *q, int safe)
}
}
void mbq_purge(struct mbq *q)
{
__mbq_purge(q, 0);
}
void mbq_safe_purge(struct mbq *q)
{
__mbq_purge(q, 1);
}
void mbq_safe_destroy(struct mbq *q)
{
mtx_destroy(&q->lock);
@ -149,4 +161,3 @@ void mbq_safe_destroy(struct mbq *q)
void mbq_destroy(struct mbq *q)
{
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2013 Vincenzo Maffione. All rights reserved.
* Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
* Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -506,7 +506,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj
p->r_objsize = objsize;
#define MAX_CLUSTSIZE (1<<17)
#define LINE_ROUND 64
#define LINE_ROUND NM_CACHE_ALIGN // 64
if (objsize >= MAX_CLUSTSIZE) {
/* we could do it but there is no point */
D("unsupported allocation for %d bytes", objsize);
@ -960,13 +960,15 @@ netmap_mem_rings_create(struct netmap_adapter *na)
ND("txring[%d] at %p ofs %d", i, ring);
kring->ring = ring;
*(uint32_t *)(uintptr_t)&ring->num_slots = ndesc;
*(ssize_t *)(uintptr_t)&ring->buf_ofs =
*(int64_t *)(uintptr_t)&ring->buf_ofs =
(na->nm_mem->pools[NETMAP_IF_POOL].memtotal +
na->nm_mem->pools[NETMAP_RING_POOL].memtotal) -
netmap_ring_offset(na->nm_mem, ring);
ring->avail = kring->nr_hwavail;
ring->cur = kring->nr_hwcur;
/* copy values from kring */
ring->head = kring->rhead;
ring->cur = kring->rcur;
ring->tail = kring->rtail;
*(uint16_t *)(uintptr_t)&ring->nr_buf_size =
NETMAP_BDG_BUF_SIZE(na->nm_mem);
ND("initializing slots for txring");
@ -989,13 +991,15 @@ netmap_mem_rings_create(struct netmap_adapter *na)
kring->ring = ring;
*(uint32_t *)(uintptr_t)&ring->num_slots = ndesc;
*(ssize_t *)(uintptr_t)&ring->buf_ofs =
*(int64_t *)(uintptr_t)&ring->buf_ofs =
(na->nm_mem->pools[NETMAP_IF_POOL].memtotal +
na->nm_mem->pools[NETMAP_RING_POOL].memtotal) -
netmap_ring_offset(na->nm_mem, ring);
ring->cur = kring->nr_hwcur;
ring->avail = kring->nr_hwavail;
/* copy values from kring */
ring->head = kring->rhead;
ring->cur = kring->rcur;
ring->tail = kring->rtail;
*(int *)(uintptr_t)&ring->nr_buf_size =
NETMAP_BDG_BUF_SIZE(na->nm_mem);
ND("initializing slots for rxring[%d]", i);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
* Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2013 Universita` di Pisa. All rights reserved.
* Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -250,44 +250,6 @@ struct nm_bridge {
struct nm_bridge nm_bridges[NM_BRIDGES];
/*
* A few function to tell which kind of port are we using.
* XXX should we hold a lock ?
*
* nma_is_vp() virtual port
* nma_is_host() port connected to the host stack
* nma_is_hw() port connected to a NIC
* nma_is_generic() generic netmap adapter XXX stop this madness
*/
static __inline int
nma_is_vp(struct netmap_adapter *na)
{
return na->nm_register == bdg_netmap_reg;
}
static __inline int
nma_is_host(struct netmap_adapter *na)
{
return na->nm_register == NULL;
}
static __inline int
nma_is_hw(struct netmap_adapter *na)
{
/* In case of sw adapter, nm_register is NULL */
return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na);
}
static __inline int
nma_is_bwrap(struct netmap_adapter *na)
{
return na->nm_register == netmap_bwrap_register;
}
/*
* this is a slightly optimized copy routine which rounds
* to multiple of 64 bytes and is often faster than dealing
@ -318,7 +280,6 @@ pkt_copy(void *_src, void *_dst, int l)
}
/*
* locate a bridge among the existing ones.
* MUST BE CALLED WITH NMG_LOCK()
@ -393,8 +354,8 @@ nm_free_bdgfwd(struct netmap_adapter *na)
struct netmap_kring *kring;
NMG_LOCK_ASSERT();
nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
nrings = na->num_tx_rings;
kring = na->tx_rings;
for (i = 0; i < nrings; i++) {
if (kring[i].nkr_ft) {
free(kring[i].nkr_ft, M_DEVBUF);
@ -502,6 +463,7 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
}
}
static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
@ -520,6 +482,16 @@ netmap_adapter_vp_dtor(struct netmap_adapter *na)
na->ifp = NULL;
}
/* Try to get a reference to a netmap adapter attached to a VALE switch.
* If the adapter is found (or is created), this function returns 0, a
* non NULL pointer is returned into *na, and the caller holds a
* reference to the adapter.
* If an adapter is not found, then no reference is grabbed and the
* function returns an error code, or 0 if there is just a VALE prefix
* mismatch. Therefore the caller holds a reference when
* (*na != NULL && return == 0).
*/
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
@ -688,18 +660,12 @@ nm_bdg_attach(struct nmreq *nmr)
return ENOMEM;
NMG_LOCK();
/* XXX probably netmap_get_bdg_na() */
error = netmap_get_na(nmr, &na, 1 /* create if not exists */);
error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
if (error) /* no device, or another bridge or user owns the device */
goto unlock_exit;
/* netmap_get_na() sets na_bdg if this is a physical interface
* that we can attach to a switch.
*/
if (!nma_is_bwrap(na)) {
/* got reference to a virtual port or direct access to a NIC.
* perhaps specified no bridge prefix or wrong NIC name
*/
if (na == NULL) { /* VALE prefix missing */
error = EINVAL;
goto unref_exit;
goto unlock_exit;
}
if (na->active_fds > 0) { /* already registered */
@ -727,6 +693,7 @@ nm_bdg_attach(struct nmreq *nmr)
return error;
}
static int
nm_bdg_detach(struct nmreq *nmr)
{
@ -736,17 +703,15 @@ nm_bdg_detach(struct nmreq *nmr)
int last_instance;
NMG_LOCK();
error = netmap_get_na(nmr, &na, 0 /* don't create */);
error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
if (error) { /* no device, or another bridge or user owns the device */
goto unlock_exit;
}
if (!nma_is_bwrap(na)) {
/* got reference to a virtual port or direct access to a NIC.
* perhaps specified no bridge's prefix or wrong NIC's name
*/
if (na == NULL) { /* VALE prefix missing */
error = EINVAL;
goto unref_exit;
goto unlock_exit;
}
bna = (struct netmap_bwrap_adapter *)na;
if (na->active_fds == 0) { /* not registered */
@ -890,12 +855,13 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
case NETMAP_BDG_OFFSET:
NMG_LOCK();
error = netmap_get_bdg_na(nmr, &na, 0);
if (!error) {
if (na && !error) {
vpna = (struct netmap_vp_adapter *)na;
if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
vpna->offset = nmr->nr_arg1;
D("Using offset %d for %p", vpna->offset, vpna);
netmap_adapter_put(na);
}
NMG_UNLOCK();
break;
@ -947,6 +913,7 @@ netmap_vp_krings_create(struct netmap_adapter *na)
return 0;
}
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
@ -1027,10 +994,6 @@ nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
}
/*
*---- support for virtual bridge -----
*/
/* ----- FreeBSD if_bridge hash function ------- */
/*
@ -1052,6 +1015,7 @@ do { \
c -= a; c -= b; c ^= (b >> 15); \
} while (/*CONSTCOND*/0)
static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
@ -1143,6 +1107,77 @@ netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
}
/*
* Available space in the ring. Only used in VALE code
* and only with is_rx = 1
*/
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
int space;
if (is_rx) {
int busy = k->nkr_hwlease - k->nr_hwcur;
if (busy < 0)
busy += k->nkr_num_slots;
space = k->nkr_num_slots - 1 - busy;
} else {
/* XXX never used in this branch */
space = k->nr_hwtail - k->nkr_hwlease;
if (space < 0)
space += k->nkr_num_slots;
}
#if 0
// sanity check
if (k->nkr_hwlease >= k->nkr_num_slots ||
k->nr_hwcur >= k->nkr_num_slots ||
k->nr_tail >= k->nkr_num_slots ||
busy < 0 ||
busy >= k->nkr_num_slots) {
D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
k->nkr_lease_idx, k->nkr_num_slots);
}
#endif
return space;
}
/* make a lease on the kring for N positions. return the
* lease index
* XXX only used in VALE code and with is_rx = 1
*/
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
uint32_t lim = k->nkr_num_slots - 1;
uint32_t lease_idx = k->nkr_lease_idx;
k->nkr_leases[lease_idx] = NR_NOSLOT;
k->nkr_lease_idx = nm_next(lease_idx, lim);
if (n > nm_kr_space(k, is_rx)) {
D("invalid request for %d slots", n);
panic("x");
}
/* XXX verify that there are n slots */
k->nkr_hwlease += n;
if (k->nkr_hwlease > lim)
k->nkr_hwlease -= lim + 1;
if (k->nkr_hwlease >= k->nkr_num_slots ||
k->nr_hwcur >= k->nkr_num_slots ||
k->nr_hwtail >= k->nkr_num_slots ||
k->nkr_lease_idx >= k->nkr_num_slots) {
D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
k->na->ifp->if_xname,
k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
k->nkr_lease_idx, k->nkr_num_slots);
}
return lease_idx;
}
/*
* This flush routine supports only unicast and broadcast but a large
* number of ports, and lets us replace the learn and dispatch functions.
@ -1357,28 +1392,30 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
dst = BDG_NMB(&dst_na->up, slot);
if (unlikely(fix_mismatch)) {
if (na->offset > dst_na->offset) {
src += na->offset - dst_na->offset;
copy_len -= na->offset - dst_na->offset;
dst_len = copy_len;
} else {
bzero(dst, dst_na->offset - na->offset);
dst_len += dst_na->offset - na->offset;
dst += dst_na->offset - na->offset;
}
/* fix the first fragment only */
fix_mismatch = 0;
/* completely skip an header only fragment */
if (copy_len == 0) {
ft_p++;
continue;
}
/* We are processing the first fragment
* and there is a mismatch between source
* and destination offsets. Create a zeroed
* header for the destination, independently
* of the source header length and content.
*/
src += na->offset;
copy_len -= na->offset;
bzero(dst, dst_na->offset);
dst += dst_na->offset;
dst_len = dst_na->offset + copy_len;
/* fix the first fragment only */
fix_mismatch = 0;
/* Here it could be copy_len == dst_len == 0,
* and so a zero length fragment is passed.
*/
}
ND("send [%d] %d(%d) bytes at %s:%d",
i, (int)copy_len, (int)dst_len,
NM_IFPNAME(dst_ifp), j);
/* round to a multiple of 64 */
copy_len = (copy_len + 63) & ~63;
ND("send %d %d bytes at %s:%d",
i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j);
if (ft_p->ft_flags & NS_INDIRECT) {
if (copyin(src, dst, copy_len)) {
// invalid user pointer, pretend len is 0
@ -1426,7 +1463,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
}
p[lease_idx] = j; /* report I am done */
update_pos = nm_kr_rxpos(kring);
update_pos = kring->nr_hwtail;
if (my_start == update_pos) {
/* all slots before my_start have been reported,
@ -1443,15 +1480,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
* means there are new buffers to report
*/
if (likely(j != my_start)) {
uint32_t old_avail = kring->nr_hwavail;
kring->nr_hwavail = (j >= kring->nr_hwcur) ?
j - kring->nr_hwcur :
j + lim + 1 - kring->nr_hwcur;
if (kring->nr_hwavail < old_avail) {
D("avail shrink %d -> %d",
old_avail, kring->nr_hwavail);
}
kring->nr_hwtail = j;
dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
still_locked = 0;
mtx_unlock(&kring->q_lock);
@ -1471,35 +1500,32 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
return 0;
}
static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int j, k, lim = kring->nkr_num_slots - 1;
k = ring->cur;
if (k > lim)
return netmap_ring_reinit(kring);
u_int done;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = kring->rcur;
if (bridge_batch <= 0) { /* testing only */
j = k; // used all
done = cur; // used all
goto done;
}
if (bridge_batch > NM_BDG_BATCH)
bridge_batch = NM_BDG_BATCH;
j = nm_bdg_preflush(na, ring_nr, kring, k);
if (j != k)
D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
/* k-j modulo ring size is the number of slots processed */
if (k < j)
k += kring->nkr_num_slots;
kring->nr_hwavail = lim - (k - j);
done = nm_bdg_preflush(na, ring_nr, kring, cur);
done:
kring->nr_hwcur = j;
ring->avail = kring->nr_hwavail;
if (done != cur)
D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
/*
* packets between 'done' and 'cur' are left unsent.
*/
kring->nr_hwcur = done;
kring->nr_hwtail = nm_prev(done, lim);
nm_txsync_finalize(kring);
if (netmap_verbose)
D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
return 0;
@ -1518,6 +1544,48 @@ bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
return netmap_vp_txsync(vpna, ring_nr, flags);
}
static int
netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i, lim = kring->nkr_num_slots - 1;
u_int head = nm_rxsync_prologue(kring);
int n;
if (head > lim) {
D("ouch dangerous reset!!!");
n = netmap_ring_reinit(kring);
goto done;
}
/* First part, import newly received packets. */
/* actually nothing to do here, they are already in the kring */
/* Second part, skip past packets that userspace has released. */
nm_i = kring->nr_hwcur;
if (nm_i != head) {
/* consistency check, but nothing really important here */
for (n = 0; likely(nm_i != head); n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
void *addr = BDG_NMB(na, slot);
if (addr == netmap_buffer_base) { /* bad buf */
D("bad buffer index %d, ignore ?",
slot->buf_idx);
}
slot->flags &= ~NS_BUF_CHANGED;
nm_i = nm_next(nm_i, lim);
}
kring->nr_hwcur = head;
}
/* tell userspace that there are new packets */
nm_rxsync_finalize(kring);
n = 0;
done:
return n;
}
/*
* user process reading from a VALE switch.
@ -1529,55 +1597,15 @@ static int
bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int j, lim = kring->nkr_num_slots - 1;
u_int k = ring->cur, resvd = ring->reserved;
int n;
mtx_lock(&kring->q_lock);
if (k > lim) {
D("ouch dangerous reset!!!");
n = netmap_ring_reinit(kring);
goto done;
}
/* skip past packets that userspace has released */
j = kring->nr_hwcur; /* netmap ring index */
if (resvd > 0) {
if (resvd + ring->avail >= lim + 1) {
D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
ring->reserved = resvd = 0; // XXX panic...
}
k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
}
if (j != k) { /* userspace has released some packets. */
n = k - j;
if (n < 0)
n += kring->nkr_num_slots;
ND("userspace releases %d packets", n);
for (n = 0; likely(j != k); n++) {
struct netmap_slot *slot = &ring->slot[j];
void *addr = BDG_NMB(na, slot);
if (addr == netmap_buffer_base) { /* bad buf */
D("bad buffer index %d, ignore ?",
slot->buf_idx);
}
slot->flags &= ~NS_BUF_CHANGED;
j = nm_next(j, lim);
}
kring->nr_hwavail -= n;
kring->nr_hwcur = k;
}
/* tell userspace that there are new packets */
ring->avail = kring->nr_hwavail - resvd;
n = 0;
done:
n = netmap_vp_rxsync(na, ring_nr, flags);
mtx_unlock(&kring->q_lock);
return n;
}
static int
bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
{
@ -1627,6 +1655,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
return 0;
}
static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
@ -1652,16 +1681,22 @@ netmap_bwrap_dtor(struct netmap_adapter *na)
}
/*
* Pass packets from nic to the bridge.
* Intr callback for NICs connected to a bridge.
* Simply ignore tx interrupts (maybe we could try to recover space ?)
* and pass received packets from nic to the bridge.
*
* XXX TODO check locking: this is called from the interrupt
* handler so we should make sure that the interface is not
* disconnected while passing down an interrupt.
*
* Note, no user process can access this NIC so we can ignore
* the info in the 'ring'.
*/
/* callback that overwrites the hwna notify callback.
* Note, no user process can access this NIC or the host stack.
* The only part of the ring that is significant are the slots,
* and head/cur/tail are set from the kring as needed
* (part as a receive ring, part as a transmit ring).
*
* callback that overwrites the hwna notify callback.
* Packets come from the outside or from the host stack and are put on an hwna rx ring.
* The bridge wrapper then sends the packets through the bridge.
*/
@ -1677,21 +1712,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
struct netmap_vp_adapter *vpna = &bna->up;
int error = 0;
ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags);
if (netmap_verbose)
D("%s %s%d 0x%x", NM_IFPNAME(ifp),
(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
if (flags & NAF_DISABLE_NOTIFY) {
kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
if (kring->nkr_stopped)
netmap_disable_ring(bkring);
if (kring[ring_nr].nkr_stopped)
netmap_disable_ring(&bkring[ring_nr]);
else
bkring->nkr_stopped = 0;
bkring[ring_nr].nkr_stopped = 0;
return 0;
}
if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
return 0;
/* we only care about receive interrupts */
if (tx == NR_TX)
return 0;
@ -1707,7 +1745,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
goto put_out;
}
/* Here we expect ring->head = ring->cur = ring->tail
* because everything has been released from the previous round.
* However the ring is shared and we might have info from
* the wrong side (the tx ring). Hence we overwrite with
* the info from the rx kring.
*/
if (netmap_verbose)
D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
ring->head, ring->cur, ring->tail,
kring->rhead, kring->rcur, kring->rtail);
ring->head = kring->rhead;
ring->cur = kring->rcur;
ring->tail = kring->rtail;
/* simulate a user wakeup on the rx ring */
if (is_host_ring) {
netmap_rxsync_from_host(na, NULL, NULL);
vpna = hostna;
ring_nr = 0;
} else {
@ -1718,23 +1773,46 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
if (error)
goto put_out;
}
if (kring->nr_hwavail == 0 && netmap_verbose) {
if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
D("how strange, interrupt with no packets on %s",
NM_IFPNAME(ifp));
goto put_out;
}
/* XXX avail ? */
ring->cur = nm_kr_rxpos(kring);
/* new packets are ring->cur to ring->tail, and the bkring
* had hwcur == ring->cur. So advance ring->cur to ring->tail
* to push all packets out.
*/
ring->head = ring->cur = ring->tail;
/* also set tail to what the bwrap expects */
bkring = &vpna->up.tx_rings[ring_nr];
ring->tail = bkring->nr_hwtail; // rtail too ?
/* pass packets to the switch */
nm_txsync_prologue(bkring); // XXX error checking ?
netmap_vp_txsync(vpna, ring_nr, flags);
if (!is_host_ring)
/* mark all buffers as released on this ring */
ring->head = ring->cur = kring->nr_hwtail;
ring->tail = kring->rtail;
/* another call to actually release the buffers */
if (!is_host_ring) {
error = na->nm_rxsync(na, ring_nr, 0);
} else {
/* mark all packets as released, as in the
* second part of netmap_rxsync_from_host()
*/
kring->nr_hwcur = kring->nr_hwtail;
nm_rxsync_finalize(kring);
}
put_out:
nm_kr_put(kring);
return error;
}
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
@ -1744,7 +1822,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
struct netmap_vp_adapter *hostna = &bna->host;
int error;
ND("%s %d", NM_IFPNAME(ifp), onoff);
ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
if (onoff) {
int i;
@ -1788,6 +1866,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
return 0;
}
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
u_int *rxr, u_int *rxd)
@ -1807,6 +1886,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
return 0;
}
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
@ -1834,6 +1914,7 @@ netmap_bwrap_krings_create(struct netmap_adapter *na)
return 0;
}
static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
@ -1847,6 +1928,7 @@ netmap_bwrap_krings_delete(struct netmap_adapter *na)
netmap_vp_krings_delete(na);
}
/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
@ -1856,7 +1938,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
struct netmap_adapter *hwna = bna->hwna;
struct netmap_kring *kring, *hw_kring;
struct netmap_ring *ring;
u_int lim, k;
u_int lim;
int error = 0;
if (tx == NR_TX)
@ -1865,35 +1947,49 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
kring = &na->rx_rings[ring_n];
hw_kring = &hwna->tx_rings[ring_n];
ring = kring->ring;
lim = kring->nkr_num_slots - 1;
k = nm_kr_rxpos(kring);
if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
return 0;
ring->cur = k;
ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)",
/* first step: simulate a user wakeup on the rx ring */
netmap_vp_rxsync(na, ring_n, flags);
ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
NM_IFPNAME(na->ifp), ring_n,
kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved,
ring->cur, ring->avail, ring->reserved,
hw_kring->nr_hwcur, hw_kring->nr_hwavail);
kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
ring->head, ring->cur, ring->tail,
hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
/* second step: the simulated user consumes all new packets */
ring->head = ring->cur = ring->tail;
/* third step: the new packets are sent on the tx ring
* (which is actually the same ring)
*/
/* set tail to what the hw expects */
ring->tail = hw_kring->rtail;
if (ring_n == na->num_rx_rings) {
netmap_txsync_to_host(hwna);
} else {
nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
error = hwna->nm_txsync(hwna, ring_n, flags);
}
kring->nr_hwcur = ring->cur;
kring->nr_hwavail = 0;
kring->nr_hwreserved = lim - ring->avail;
ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)",
/* fourth step: now we are back the rx ring */
/* claim ownership on all hw owned bufs */
ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
ring->tail = kring->rtail; /* restore saved value of tail, for safety */
/* fifth step: the user goes to sleep again, causing another rxsync */
netmap_vp_rxsync(na, ring_n, flags);
ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
NM_IFPNAME(na->ifp), ring_n,
kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved,
ring->cur, ring->avail, ring->reserved,
hw_kring->nr_hwcur, hw_kring->nr_hwavail);
kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
ring->head, ring->cur, ring->tail,
hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
return error;
}
static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
@ -1904,6 +2000,7 @@ netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx,
return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
}
/* attach a bridge wrapper to the 'real' device */
static int
netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
@ -1957,7 +2054,8 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
hostna->nm_mem = na->nm_mem;
hostna->na_private = bna;
D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname,
ND("%s<->%s txr %d txd %d rxr %d rxd %d",
fake->if_xname, real->if_xname,
na->num_tx_rings, na->num_tx_desc,
na->num_rx_rings, na->num_rx_desc);
@ -1970,6 +2068,7 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
return 0;
}
void
netmap_init_bridges(void)
{

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -39,6 +39,16 @@
#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_
#define NETMAP_API 10 /* current API version */
/*
* Some fields should be cache-aligned to reduce contention.
* The alignment is architecture and OS dependent, but rather than
* digging into OS headers to find the exact value we use an estimate
* that should cover most architectures.
*/
#define NM_CACHE_ALIGN 128
/*
* --- Netmap data structures ---
*
@ -52,23 +62,23 @@
====================================================================
|
USERSPACE | struct netmap_ring
+---->+--------------+
/ | cur |
struct netmap_if (nifp, 1 per fd) / | avail |
+---------------+ / | buf_ofs |
| ni_tx_rings | / +==============+
| ni_rx_rings | / | buf_idx, len | slot[0]
| | / | flags, ptr |
| | / +--------------+
+===============+ / | buf_idx, len | slot[1]
| txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
| txring_ofs[1] | +--------------+
(ni_tx_rings+1 entries) (num_slots entries)
| txring_ofs[t] | | buf_idx, len | slot[n-1]
+---------------+ | flags, ptr |
| rxring_ofs[0] | +--------------+
+---->+---------------+
/ | head,cur,tail |
struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+---------------+ / | other fields |
| ni_tx_rings | / +===============+
| ni_rx_rings | / | buf_idx, len | slot[0]
| | / | flags, ptr |
| | / +---------------+
+===============+ / | buf_idx, len | slot[1]
| txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
| txring_ofs[1] | +---------------+
(tx+1+extra_tx entries) (num_slots entries)
| txring_ofs[t] | | buf_idx, len | slot[n-1]
+---------------+ | flags, ptr |
| rxring_ofs[0] | +---------------+
| rxring_ofs[1] |
(ni_rx_rings+1 entries)
(rx+1+extra_rx entries)
| rxring_ofs[r] |
+---------------+
@ -93,122 +103,115 @@
/*
* struct netmap_slot is a buffer descriptor
*
* buf_idx the index of the buffer associated to the slot.
* len the length of the payload
* flags control operation on the slot, as defined below
*
* NS_BUF_CHANGED must be set whenever userspace wants
* to change buf_idx (it might be necessary to
* reprogram the NIC)
*
* NS_REPORT must be set if we want the NIC to generate an interrupt
* when this slot is used. Leaving it to 0 improves
* performance.
*
* NS_FORWARD if set on a receive ring, and the device is in
* transparent mode, buffers released with the flag set
* will be forwarded to the 'other' side (host stack
* or NIC, respectively) on the next select() or ioctl()
*
* NS_NO_LEARN on a VALE switch, do not 'learn' the source port for
* this packet.
*
* NS_INDIRECT (tx rings only) data is in a userspace buffer pointed
* by the ptr field in the slot.
*
* NS_MOREFRAG Part of a multi-segment frame. The last (or only)
* segment must not have this flag.
* Only supported on VALE ports.
*
* NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the
* destination port for the VALE switch, overriding
* the lookup table.
*/
struct netmap_slot {
uint32_t buf_idx; /* buffer index */
uint16_t len; /* packet length */
uint16_t len; /* length for this slot */
uint16_t flags; /* buf changed, etc. */
uint64_t ptr; /* pointer for indirect buffers */
};
/*
* The following flags control how the slot is used
*/
#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */
#define NS_REPORT 0x0002 /* ask the hardware to report results
* e.g. by generating an interrupt
*/
#define NS_FORWARD 0x0004 /* pass packet to the other endpoint
* (host stack or device)
*/
#define NS_NO_LEARN 0x0008
#define NS_INDIRECT 0x0010
#define NS_MOREFRAG 0x0020
/*
* must be set whenever buf_idx is changed (as it might be
* necessary to recompute the physical address and mapping)
*/
#define NS_REPORT 0x0002 /* ask the hardware to report results */
/*
* Request notification when slot is used by the hardware.
* Normally transmit completions are handled lazily and
* may be unreported. This flag lets us know when a slot
* has been sent (e.g. to terminate the sender).
*/
#define NS_FORWARD 0x0004 /* pass packet 'forward' */
/*
* (Only for physical ports, rx rings with NR_FORWARD set).
* Slot released to the kernel (i.e. before ring->head) with
* this flag set are passed to the peer ring (host/NIC),
* thus restoring the host-NIC connection for these slots.
* This supports efficient traffic monitoring or firewalling.
*/
#define NS_NO_LEARN 0x0008 /* disable bridge learning */
/*
* On a VALE switch, do not 'learn' the source port for
* this buffer.
*/
#define NS_INDIRECT 0x0010 /* userspace buffer */
/*
* (VALE tx rings only) data is in a userspace buffer,
* whose address is in the 'ptr' field in the slot.
*/
#define NS_MOREFRAG 0x0020 /* packet has more fragments */
/*
* (VALE ports only)
* Set on all but the last slot of a multi-segment packet.
* The 'len' field refers to the individual fragment.
*/
#define NS_PORT_SHIFT 8
#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
/*
* in rx rings, the high 8 bits
* are the number of fragments.
*/
/*
* The high 8 bits of the flag, if not zero, indicate the
* destination port for the VALE switch, overriding
* the lookup table.
*/
#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
uint64_t ptr; /* pointer for indirect buffers */
};
/*
* (VALE rx rings only) the high 8 bits
* are the number of fragments.
*/
/*
* struct netmap_ring
*
* Netmap representation of a TX or RX ring (also known as "queue").
* This is a queue implemented as a fixed-size circular array.
* At the software level, two fields are important: avail and cur.
* At the software level the important fields are: head, cur, tail.
*
* In TX rings:
*
* avail tells how many slots are available for transmission.
* It is updated by the kernel in each netmap system call.
* It MUST BE decremented by the user when it
* adds a new packet to send.
* head first slot available for transmission.
* cur wakeup point. select() and poll() will unblock
* when 'tail' moves past 'cur'
* tail (readonly) first slot reserved to the kernel
*
* cur indicates the slot to use for the next packet
* to send (i.e. the "tail" of the queue).
* It MUST BE incremented by the user before
* netmap system calls to reflect the number of newly
* sent packets.
* It is checked by the kernel on netmap system calls
* (normally unmodified by the kernel unless invalid).
* [head .. tail-1] can be used for new packets to send;
* 'head' and 'cur' must be incremented as slots are filled
* with new packets to be sent;
* 'cur' can be moved further ahead if we need more space
* for new transmissions.
*
* In RX rings:
*
* avail is the number of packets available (possibly 0).
* It is updated by the kernel in each netmap system call.
* It MUST BE decremented by the user when it
* consumes a packet.
*
* cur indicates the first slot that contains a packet not
* yet processed (the "head" of the queue).
* It MUST BE incremented by the user when it consumes
* a packet.
*
* reserved indicates the number of buffers before 'cur'
* that the user has not released yet. Normally 0,
* it MUST BE incremented by the user when it
* does not return the buffer immediately, and decremented
* when the buffer is finally freed.
* head first valid received packet
* cur wakeup point. select() and poll() will unblock
* when 'tail' moves past 'cur'
* tail (readonly) first slot reserved to the kernel
*
* [head .. tail-1] contain received packets;
* 'head' and 'cur' must be incremented as slots are consumed
* and can be returned to the kernel;
* 'cur' can be moved further ahead if we want to wait for
* new packets without returning the previous ones.
*
* DATA OWNERSHIP/LOCKING:
* The netmap_ring, all slots, and buffers in the range
* [reserved-cur , cur+avail[ are owned by the user program,
* and the kernel only touches them in the same thread context
* during a system call.
* Other buffers are reserved for use by the NIC's DMA engines.
* The netmap_ring, and all slots and buffers in the range
* [head .. tail-1] are owned by the user program;
* the kernel only accesses them during a netmap system call
* and in the user thread context.
*
* FLAGS
* NR_TIMESTAMP updates the 'ts' field on each syscall. This is
* a global timestamp for all packets.
* NR_RX_TSTMP if set, the last 64 byte in each buffer will
* contain a timestamp for the frame supplied by
* the hardware (if supported)
* NR_FORWARD if set, the NS_FORWARD flag in each slot of the
* RX ring is checked, and if set the packet is
* passed to the other side (host stack or device,
* respectively). This permits bpf-like behaviour
* or transparency for selected packets.
* Other slots and buffers are reserved for use by the kernel
*/
struct netmap_ring {
/*
@ -216,25 +219,44 @@ struct netmap_ring {
* It contains the offset of the buffer region from this
* descriptor.
*/
const ssize_t buf_ofs;
const int64_t buf_ofs;
const uint32_t num_slots; /* number of slots in the ring. */
uint32_t avail; /* number of usable slots */
uint32_t cur; /* 'current' r/w position */
uint32_t reserved; /* not refilled before current */
const uint32_t nr_buf_size;
const uint16_t ringid;
const uint16_t dir; /* 0: tx, 1: rx */
const uint16_t nr_buf_size;
uint16_t flags;
#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */
uint32_t head; /* (u) first user slot */
uint32_t cur; /* (u) wakeup point */
uint32_t tail; /* (k) first kernel slot */
struct timeval ts; /* time of last *sync() */
uint32_t flags;
struct timeval ts; /* (k) time of last *sync() */
/* opaque room for a mutex or similar object */
uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));
/* the slots follow. This struct has variable size */
struct netmap_slot slot[0]; /* array of slots. */
};
/*
* RING FLAGS
*/
#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
/*
* updates the 'ts' field on each netmap syscall. This saves
* a separate gettimeofday(), and is not much worse than
* software timestamps generated in the interrupt handler.
*/
#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
/*
* Enables the NS_FORWARD slot flag for the ring.
*/
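As an illustration of the head/cur/tail semantics described above (a sketch, not
from this changeset; build_packet() and have_more_packets() are placeholders, and
NETMAP_BUF()/nm_ring_next() are the helpers from netmap_user.h; error handling and
the surrounding poll()/ioctl() calls are omitted):

        /* receive: consume slots in [head .. tail-1] and return them */
        while (rxring->head != rxring->tail) {
                struct netmap_slot *slot = &rxring->slot[rxring->head];
                char *p = NETMAP_BUF(rxring, slot->buf_idx);
                /* ... process slot->len bytes at p ... */
                rxring->head = rxring->cur = nm_ring_next(rxring, rxring->head);
        }

        /* transmit: fill slots in [head .. tail-1], then NIOCTXSYNC or poll() */
        while (txring->head != txring->tail && have_more_packets()) {
                struct netmap_slot *slot = &txring->slot[txring->head];
                slot->len = build_packet(NETMAP_BUF(txring, slot->buf_idx));
                txring->head = txring->cur = nm_ring_next(txring, txring->head);
        }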
/*
* Netmap representation of an interface and its queue(s).
* This is initialized by the kernel when binding a file
@ -252,81 +274,109 @@ struct netmap_if {
const uint32_t ni_flags; /* properties */
#define NI_PRIV_MEM 0x1 /* private memory region */
const uint32_t ni_rx_rings; /* number of rx rings */
const uint32_t ni_tx_rings; /* number of tx rings */
/*
* The number of packet rings available in netmap mode.
* Physical NICs can have different numbers of tx and rx rings.
* Physical NICs also have a 'host' ring pair.
* Additionally, clients can request additional ring pairs to
* be used for internal communication.
*/
const uint32_t ni_tx_rings; /* number of HW tx rings */
const uint32_t ni_rx_rings; /* number of HW rx rings */
const uint32_t ni_extra_tx_rings;
const uint32_t ni_extra_rx_rings;
/*
* The following array contains the offset of each netmap ring
* from this structure. The first ni_tx_rings+1 entries refer
* to the tx rings, the next ni_rx_rings+1 refer to the rx rings
* (the last entry in each block refers to the host stack rings).
* from this structure, in the following order:
* NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
* NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
*
* The area is filled up by the kernel on NIOCREGIF,
* and then only read by userspace code.
*/
const ssize_t ring_ofs[0];
};
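For example (a sketch that just follows the ordering described above; 'i' is a
hypothetical ring index, and NETMAP_TXRING() in netmap_user.h performs the same
arithmetic for NIC rings):

        /* NIC tx ring i */
        struct netmap_ring *txr = (struct netmap_ring *)
                ((char *)nifp + nifp->ring_ofs[i]);
        /* host tx ring, which follows the ni_tx_rings NIC tx rings */
        struct netmap_ring *host_tx = (struct netmap_ring *)
                ((char *)nifp + nifp->ring_ofs[nifp->ni_tx_rings]);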
#ifndef NIOCREGIF
/*
* ioctl names and related fields
*
* NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
* whose identity is set in NIOCREGIF through nr_ringid.
* These are non blocking and take no argument.
*
* NIOCGINFO takes a struct ifreq, the interface name is the input,
* the outputs are number of queues and number of descriptor
* for each queue (useful to set number of threads etc.).
* The info returned is only advisory and may change before
* the interface is bound to a file descriptor.
*
* NIOCREGIF takes an interface name within a struct ifreq,
* NIOCREGIF takes an interface name within a struct nmreq,
* and activates netmap mode on the interface (if possible).
*
* nr_name is the name of the interface
* The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
* can pass it down to other NIC-related ioctls.
*
* nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings
* indicate the configuration of the port on return.
* The actual argument (struct nmreq) has a number of options to request
* different functions.
*
* On input, non-zero values for nr_tx_rings, nr_tx_slots and the
* rx counterparts may be used to reconfigure the port according
* to the requested values, but this is not guaranteed.
* The actual values are returned on completion of the ioctl().
* nr_name (in)
* The name of the port (em0, valeXXX:YYY, etc.)
* limited to IFNAMSIZ for backward compatibility.
*
* nr_ringid
* indicates how rings should be bound to the file descriptors.
* The default (0) means all physical rings of a NIC are bound.
* NETMAP_HW_RING plus a ring number lets you bind just
* a single ring pair.
* NETMAP_SW_RING binds only the host tx/rx rings
* NETMAP_NO_TX_POLL prevents select()/poll() from pushing
* out packets on the tx ring unless POLLOUT is specified.
* nr_version (in/out)
* Must match NETMAP_API as used in the kernel, error otherwise.
* Always returns the desired value on output.
*
* NETMAP_PRIV_MEM is a return value used to indicate that
* this ring is in a private memory region hence buffer
* swapping cannot be used
* nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
* On input, non-zero values may be used to reconfigure the port
* according to the requested values, but this is not guaranteed.
* On output the actual values in use are reported.
*
* nr_cmd is used to configure NICs attached to a VALE switch,
* or to dump the configuration of a VALE switch.
* nr_ringid (in)
* Indicates how rings should be bound to the file descriptors.
* 0 (default) binds all physical rings
* NETMAP_HW_RING | ring number binds a single ring pair
* NETMAP_SW_RING binds only the host tx/rx rings
*
* nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname
* attaches the NIC to the switch, with nr_ringid specifying
* which rings to use
* NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
* packets on tx rings only if POLLOUT is set.
* The default is to push any pending packet.
*
* nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname
* disconnects a previously attached NIC
* NETMAP_PRIV_MEM is set on return for ports that use private
* memory regions and cannot use buffer swapping.
*
* nr_cmd = NETMAP_BDG_LIST is used to list the configuration
* of VALE switches, with additional arguments.
* nr_cmd (in) if non-zero indicates a special command:
* NETMAP_BDG_ATTACH and nr_name = vale*:ifname
* attaches the NIC to the switch; nr_ringid specifies
* which rings to use. Used by vale-ctl -a ...
* nr_arg1 = NETMAP_BDG_HOST also attaches the host port
* as in vale-ctl -h ...
*
* NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
* whose identity is set in NIOCREGIF through nr_ringid
* NETMAP_BDG_DETACH and nr_name = vale*:ifname
* disconnects a previously attached NIC.
* Used by vale-ctl -d ...
*
* NETMAP_BDG_LIST
* list the configuration of VALE switches.
*
* NETMAP_BDG_OFFSET XXX ?
* Set the offset of data in packets. Used with VALE
* switches where the clients use the vhost header.
*
* nr_arg1, nr_arg2 (in/out) command specific
*
* NETMAP_API is the API version.
*/
/*
* struct nmreq overlays a struct ifreq
*/
struct nmreq {
char nr_name[IFNAMSIZ];
uint32_t nr_version; /* API version */
#define NETMAP_API 5 /* current version */
uint32_t nr_offset; /* nifp offset in the shared region */
uint32_t nr_memsize; /* size of the shared region */
uint32_t nr_tx_slots; /* slots in tx rings */
@ -339,19 +389,23 @@ struct nmreq {
#define NETMAP_SW_RING 0x2000 /* process the sw ring */
#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
#define NETMAP_RING_MASK 0xfff /* the ring number */
uint16_t nr_cmd;
#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
#define NETMAP_BDG_DETACH 2 /* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
#define NETMAP_BDG_LIST 4 /* get bridge's info */
#define NETMAP_BDG_OFFSET 5 /* set the port offset */
uint16_t nr_arg1;
#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
#define NETMAP_BDG_MAX_OFFSET 12
uint16_t nr_arg2;
uint32_t spare2[3];
};
/*
* FreeBSD uses the size value embedded in the _IOWR to determine
* how much to copy in/out. So we need it to match the actual
@ -360,9 +414,22 @@ struct nmreq {
*/
#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */
#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */
#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */
#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
#endif /* !NIOCREGIF */
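A minimal registration sequence using the structures and ioctls above (a sketch,
not taken from this file; "em0" is just an example port name, and error checking
is omitted):

        /* needs <fcntl.h>, <sys/ioctl.h>, <sys/mman.h>,
         * <net/netmap.h> and <net/netmap_user.h> */
        struct nmreq req;
        void *mem;
        struct netmap_if *nifp;
        int fd = open("/dev/netmap", O_RDWR);

        bzero(&req, sizeof(req));
        strncpy(req.nr_name, "em0", sizeof(req.nr_name));
        req.nr_version = NETMAP_API;
        ioctl(fd, NIOCREGIF, &req);             /* bind all hw rings */
        mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
        nifp = NETMAP_IF(mem, req.nr_offset);   /* see netmap_user.h */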
/*
* Helper functions for kernel and userspace
*/
/*
* check if space is available in the ring.
*/
static inline int
nm_ring_empty(struct netmap_ring *ring)
{
return (ring->cur == ring->tail);
}
#endif /* _NET_NETMAP_H_ */

View File

@ -1,6 +1,5 @@
/*
* Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2013 Universita` di Pisa
* Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -28,8 +27,8 @@
/*
* $FreeBSD$
*
* This header contains the macros used to manipulate netmap structures
* and packets in userspace. See netmap(4) for more information.
* Functions and macros to manipulate netmap structures and packets
* in userspace. See netmap(4) for more information.
*
* The address of the struct netmap_if, say nifp, is computed from the
* value returned from ioctl(.., NIOCREG, ...) and the mmap region:
@ -44,17 +43,20 @@
* we can access ring->head, ring->cur, ring->tail and ring->flags
*
* ring->slot[i] gives us the i-th slot (we can access
* directly plen, flags, bufindex)
* directly len, flags, buf_idx)
*
* char *buf = NETMAP_BUF(ring, x) returns a pointer to
* the buffer numbered x
*
* Since rings are circular, we have macros to compute the next index
* i = NETMAP_RING_NEXT(ring, i);
* All ring indexes (head, cur, tail) should always move forward.
* To compute the next index in a circular ring you can use
* i = nm_ring_next(ring, i);
*
* To ease porting apps from pcap to netmap we supply a few functions
* that can be called to open, close and read from netmap in a way
* similar to libpcap.
* that can be called to open, close, read and write on netmap in a way
* similar to libpcap. Note that the read/write functions depend on
* an ioctl()/select()/poll() being issued to refill rings or push
* packets out.
*
* In order to use these, include #define NETMAP_WITH_LIBS
* in the source file that invokes these functions.
@ -65,12 +67,19 @@
#include <stdint.h>
#include <net/if.h> /* IFNAMSIZ */
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif /* likely and unlikely */
#include <net/netmap.h>
/* helper macro */
#define _NETMAP_OFFSET(type, ptr, offset) \
((type)(void *)((char *)(ptr) + (offset)))
#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o)
#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs)
#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
nifp, (nifp)->ring_ofs[index] )
@ -85,18 +94,34 @@
( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
(ring)->nr_buf_size )
#define NETMAP_RING_NEXT(r, i) \
((i)+1 == (r)->num_slots ? 0 : (i) + 1 )
#define NETMAP_RING_FIRST_RESERVED(r) \
( (r)->cur < (r)->reserved ? \
(r)->cur + (r)->num_slots - (r)->reserved : \
(r)->cur - (r)->reserved )
static inline uint32_t
nm_ring_next(struct netmap_ring *r, uint32_t i)
{
return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1);
}
/*
* Return 1 if the given tx ring is empty.
* Return 1 if we have pending transmissions in the tx ring.
* When everything is complete ring->cur = ring->tail + 1 (modulo ring size)
*/
#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1)
static inline int
nm_tx_pending(struct netmap_ring *r)
{
return nm_ring_next(r, r->tail) != r->cur;
}
static inline uint32_t
nm_ring_space(struct netmap_ring *ring)
{
int ret = ring->tail - ring->cur;
if (ret < 0)
ret += ring->num_slots;
return ret;
}
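A typical use of these helpers (a sketch; 'fd', 'nifp' and fill_packet() are
placeholders from an already-registered port, as in the NIOCREGIF example earlier)
is a transmit loop that fills whatever space the kernel reported on the last sync:

        struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
        u_int n = nm_ring_space(ring);          /* slots available to the user */

        while (n-- > 0) {
                struct netmap_slot *slot = &ring->slot[ring->cur];
                slot->len = fill_packet(NETMAP_BUF(ring, slot->buf_idx));
                ring->head = ring->cur = nm_ring_next(ring, ring->cur);
        }
        ioctl(fd, NIOCTXSYNC, NULL);            /* push the packets out */
        while (nm_tx_pending(ring))             /* optionally wait for completion */
                ioctl(fd, NIOCTXSYNC, NULL);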
#ifdef NETMAP_WITH_LIBS
/*
@ -113,7 +138,12 @@
#include <sys/ioctl.h>
#include <sys/errno.h> /* EINVAL */
#include <fcntl.h> /* O_RDWR */
#include <malloc.h>
#include <unistd.h> /* close() */
#ifdef __FreeBSD__
#include <stdlib.h>
#else
#include <malloc.h> /* on FreeBSD it is stdlib.h */
#endif
struct nm_hdr_t { /* same as pcap_pkthdr */
struct timeval ts;
@ -139,30 +169,73 @@ struct nm_desc_t {
#define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d))
#define NETMAP_FD(d) (P2NMD(d)->fd)
/*
* this is a slightly optimized copy routine which rounds
* to multiple of 64 bytes and is often faster than dealing
* with other odd sizes. We assume there is enough room
* in the source and destination buffers.
*
* XXX only for multiples of 64 bytes, non overlapped.
*/
static inline void
pkt_copy(const void *_src, void *_dst, int l)
{
const uint64_t *src = _src;
uint64_t *dst = _dst;
if (unlikely(l >= 1024)) {
memcpy(dst, src, l);
return;
}
for (; likely(l > 0); l-=64) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
}
}
/*
* The callback, invoked on each received packet. Same as libpcap
*/
typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d);
/*
* The open routine accepts an ifname (netmap:foo or vale:foo) and
* optionally a second (string) argument indicating the ring number
*--- the pcap-like API ---
*
* nm_open() opens a file descriptor, binds to a port and maps memory.
*
* ifname (netmap:foo or vale:foo) is the port name
* flags can be NETMAP_SW_RING or NETMAP_HW_RING etc.
* ring_no only used if NETMAP_HW_RING is specified, is interpreted
* as a string or integer indicating the ring number
* ring_flags is stored in all ring flags (e.g. for transparent mode)
* to open. If successful, it opens the fd and maps the memory.
*/
static struct nm_desc_t *nm_open(const char *ifname,
const char *ring_no, int flags, int ring_flags);
/*
* nm_dispatch() is the same as pcap_dispatch()
* nm_next() is the same as pcap_next()
* nm_close() closes and restores the port to its previous state
*/
static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
static u_char *nm_next(struct nm_desc_t *, struct nm_hdr_t *);
static int nm_close(struct nm_desc_t *);
/*
* unmap memory, close file descriptor and free the descriptor.
* nm_inject() is the same as pcap_inject()
* nm_dispatch() is the same as pcap_dispatch()
* nm_nextpkt() is the same as pcap_next()
*/
static int nm_close(struct nm_desc_t *);
static int nm_inject(struct nm_desc_t *, const void *, size_t);
static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *);
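/*
 * Usage sketch (illustrative, not part of the API): a minimal pcap-style
 * receive loop built on the prototypes above. It assumes <poll.h> is
 * available and that "netmap:em0" is a valid port name; the ring_no
 * argument is only used with NETMAP_HW_RING, so "0" is a harmless
 * placeholder here.
 */
static inline void
nm_example_count_cb(u_char *arg, const struct nm_hdr_t *h, const u_char *buf)
{
	(void)h; (void)buf;
	(*(unsigned long *)arg)++;		/* just count packets */
}

static inline int
nm_example_rx_loop(void)
{
	unsigned long count = 0;
	struct nm_desc_t *d = nm_open("netmap:em0", "0", 0, 0);
	struct pollfd pfd;

	if (d == NULL)
		return -1;
	pfd.fd = NETMAP_FD(d);
	pfd.events = POLLIN;
	while (count < 1000) {
		if (poll(&pfd, 1, 1000) <= 0)
			continue;
		nm_dispatch(d, -1, nm_example_count_cb, (u_char *)&count);
	}
	return nm_close(d);
}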
/*
@ -240,6 +313,12 @@ nm_open(const char *ifname, const char *ring_name, int flags, int ring_flags)
static int
nm_close(struct nm_desc_t *d)
{
/*
* ugly trick to avoid unused warnings
*/
static void *__xxzt[] __attribute__ ((unused)) =
{ nm_open, nm_inject, nm_dispatch, nm_nextpkt } ;
if (d == NULL || d->self != d)
return EINVAL;
if (d->mem)
@ -252,10 +331,46 @@ nm_close(struct nm_desc_t *d)
}
/*
* Same prototype as pcap_inject(), only need to cast.
*/
static int
nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
{
u_int c, n = d->last_ring - d->first_ring + 1;
if (0) fprintf(stderr, "%s rings %d %d %d\n", __FUNCTION__,
d->first_ring, d->cur_ring, d->last_ring);
for (c = 0; c < n ; c++) {
/* compute current ring to use */
struct netmap_ring *ring;
uint32_t i, idx;
uint32_t ri = d->cur_ring + c;
if (ri > d->last_ring)
ri = d->first_ring;
ring = NETMAP_TXRING(d->nifp, ri);
if (nm_ring_empty(ring)) {
if (0) fprintf(stderr, "%s ring %d cur %d tail %d\n",
__FUNCTION__,
ri, ring->cur, ring->tail);
continue;
}
i = ring->cur;
idx = ring->slot[i].buf_idx;
ring->slot[i].len = size;
pkt_copy(buf, NETMAP_BUF(ring, idx), size);
d->cur_ring = ri;
ring->head = ring->cur = nm_ring_next(ring, i);
return size;
}
return 0; /* fail */
}
/*
* Same prototype as pcap_dispatch(), only need to cast.
*/
inline /* not really, but disable unused warnings */
static int
nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
{
@ -276,7 +391,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
if (ri > d->last_ring)
ri = d->first_ring;
ring = NETMAP_RXRING(d->nifp, ri);
for ( ; ring->avail > 0 && cnt != got; got++) {
for ( ; !nm_ring_empty(ring) && cnt != got; got++) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
@ -285,24 +400,22 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
d->hdr.len = d->hdr.caplen = ring->slot[i].len;
d->hdr.ts = ring->ts;
cb(arg, &d->hdr, buf);
ring->cur = NETMAP_RING_NEXT(ring, i);
ring->avail--;
ring->head = ring->cur = nm_ring_next(ring, i);
}
}
d->cur_ring = ri;
return got;
}
inline /* not really, but disable unused warnings */
static u_char *
nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr)
nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr)
{
int ri = d->cur_ring;
do {
/* compute current ring to use */
struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri);
if (ring->avail > 0) {
if (!nm_ring_empty(ring)) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
@ -310,8 +423,12 @@ nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr)
// prefetch(buf);
hdr->ts = ring->ts;
hdr->len = hdr->caplen = ring->slot[i].len;
ring->cur = NETMAP_RING_NEXT(ring, i);
ring->avail--;
ring->cur = nm_ring_next(ring, i);
/* we could postpone advancing head if we want
* to hold the buffer. This can be supported in
* the future.
*/
ring->head = ring->cur;
d->cur_ring = ri;
return buf;
}

View File

@ -1,5 +1,5 @@
/*
* (C) 2011 Luigi Rizzo, Matteo Landi
* (C) 2011-2014 Luigi Rizzo, Matteo Landi
*
* BSD license
*
@ -42,10 +42,12 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
msg, rxring->flags, txring->flags);
j = rxring->cur; /* RX */
k = txring->cur; /* TX */
if (rxring->avail < limit)
limit = rxring->avail;
if (txring->avail < limit)
limit = txring->avail;
m = nm_ring_space(rxring);
if (m < limit)
limit = m;
m = nm_ring_space(txring);
if (m < limit)
limit = m;
m = limit;
while (limit-- > 0) {
struct netmap_slot *rs = &rxring->slot[j];
@ -81,13 +83,11 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
ts->flags |= NS_BUF_CHANGED;
rs->flags |= NS_BUF_CHANGED;
#endif /* NO_SWAP */
j = NETMAP_RING_NEXT(rxring, j);
k = NETMAP_RING_NEXT(txring, k);
j = nm_ring_next(rxring, j);
k = nm_ring_next(txring, k);
}
rxring->avail -= m;
txring->avail -= m;
rxring->cur = j;
txring->cur = k;
rxring->head = rxring->cur = j;
txring->head = txring->cur = k;
if (verbose && m > 0)
D("%s sent %d packets to %p", msg, m, txring);
@ -107,11 +107,11 @@ move(struct my_ring *src, struct my_ring *dst, u_int limit)
rxring = NETMAP_RXRING(src->nifp, si);
txring = NETMAP_TXRING(dst->nifp, di);
ND("txring %p rxring %p", txring, rxring);
if (rxring->avail == 0) {
if (nm_ring_empty(rxring)) {
si++;
continue;
}
if (txring->avail == 0) {
if (nm_ring_empty(txring)) {
di++;
continue;
}
@ -133,7 +133,7 @@ pkt_queued(struct my_ring *me, int tx)
for (i = me->begin; i < me->end; i++) {
struct netmap_ring *ring = tx ?
NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i);
tot += ring->avail;
tot += nm_ring_space(ring);
}
if (0 && verbose && tot && !tx)
D("ring %s %s %s has %d avail at %d",
@ -288,12 +288,12 @@ main(int argc, char **argv)
if (ret < 0)
continue;
if (pollfd[0].revents & POLLERR) {
D("error on fd0, rxcur %d@%d",
me[0].rx->avail, me[0].rx->cur);
D("error on fd0, rx [%d,%d)",
me[0].rx->cur, me[0].rx->tail);
}
if (pollfd[1].revents & POLLERR) {
D("error on fd1, rxcur %d@%d",
me[1].rx->avail, me[1].rx->cur);
D("error on fd1, rx [%d,%d)",
me[1].rx->cur, me[1].rx->tail);
}
if (pollfd[0].revents & POLLOUT) {
move(me + 1, me, burst);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2012-2013 Luigi Rizzo. All rights reserved.
* Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -232,7 +232,7 @@ pkt_queued(struct my_ring *me, int tx)
for (i = me->begin; i < me->end; i++) {
struct netmap_ring *ring = tx ?
NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i);
tot += ring->avail;
tot += nm_ring_space(ring);
}
if (0 && verbose && tot && !tx)
D("ring %s %s %s has %d avail at %d",
@ -242,3 +242,90 @@ pkt_queued(struct my_ring *me, int tx)
tot, NETMAP_TXRING(me->nifp, me->begin)->cur);
return tot;
}
#if 0
/*
*
Helper routines for multiple readers from the same queue
- all readers open the device in 'passive' mode (NETMAP_PRIV_RING set).
In this mode a thread that loses the race on a poll() just continues
without calling *xsync()
- all readers share an extra 'ring' which contains the sync information.
  In particular we have shared head+tail pointers that work
together with cur and available
ON RETURN FROM THE SYSCALL:
shadow->head = ring->cur
shadow->tail = ring->tail
shadow->link[i] = i for all slots // mark invalid
*/
struct nm_q_arg {
u_int want; /* Input */
u_int have; /* Output, 0 on error */
u_int head;
u_int tail;
struct netmap_ring *ring;
};
/*
* grab a number of slots from the queue.
*/
struct nm_q_arg
my_grab(struct nm_q_arg q)
{
const u_int ns = q.ring->num_slots;
for (;;) {
q.head = (volatile u_int)q.ring->head;
q.have = ns + q.head - (volatile u_int)q.ring->tail;
if (q.have >= ns)
q.have -= ns;
if (q.have == 0) /* no space */
break;
if (q.want < q.have)
q.have = q.want;
q.tail = q.head + q.have;
if (q.tail >= ns)
q.tail -= ns;
		if (atomic_cmpset_int(&q.ring->head, q.head, q.tail))
break; /* success */
}
D("returns %d out of %d at %d,%d",
q.have, q.want, q.head, q.tail);
/* the last one can clear avail ? */
return q;
}
int
my_release(struct nm_q_arg q)
{
u_int head = q.head, tail = q.tail, i;
struct netmap_ring *r = q.ring;
/* link the block to the next one.
* there is no race here because the location is mine.
*/
r->slot[head].ptr = tail; /* this is mine */
// memory barrier
if (r->head != head)
		return 0; /* not my turn to release */
for (;;) {
// advance head
r->head = head = r->slot[head].ptr;
// barrier ?
if (head == r->slot[head].ptr)
break; // stop here
}
	/* we have advanced from q.head to head (r.head might be
	 * further down).
	 */
	// do an ioctl/poll to flush.
	return 0;
}
#endif /* unused */

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2012 Luigi Rizzo. All rights reserved.
* Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -32,6 +32,9 @@
#ifndef _NM_UTIL_H
#define _NM_UTIL_H
#define _GNU_SOURCE /* for CPU_SET() */
#include <errno.h>
#include <signal.h> /* signal */
#include <stdlib.h>
@ -79,6 +82,9 @@ struct pcap_pkthdr;
#include <pthread.h> /* pthread_* */
#ifdef linux
#define cpuset_t cpu_set_t
#define ifr_flagshigh ifr_flags
#define ifr_curcap ifr_flags
#define ifr_reqcap ifr_flags

View File

@ -1,5 +1,5 @@
/*
* (C) 2011-2012 Luigi Rizzo
* (C) 2011-2014 Luigi Rizzo
*
* BSD license
*
@ -499,15 +499,14 @@ pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
/* scan all rings */
for (si = me->begin; si < me->end; si++) {
struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si);
ND("ring has %d pkts", ring->avail);
if (ring->avail == 0)
if (nm_ring_empty(ring))
continue;
pme->hdr.ts = ring->ts;
/*
* XXX a proper prefetch should be done as
* prefetch(i); callback(i-1); ...
*/
while ((cnt == -1 || cnt != got) && ring->avail > 0) {
while ((cnt == -1 || cnt != got) && !nm_ring_empty(ring)) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
if (idx < 2) {
@ -520,8 +519,7 @@ pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
pme->hdr.len = pme->hdr.caplen = ring->slot[i].len;
// D("call %p len %d", p, me->hdr.len);
callback(user, &pme->hdr, buf);
ring->cur = NETMAP_RING_NEXT(ring, i);
ring->avail--;
ring->head = ring->cur = nm_ring_next(ring, i);
got++;
}
}
@ -540,8 +538,7 @@ pcap_inject(pcap_t *p, const void *buf, size_t size)
for (si = me->begin; si < me->end; si++) {
struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si);
ND("ring has %d pkts", ring->avail);
if (ring->avail == 0)
if (nm_ring_empty(ring))
continue;
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
@ -553,9 +550,8 @@ pcap_inject(pcap_t *p, const void *buf, size_t size)
u_char *dst = (u_char *)NETMAP_BUF(ring, idx);
ring->slot[i].len = size;
pkt_copy(buf, dst, size);
ring->cur = NETMAP_RING_NEXT(ring, i);
ring->avail--;
// if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL);
ring->head = ring->cur = nm_ring_next(ring, i);
// if (ring->cur == ring->tail) ioctl(me->fd, NIOCTXSYNC, NULL);
return size;
}
errno = ENOBUFS;

View File

@ -1,5 +1,6 @@
/*
* Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
* Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -52,7 +53,16 @@ int verbose = 0;
#define SKIP_PAYLOAD 1 /* do not check payload. */
#define VIRT_HDR_1 10 /* length of a base vnet-hdr */
#define VIRT_HDR_2 12	/* length of the extended vnet-hdr */
#define VIRT_HDR_MAX VIRT_HDR_2
struct virt_header {
uint8_t fields[VIRT_HDR_MAX];
};
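/*
 * Note (editorial): when -H is given, a zeroed virtio-net header is kept
 * in front of the frame template below; only the last 'virt_header' bytes
 * of it are placed on the wire (see how 'frame' is derived from the
 * template in the sender and pinger bodies).
 */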
struct pkt {
struct virt_header vh;
struct ether_header eh;
struct ip ip;
struct udphdr udp;
@ -109,6 +119,8 @@ struct glob_arg {
char *ifname;
char *nmr_config;
int dummy_send;
int virt_header; /* send also the virt_header */
int host_ring;
};
enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
@ -146,7 +158,8 @@ extract_ip_range(struct ip_range *r)
char *ap, *pp;
struct in_addr a;
D("extract IP range from %s", r->name);
if (verbose)
D("extract IP range from %s", r->name);
r->port0 = r->port1 = 0;
r->start = r->end = 0;
@ -192,7 +205,8 @@ extract_ip_range(struct ip_range *r)
a.s_addr = htonl(r->end);
strncpy(buf1, inet_ntoa(a), sizeof(buf1));
a.s_addr = htonl(r->start);
D("range is %s:%d to %s:%d",
if (1)
D("range is %s:%d to %s:%d",
inet_ntoa(a), r->port0, buf1, r->port1);
}
}
@ -200,7 +214,8 @@ extract_ip_range(struct ip_range *r)
static void
extract_mac_range(struct mac_range *r)
{
D("extract MAC range from %s", r->name);
if (verbose)
D("extract MAC range from %s", r->name);
bcopy(ether_aton(r->name), &r->start, 6);
bcopy(ether_aton(r->name), &r->end, 6);
#if 0
@ -215,7 +230,8 @@ extract_mac_range(struct mac_range *r)
if (p)
targ->dst_mac_range = atoi(p+1);
#endif
D("%s starts at %s", r->name, ether_ntoa(&r->start));
if (verbose)
D("%s starts at %s", r->name, ether_ntoa(&r->start));
}
static struct targ *targs;
@ -281,7 +297,7 @@ system_ncpus(void)
* Missing numbers or zeroes stand for default values.
* As an additional convenience, if exactly one number
* is specified, then this is assigned to both #tx-slots and #rx-slots.
 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
* and #rx-rings.
*/
void parse_nmr_config(const char* conf, struct nmreq *nmr)
@ -362,7 +378,7 @@ source_hwaddr(const char *ifname, char *buf)
static int
setaffinity(pthread_t me, int i)
{
#ifdef __FreeBSD__
#if 1 // def __FreeBSD__
cpuset_t cpumask;
if (i == -1)
@ -373,7 +389,7 @@ setaffinity(pthread_t me, int i)
CPU_SET(i, &cpumask);
if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
D("Unable to set affinity");
D("Unable to set affinity: %s", strerror(errno));
return 1;
}
#else
@ -559,6 +575,8 @@ initialize_packet(struct targ *targ)
bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
eh->ether_type = htons(ETHERTYPE_IP);
bzero(&pkt->vh, sizeof(pkt->vh));
// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
}
@ -570,18 +588,19 @@ initialize_packet(struct targ *targ)
* an interrupt when done.
*/
static int
send_packets(struct netmap_ring *ring, struct pkt *pkt,
struct glob_arg *g, u_int count, int options, u_int nfrags)
send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
int size, struct glob_arg *g, u_int count, int options,
u_int nfrags)
{
u_int sent, cur = ring->cur;
u_int n, sent, cur = ring->cur;
int fcnt;
int size = g->pkt_size;
if (ring->avail < count)
count = ring->avail;
n = nm_ring_space(ring);
if (n < count)
count = n;
if (count < nfrags) {
D("truncating packet, no room for frags %d %d",
count, nfrags);
count, nfrags);
}
#if 0
if (options & (OPT_COPY | OPT_PREFETCH) ) {
@ -590,7 +609,7 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt,
char *p = NETMAP_BUF(ring, slot->buf_idx);
prefetch(p);
cur = NETMAP_RING_NEXT(ring, cur);
cur = nm_ring_next(ring, cur);
}
cur = ring->cur;
}
@ -602,13 +621,13 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt,
slot->flags = 0;
if (options & OPT_INDIRECT) {
slot->flags |= NS_INDIRECT;
slot->ptr = (uint64_t)pkt;
slot->ptr = (uint64_t)frame;
} else if (options & OPT_COPY) {
pkt_copy(pkt, p, size);
pkt_copy(frame, p, size);
if (fcnt == 1)
update_addresses(pkt, g);
} else if (options & OPT_MEMCPY) {
memcpy(p, pkt, size);
memcpy(p, frame, size);
if (fcnt == 1)
update_addresses(pkt, g);
} else if (options & OPT_PREFETCH) {
@ -625,10 +644,9 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt,
slot->flags &= ~NS_MOREFRAG;
slot->flags |= NS_REPORT;
}
cur = NETMAP_RING_NEXT(ring, cur);
cur = nm_ring_next(ring, cur);
}
ring->avail -= sent;
ring->cur = cur;
ring->head = ring->cur = cur;
return (sent);
}
@ -647,6 +665,12 @@ pinger_body(void *data)
struct pollfd fds[1];
struct netmap_if *nifp = targ->nifp;
int i, rx = 0, n = targ->g->npackets;
void *frame;
int size;
frame = &targ->pkt;
frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
size = targ->g->pkt_size + targ->g->virt_header;
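	/*
	 * 'frame' starts inside the template so that exactly virt_header
	 * bytes of the (zeroed) vnet header precede the Ethernet frame;
	 * 'size' accounts for those extra bytes.
	 */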
fds[0].fd = targ->fd;
fds[0].events = (POLLIN);
@ -660,36 +684,37 @@ pinger_body(void *data)
}
clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
now = last_print;
while (n == 0 || (int)sent < n) {
struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
struct netmap_slot *slot;
char *p;
for (i = 0; i < 1; i++) {
for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
slot = &ring->slot[ring->cur];
slot->len = targ->g->pkt_size;
slot->len = size;
p = NETMAP_BUF(ring, slot->buf_idx);
if (ring->avail == 0) {
if (nm_ring_empty(ring)) {
D("-- ouch, cannot send");
} else {
pkt_copy(&targ->pkt, p, targ->g->pkt_size);
pkt_copy(frame, p, size);
clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
bcopy(&sent, p+42, sizeof(sent));
bcopy(&ts, p+46, sizeof(ts));
sent++;
ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
ring->avail--;
ring->head = ring->cur = nm_ring_next(ring, ring->cur);
}
}
/* should use a parameter to decide how often to send */
if (poll(fds, 1, 3000) <= 0) {
D("poll error/timeout on queue %d", targ->me);
D("poll error/timeout on queue %d: %s", targ->me,
strerror(errno));
continue;
}
/* see what we got back */
for (i = targ->qfirst; i < targ->qlast; i++) {
ring = NETMAP_RXRING(nifp, i);
while (ring->avail > 0) {
while (!nm_ring_empty(ring)) {
uint32_t seq;
slot = &ring->slot[ring->cur];
p = NETMAP_BUF(ring, slot->buf_idx);
@ -709,8 +734,7 @@ pinger_body(void *data)
min = ts.tv_nsec;
count ++;
av += ts.tv_nsec;
ring->avail--;
ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
ring->head = ring->cur = nm_ring_next(ring, ring->cur);
rx++;
}
}
@ -761,25 +785,25 @@ ponger_body(void *data)
ioctl(fds[0].fd, NIOCRXSYNC, NULL);
#else
if (poll(fds, 1, 1000) <= 0) {
D("poll error/timeout on queue %d", targ->me);
D("poll error/timeout on queue %d: %s", targ->me,
strerror(errno));
continue;
}
#endif
txring = NETMAP_TXRING(nifp, 0);
txcur = txring->cur;
txavail = txring->avail;
txavail = nm_ring_space(txring);
/* see what we got back */
for (i = targ->qfirst; i < targ->qlast; i++) {
rxring = NETMAP_RXRING(nifp, i);
while (rxring->avail > 0) {
while (!nm_ring_empty(rxring)) {
uint16_t *spkt, *dpkt;
uint32_t cur = rxring->cur;
struct netmap_slot *slot = &rxring->slot[cur];
char *src, *dst;
src = NETMAP_BUF(rxring, slot->buf_idx);
//D("got pkt %p of size %d", src, slot->len);
rxring->avail--;
rxring->cur = NETMAP_RING_NEXT(rxring, cur);
rxring->head = rxring->cur = nm_ring_next(rxring, cur);
rx++;
if (txavail == 0)
continue;
@ -797,13 +821,12 @@ ponger_body(void *data)
dpkt[5] = spkt[2];
txring->slot[txcur].len = slot->len;
/* XXX swap src dst mac */
txcur = NETMAP_RING_NEXT(txring, txcur);
txcur = nm_ring_next(txring, txcur);
txavail--;
sent++;
}
}
txring->cur = txcur;
txring->avail = txavail;
txring->head = txring->cur = txcur;
targ->count = sent;
#ifdef BUSYWAIT
ioctl(fds[0].fd, NIOCTXSYNC, NULL);
@ -847,43 +870,47 @@ timespec2val(const struct timespec *a)
}
static int
wait_time(struct timespec ts, struct timespec *wakeup_ts, long long *waited)
static __inline struct timespec
timespec_add(struct timespec a, struct timespec b)
{
struct timespec curtime;
curtime.tv_sec = 0;
curtime.tv_nsec = 0;
if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) {
D("clock_gettime: %s", strerror(errno));
return (-1);
struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
if (ret.tv_nsec >= 1000000000) {
ret.tv_sec++;
ret.tv_nsec -= 1000000000;
}
while (timespec_ge(&ts, &curtime)) {
if (waited != NULL)
(*waited)++;
if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) {
D("clock_gettime");
return (-1);
}
}
if (wakeup_ts != NULL)
*wakeup_ts = curtime;
return (0);
return ret;
}
static __inline void
timespec_add(struct timespec *tsa, struct timespec *tsb)
static __inline struct timespec
timespec_sub(struct timespec a, struct timespec b)
{
tsa->tv_sec += tsb->tv_sec;
tsa->tv_nsec += tsb->tv_nsec;
if (tsa->tv_nsec >= 1000000000) {
tsa->tv_sec++;
tsa->tv_nsec -= 1000000000;
struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
if (ret.tv_nsec < 0) {
ret.tv_sec--;
ret.tv_nsec += 1000000000;
}
return ret;
}
/*
 * Wait until ts, busy-waiting, or sleeping if more than 1 ms remains.
 * Return the wakeup time.
*/
static struct timespec
wait_time(struct timespec ts)
{
for (;;) {
struct timespec w, cur;
clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
w = timespec_sub(ts, cur);
if (w.tv_sec < 0)
return cur;
else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
poll(NULL, 0, 1);
}
}
static void *
sender_body(void *data)
{
@ -894,9 +921,15 @@ sender_body(void *data)
struct netmap_ring *txring;
int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
int options = targ->g->options | OPT_COPY;
struct timespec tmptime, nexttime = { 0, 0}; // XXX silence compiler
struct timespec nexttime = { 0, 0}; // XXX silence compiler
int rate_limit = targ->g->tx_rate;
long long waited = 0;
struct pkt *pkt = &targ->pkt;
void *frame;
int size;
frame = pkt;
frame += sizeof(pkt->vh) - targ->g->virt_header;
size = targ->g->pkt_size + targ->g->virt_header;
D("start");
if (setaffinity(targ->thread, targ->affinity))
@ -909,23 +942,16 @@ sender_body(void *data)
/* main loop.*/
clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
if (rate_limit) {
tmptime.tv_sec = 2;
tmptime.tv_nsec = 0;
timespec_add(&targ->tic, &tmptime);
targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
targ->tic.tv_nsec = 0;
if (wait_time(targ->tic, NULL, NULL) == -1) {
D("wait_time: %s", strerror(errno));
goto quit;
}
wait_time(targ->tic);
nexttime = targ->tic;
}
if (targ->g->dev_type == DEV_PCAP) {
int size = targ->g->pkt_size;
void *pkt = &targ->pkt;
pcap_t *p = targ->g->p;
for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
if (pcap_inject(p, pkt, size) != -1)
if (pcap_inject(p, frame, size) != -1)
sent++;
update_addresses(pkt, targ->g);
if (i > 10000) {
@ -934,12 +960,10 @@ sender_body(void *data)
}
}
} else if (targ->g->dev_type == DEV_TAP) { /* tap */
int size = targ->g->pkt_size;
void *pkt = &targ->pkt;
D("writing to file desc %d", targ->g->main_fd);
for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
if (write(targ->g->main_fd, pkt, size) != -1)
if (write(targ->g->main_fd, frame, size) != -1)
sent++;
update_addresses(pkt, targ->g);
if (i > 10000) {
@ -955,11 +979,8 @@ sender_body(void *data)
if (rate_limit && tosend <= 0) {
tosend = targ->g->burst;
timespec_add(&nexttime, &targ->g->tx_period);
if (wait_time(nexttime, &tmptime, &waited) == -1) {
D("wait_time");
goto quit;
}
nexttime = timespec_add(nexttime, targ->g->tx_period);
wait_time(nexttime);
}
/*
@ -968,7 +989,12 @@ sender_body(void *data)
if (poll(fds, 1, 2000) <= 0) {
if (targ->cancel)
break;
D("poll error/timeout on queue %d", targ->me);
D("poll error/timeout on queue %d: %s", targ->me,
strerror(errno));
goto quit;
}
if (fds[0].revents & POLLERR) {
D("poll error");
goto quit;
}
/*
@ -983,12 +1009,12 @@ sender_body(void *data)
if (n > 0 && n - sent < limit)
limit = n - sent;
txring = NETMAP_TXRING(nifp, i);
if (txring->avail == 0)
if (nm_ring_empty(txring))
continue;
if (frags > 1)
limit = ((limit + frags - 1) / frags) * frags;
m = send_packets(txring, &targ->pkt, targ->g,
m = send_packets(txring, pkt, frame, size, targ->g,
limit, options, frags);
ND("limit %d avail %d frags %d m %d",
limit, txring->avail, frags, m);
@ -1007,7 +1033,7 @@ sender_body(void *data)
/* final part: wait all the TX queues to be empty. */
for (i = targ->qfirst; i < targ->qlast; i++) {
txring = NETMAP_TXRING(nifp, i);
while (!NETMAP_TX_RING_EMPTY(txring)) {
while (nm_tx_pending(txring)) {
ioctl(fds[0].fd, NIOCTXSYNC, NULL);
usleep(1); /* wait 1 tick */
}
@ -1039,11 +1065,12 @@ receive_pcap(u_char *user, const struct pcap_pkthdr * h,
static int
receive_packets(struct netmap_ring *ring, u_int limit, int dump)
{
u_int cur, rx;
u_int cur, rx, n;
cur = ring->cur;
if (ring->avail < limit)
limit = ring->avail;
n = nm_ring_space(ring);
if (n < limit)
limit = n;
for (rx = 0; rx < limit; rx++) {
struct netmap_slot *slot = &ring->slot[cur];
char *p = NETMAP_BUF(ring, slot->buf_idx);
@ -1051,10 +1078,9 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump)
if (dump)
dump_payload(p, slot->len, ring, cur);
cur = NETMAP_RING_NEXT(ring, cur);
cur = nm_ring_next(ring, cur);
}
ring->avail -= rx;
ring->cur = cur;
ring->head = ring->cur = cur;
return (rx);
}
@ -1082,7 +1108,7 @@ receiver_body(void *data)
i = poll(fds, 1, 1000);
if (i > 0 && !(fds[0].revents & POLLERR))
break;
D("waiting for initial packets, poll returns %d %d", i, fds[0].revents);
RD(1, "waiting for initial packets, poll returns %d %d", i, fds[0].revents);
}
/* main loop, exit after 1s silence */
@ -1111,11 +1137,16 @@ receiver_body(void *data)
break;
}
if (fds[0].revents & POLLERR) {
D("poll err");
goto quit;
}
for (i = targ->qfirst; i < targ->qlast; i++) {
int m;
rxring = NETMAP_RXRING(nifp, i);
if (rxring->avail == 0)
if (nm_ring_empty(rxring))
continue;
m = receive_packets(rxring, targ->g->burst, dump);
@ -1215,6 +1246,8 @@ usage(void)
"\t-w wait_for_link_time in seconds\n"
"\t-R rate in packets per second\n"
"\t-X dump payload\n"
"\t-H len add empty virtio-net-header with size 'len'\n"
"\t-h use host ring\n"
"",
cmd);
@ -1243,7 +1276,7 @@ start_threads(struct glob_arg *g)
/* register interface. */
tfd = open("/dev/netmap", O_RDWR);
if (tfd == -1) {
D("Unable to open /dev/netmap");
D("Unable to open /dev/netmap: %s", strerror(errno));
continue;
}
targs[i].fd = tfd;
@ -1251,7 +1284,11 @@ start_threads(struct glob_arg *g)
bzero(&tifreq, sizeof(tifreq));
strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name));
tifreq.nr_version = NETMAP_API;
tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
if (g->host_ring) {
tifreq.nr_ringid = NETMAP_SW_RING;
} else {
tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
}
parse_nmr_config(g->nmr_config, &tifreq);
/*
@ -1264,7 +1301,7 @@ start_threads(struct glob_arg *g)
}
if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
D("Unable to register %s", g->ifname);
D("Unable to register %s: %s", g->ifname, strerror(errno));
continue;
}
D("memsize is %d MB", tifreq.nr_memsize >> 20);
@ -1272,9 +1309,14 @@ start_threads(struct glob_arg *g)
targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset);
D("nifp flags 0x%x", targs[i].nifp->ni_flags);
/* start threads. */
targs[i].qfirst = (g->nthreads > 1) ? i : 0;
targs[i].qlast = (g->nthreads > 1) ? i+1 :
(g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
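			/*
			 * The host (software) rings sit right after the
			 * hardware rings in the netmap_if, so with -h the
			 * single queue to poll has index nr_rx_rings (RX)
			 * or nr_tx_rings (TX).
			 */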
if (g->host_ring) {
targs[i].qfirst = (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
targs[i].qlast = targs[i].qfirst + 1;
} else {
targs[i].qfirst = (g->nthreads > 1) ? i : 0;
targs[i].qlast = (g->nthreads > 1) ? i+1 :
(g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
}
} else {
targs[i].fd = g->main_fd;
}
@ -1292,7 +1334,7 @@ start_threads(struct glob_arg *g)
if (pthread_create(&targs[i].thread, NULL, g->td_body,
&targs[i]) == -1) {
D("Unable to create thread %d", i);
D("Unable to create thread %d: %s", i, strerror(errno));
targs[i].used = 0;
}
}
@ -1439,7 +1481,7 @@ tap_alloc(char *dev)
/* try to create the device */
if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
D("failed to to a TUNSETIFF");
		D("failed to do a TUNSETIFF: %s", strerror(errno));
close(fd);
return err;
}
@ -1488,9 +1530,10 @@ main(int arc, char **argv)
g.tx_rate = 0;
g.frags = 1;
g.nmr_config = "";
g.virt_header = 0;
while ( (ch = getopt(arc, argv,
"a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:")) != -1) {
"a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:H:h")) != -1) {
struct sf *fn;
switch(ch) {
@ -1613,6 +1656,11 @@ main(int arc, char **argv)
break;
case 'C':
g.nmr_config = strdup(optarg);
break;
case 'H':
			g.virt_header = atoi(optarg);
			break;
		case 'h':
			g.host_ring = 1;
			break;
}
}
@ -1649,6 +1697,12 @@ main(int arc, char **argv)
extract_mac_range(&g.src_mac);
extract_mac_range(&g.dst_mac);
if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
&& g.virt_header != VIRT_HDR_2) {
D("bad virtio-net-header length");
usage();
}
if (g.dev_type == DEV_TAP) {
D("want to use tap %s", g.ifname);
g.main_fd = tap_alloc(g.ifname);
@ -1682,7 +1736,7 @@ main(int arc, char **argv)
*/
g.main_fd = open("/dev/netmap", O_RDWR);
if (g.main_fd == -1) {
D("Unable to open /dev/netmap");
D("Unable to open /dev/netmap: %s", strerror(errno));
// fail later
}
/*
@ -1696,22 +1750,16 @@ main(int arc, char **argv)
bzero(&nmr, sizeof(nmr));
nmr.nr_version = NETMAP_API;
strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name));
nmr.nr_version = NETMAP_API;
parse_nmr_config(g.nmr_config, &nmr);
if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) {
D("Unable to register interface %s", g.ifname);
D("Unable to register interface %s: %s", g.ifname, strerror(errno));
//continue, fail later
}
ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname,
nmr.nr_tx_rings, nmr.nr_tx_slots,
nmr.nr_rx_rings, nmr.nr_rx_slots);
//if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) {
// D("Unable to get if info without name");
//} else {
// D("map size is %d Kb", nmr.nr_memsize >> 10);
//}
if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) {
D("Unable to get if info for %s", g.ifname);
D("Unable to get if info for %s: %s", g.ifname, strerror(errno));
}
devqueues = nmr.nr_rx_rings;
@ -1732,7 +1780,7 @@ main(int arc, char **argv)
PROT_WRITE | PROT_READ,
MAP_SHARED, g.main_fd, 0);
if (g.mmap_addr == MAP_FAILED) {
D("Unable to mmap %d KB", nmr.nr_memsize >> 10);
D("Unable to mmap %d KB: %s", nmr.nr_memsize >> 10, strerror(errno));
// continue, fail later
}
@ -1772,14 +1820,17 @@ main(int arc, char **argv)
g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
if (g.tx_rate > 0) {
/* try to have at least something every second,
* reducing the burst size to 0.5s worth of data
* reducing the burst size to some 0.01s worth of data
* (but no less than one full set of fragments)
*/
if (g.burst > g.tx_rate/2)
g.burst = g.tx_rate/2;
uint64_t x;
int lim = (g.tx_rate)/300;
if (g.burst > lim)
g.burst = lim;
if (g.burst < g.frags)
g.burst = g.frags;
g.tx_period.tv_nsec = (1e9 / g.tx_rate) * g.burst;
x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
g.tx_period.tv_nsec = x;
g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
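		/*
		 * Worked example: with -R 1000000 the burst is capped at
		 * 1000000/300 = 3333 packets, giving a tx_period of roughly
		 * 3.3 ms (assuming the configured burst was larger).
		 */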
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2013 Michio Honda. All rights reserved.
* Copyright (C) 2013-2014 Michio Honda. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -118,7 +118,7 @@ main(int argc, char *argv[])
const char *command = basename(argv[0]);
char *name = NULL;
if (argc != 3 && argc != 1 /* list all */ ) {
if (argc > 3) {
usage:
fprintf(stderr,
"Usage:\n"
@ -127,12 +127,13 @@ main(int argc, char *argv[])
"\t-d interface interface name to be detached\n"
"\t-a interface interface name to be attached\n"
"\t-h interface interface name to be attached with the host stack\n"
"\t-l list all or specified bridge's interfaces\n"
"\t-l list all or specified bridge's interfaces (default)\n"
"", command);
return 0;
}
while ((ch = getopt(argc, argv, "d:a:h:g:l:")) != -1) {
while ((ch = getopt(argc, argv, "d:a:h:g:l")) != -1) {
name = optarg; /* default */
switch (ch) {
default:
fprintf(stderr, "bad option %c %s", ch, optarg);
@ -152,9 +153,14 @@ main(int argc, char *argv[])
break;
case 'l':
nr_cmd = NETMAP_BDG_LIST;
if (optind < argc && argv[optind][0] == '-')
name = NULL;
break;
}
name = optarg;
if (optind != argc) {
// fprintf(stderr, "optind %d argc %d\n", optind, argc);
goto usage;
}
}
if (argc == 1)
nr_cmd = NETMAP_BDG_LIST;