freebsd-dev/sys/dev/netmap/netmap_pipe.c
Luigi Rizzo 4bf50f18eb Update to the current version of netmap.
Mostly bugfixes or features developed in the past 6 months,
so this is a 10.1 candidate.

Basically no user API changes (some bugfixes in sys/net/netmap_user.h).

In detail:

1. netmap support for virtio-net, including in netmap mode.
  Under bhyve and with a netmap backend [2] we reach over 1Mpps
  with standard APIs (e.g. libpcap), and 5-8 Mpps in netmap mode.

2. (kernel) add support for multiple memory allocators, so we can
  better partition physical and virtual interfaces giving access
  to separate users. The most visible effect is one additional
  argument to the various kernel functions to compute buffer
  addresses. All netmap-supported drivers are affected, but changes
  are mechanical and trivial

3. (kernel) simplify the prototype for *txsync() and *rxsync()
  driver methods. All netmap drivers affected, changes mostly mechanical.

4. add support for netmap-monitor ports. Think of it as a mirroring
  port on a physical switch: a netmap monitor port replicates traffic
  present on the main port. Restrictions apply. Drive carefully.

5. if_lem.c: support for various paravirtualization features,
  experimental and disabled by default.
  Most of these are described in our ANCS'13 paper [1].
  Paravirtualized support in netmap mode is new, and beats the
  numbers in the paper by a large factor (under qemu-kvm,
  we measured gues-host throughput up to 10-12 Mpps).

A lot of refactoring and additional documentation in the files
in sys/dev/netmap, but apart from #2 and #3 above, almost nothing
of this stuff is visible to other kernel parts.

Example programs in tools/tools/netmap have been updated with bugfixes
and to support more of the existing features.

This is meant to go into 10.1 so we plan an MFC before the Aug.22 deadline.

A lot of this code has been contributed by my colleagues at UNIPI,
including Giuseppe Lettieri, Vincenzo Maffione, Stefano Garzarella.

MFC after:	3 days.
2014-08-16 15:00:01 +00:00

689 lines
19 KiB
C

/*
* Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h> /* defines used in kernel.h */
#include <sys/kernel.h> /* types used in module initialization */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/socket.h> /* sockaddrs */
#include <net/if.h>
#include <net/if_var.h>
#include <machine/bus.h> /* bus_dmamap_* */
#include <sys/refcount.h>
#elif defined(linux)
#include "bsd_glue.h"
#elif defined(__APPLE__)
#warning OSX support is only partial
#include "osx_glue.h"
#else
#error Unsupported platform
#endif /* unsupported */
/*
* common headers
*/
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>
#ifdef WITH_PIPES
#define NM_PIPE_MAXSLOTS 4096
int netmap_default_pipes = 0; /* default number of pipes for each nic */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , "");
/* allocate the pipe array in the parent adapter */
int
netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr)
{
size_t len;
int mode = nmr->nr_flags & NR_REG_MASK;
u_int npipes;
if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) {
/* this is for our parent, not for us */
return 0;
}
/* TODO: we can resize the array if the new
* request can accomodate the already existing pipes
*/
if (na->na_pipes) {
nmr->nr_arg1 = na->na_max_pipes;
return 0;
}
npipes = nmr->nr_arg1;
if (npipes == 0)
npipes = netmap_default_pipes;
nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL);
if (npipes == 0) {
/* really zero, nothing to alloc */
goto out;
}
len = sizeof(struct netmap_pipe_adapter *) * npipes;
na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (na->na_pipes == NULL)
return ENOMEM;
na->na_max_pipes = npipes;
na->na_next_pipe = 0;
out:
nmr->nr_arg1 = npipes;
return 0;
}
/* deallocate the parent array in the parent adapter */
void
netmap_pipe_dealloc(struct netmap_adapter *na)
{
if (na->na_pipes) {
ND("freeing pipes for %s", na->name);
free(na->na_pipes, M_DEVBUF);
na->na_pipes = NULL;
na->na_max_pipes = 0;
na->na_next_pipe = 0;
}
}
/* find a pipe endpoint with the given id among the parent's pipes */
static struct netmap_pipe_adapter *
netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id)
{
int i;
struct netmap_pipe_adapter *na;
for (i = 0; i < parent->na_next_pipe; i++) {
na = parent->na_pipes[i];
if (na->id == pipe_id) {
return na;
}
}
return NULL;
}
/* add a new pipe endpoint to the parent array */
static int
netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
{
if (parent->na_next_pipe >= parent->na_max_pipes) {
D("%s: no space left for pipes", parent->name);
return ENOMEM;
}
parent->na_pipes[parent->na_next_pipe] = na;
na->parent_slot = parent->na_next_pipe;
parent->na_next_pipe++;
return 0;
}
/* remove the given pipe endpoint from the parent array */
static void
netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
{
u_int n;
n = --parent->na_next_pipe;
if (n != na->parent_slot) {
parent->na_pipes[na->parent_slot] =
parent->na_pipes[n];
}
parent->na_pipes[n] = NULL;
}
static int
netmap_pipe_txsync(struct netmap_kring *txkring, int flags)
{
struct netmap_kring *rxkring = txkring->pipe;
u_int limit; /* slots to transfer */
u_int j, k, lim_tx = txkring->nkr_num_slots - 1,
lim_rx = rxkring->nkr_num_slots - 1;
int m, busy;
ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name);
ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail,
txkring->rcur, txkring->rhead, txkring->rtail);
j = rxkring->nr_hwtail; /* RX */
k = txkring->nr_hwcur; /* TX */
m = txkring->rhead - txkring->nr_hwcur; /* new slots */
if (m < 0)
m += txkring->nkr_num_slots;
limit = m;
m = rxkring->nkr_num_slots - 1; /* max avail space on destination */
busy = j - rxkring->nr_hwcur; /* busy slots */
if (busy < 0)
busy += txkring->nkr_num_slots;
m -= busy; /* subtract busy slots */
ND(2, "m %d limit %d", m, limit);
if (m < limit)
limit = m;
if (limit == 0) {
/* either the rxring is full, or nothing to send */
nm_txsync_finalize(txkring); /* actually useless */
return 0;
}
while (limit-- > 0) {
struct netmap_slot *rs = &rxkring->save_ring->slot[j];
struct netmap_slot *ts = &txkring->ring->slot[k];
struct netmap_slot tmp;
/* swap the slots */
tmp = *rs;
*rs = *ts;
*ts = tmp;
/* no need to report the buffer change */
j = nm_next(j, lim_rx);
k = nm_next(k, lim_tx);
}
wmb(); /* make sure the slots are updated before publishing them */
rxkring->nr_hwtail = j;
txkring->nr_hwcur = k;
txkring->nr_hwtail = nm_prev(k, lim_tx);
nm_txsync_finalize(txkring);
ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail,
txkring->rcur, txkring->rhead, txkring->rtail, j);
wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */
rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0);
return 0;
}
static int
netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags)
{
struct netmap_kring *txkring = rxkring->pipe;
uint32_t oldhwcur = rxkring->nr_hwcur;
ND("%s %x <- %s", rxkring->name, flags, txkring->name);
rxkring->nr_hwcur = rxkring->rhead; /* recover user-relased slots */
ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail,
rxkring->rcur, rxkring->rhead, rxkring->rtail);
rmb(); /* paired with the first wmb() in txsync */
nm_rxsync_finalize(rxkring);
if (oldhwcur != rxkring->nr_hwcur) {
/* we have released some slots, notify the other end */
wmb(); /* make sure nr_hwcur is updated before notifying */
txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0);
}
return 0;
}
/* Pipe endpoints are created and destroyed together, so that endopoints do not
* have to check for the existence of their peer at each ?xsync.
*
* To play well with the existing netmap infrastructure (refcounts etc.), we
* adopt the following strategy:
*
* 1) The first endpoint that is created also creates the other endpoint and
* grabs a reference to it.
*
* state A) user1 --> endpoint1 --> endpoint2
*
* 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives
* its reference to the user:
*
* state B) user1 --> endpoint1 endpoint2 <--- user2
*
* 3) Assume that, starting from state B endpoint2 is closed. In the unregister
* callback endpoint2 notes that endpoint1 is still active and adds a reference
* from endpoint1 to itself. When user2 then releases her own reference,
* endpoint2 is not destroyed and we are back to state A. A symmetrical state
* would be reached if endpoint1 were released instead.
*
* 4) If, starting from state A, endpoint1 is closed, the destructor notes that
* it owns a reference to endpoint2 and releases it.
*
* Something similar goes on for the creation and destruction of the krings.
*/
/* netmap_pipe_krings_delete.
*
* There are two cases:
*
* 1) state is
*
* usr1 --> e1 --> e2
*
* and we are e1. We have to create both sets
* of krings.
*
* 2) state is
*
* usr1 --> e1 --> e2
*
* and we are e2. e1 is certainly registered and our
* krings already exist, but they may be hidden.
*/
static int
netmap_pipe_krings_create(struct netmap_adapter *na)
{
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
struct netmap_adapter *ona = &pna->peer->up;
int error = 0;
if (pna->peer_ref) {
int i;
/* case 1) above */
D("%p: case 1, create everything", na);
error = netmap_krings_create(na, 0);
if (error)
goto err;
/* we also create all the rings, since we need to
* update the save_ring pointers.
* netmap_mem_rings_create (called by our caller)
* will not create the rings again
*/
error = netmap_mem_rings_create(na);
if (error)
goto del_krings1;
/* update our hidden ring pointers */
for (i = 0; i < na->num_tx_rings + 1; i++)
na->tx_rings[i].save_ring = na->tx_rings[i].ring;
for (i = 0; i < na->num_rx_rings + 1; i++)
na->rx_rings[i].save_ring = na->rx_rings[i].ring;
/* now, create krings and rings of the other end */
error = netmap_krings_create(ona, 0);
if (error)
goto del_rings1;
error = netmap_mem_rings_create(ona);
if (error)
goto del_krings2;
for (i = 0; i < ona->num_tx_rings + 1; i++)
ona->tx_rings[i].save_ring = ona->tx_rings[i].ring;
for (i = 0; i < ona->num_rx_rings + 1; i++)
ona->rx_rings[i].save_ring = ona->rx_rings[i].ring;
/* cross link the krings */
for (i = 0; i < na->num_tx_rings; i++) {
na->tx_rings[i].pipe = pna->peer->up.rx_rings + i;
na->rx_rings[i].pipe = pna->peer->up.tx_rings + i;
pna->peer->up.tx_rings[i].pipe = na->rx_rings + i;
pna->peer->up.rx_rings[i].pipe = na->tx_rings + i;
}
} else {
int i;
/* case 2) above */
/* recover the hidden rings */
ND("%p: case 2, hidden rings", na);
for (i = 0; i < na->num_tx_rings + 1; i++)
na->tx_rings[i].ring = na->tx_rings[i].save_ring;
for (i = 0; i < na->num_rx_rings + 1; i++)
na->rx_rings[i].ring = na->rx_rings[i].save_ring;
}
return 0;
del_krings2:
netmap_krings_delete(ona);
del_rings1:
netmap_mem_rings_delete(na);
del_krings1:
netmap_krings_delete(na);
err:
return error;
}
/* netmap_pipe_reg.
*
* There are two cases on registration (onoff==1)
*
* 1.a) state is
*
* usr1 --> e1 --> e2
*
* and we are e1. Nothing special to do.
*
* 1.b) state is
*
* usr1 --> e1 --> e2 <-- usr2
*
* and we are e2. Drop the ref e1 is holding.
*
* There are two additional cases on unregister (onoff==0)
*
* 2.a) state is
*
* usr1 --> e1 --> e2
*
* and we are e1. Nothing special to do, e2 will
* be cleaned up by the destructor of e1.
*
* 2.b) state is
*
* usr1 --> e1 e2 <-- usr2
*
* and we are either e1 or e2. Add a ref from the
* other end and hide our rings.
*/
static int
netmap_pipe_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
ND("%p: onoff %d", na, onoff);
if (onoff) {
na->na_flags |= NAF_NETMAP_ON;
} else {
na->na_flags &= ~NAF_NETMAP_ON;
}
if (pna->peer_ref) {
ND("%p: case 1.a or 2.a, nothing to do", na);
return 0;
}
if (onoff) {
ND("%p: case 1.b, drop peer", na);
pna->peer->peer_ref = 0;
netmap_adapter_put(na);
} else {
int i;
ND("%p: case 2.b, grab peer", na);
netmap_adapter_get(na);
pna->peer->peer_ref = 1;
/* hide our rings from netmap_mem_rings_delete */
for (i = 0; i < na->num_tx_rings + 1; i++) {
na->tx_rings[i].ring = NULL;
}
for (i = 0; i < na->num_rx_rings + 1; i++) {
na->rx_rings[i].ring = NULL;
}
}
return 0;
}
/* netmap_pipe_krings_delete.
*
* There are two cases:
*
* 1) state is
*
* usr1 --> e1 --> e2
*
* and we are e1 (e2 is not registered, so krings_delete cannot be
* called on it);
*
* 2) state is
*
* usr1 --> e1 e2 <-- usr2
*
* and we are either e1 or e2.
*
* In the former case we have to also delete the krings of e2;
* in the latter case we do nothing (note that our krings
* have already been hidden in the unregister callback).
*/
static void
netmap_pipe_krings_delete(struct netmap_adapter *na)
{
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
struct netmap_adapter *ona; /* na of the other end */
int i;
if (!pna->peer_ref) {
ND("%p: case 2, kept alive by peer", na);
return;
}
/* case 1) above */
ND("%p: case 1, deleting everyhing", na);
netmap_krings_delete(na); /* also zeroes tx_rings etc. */
/* restore the ring to be deleted on the peer */
ona = &pna->peer->up;
if (ona->tx_rings == NULL) {
/* already deleted, we must be on an
* cleanup-after-error path */
return;
}
for (i = 0; i < ona->num_tx_rings + 1; i++)
ona->tx_rings[i].ring = ona->tx_rings[i].save_ring;
for (i = 0; i < ona->num_rx_rings + 1; i++)
ona->rx_rings[i].ring = ona->rx_rings[i].save_ring;
netmap_mem_rings_delete(ona);
netmap_krings_delete(ona);
}
static void
netmap_pipe_dtor(struct netmap_adapter *na)
{
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
ND("%p", na);
if (pna->peer_ref) {
ND("%p: clean up peer", na);
pna->peer_ref = 0;
netmap_adapter_put(&pna->peer->up);
}
if (pna->role == NR_REG_PIPE_MASTER)
netmap_pipe_remove(pna->parent, pna);
netmap_adapter_put(pna->parent);
pna->parent = NULL;
}
int
netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
struct nmreq pnmr;
struct netmap_adapter *pna; /* parent adapter */
struct netmap_pipe_adapter *mna, *sna, *req;
u_int pipe_id;
int role = nmr->nr_flags & NR_REG_MASK;
int error;
ND("flags %x", nmr->nr_flags);
if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) {
ND("not a pipe");
return 0;
}
role = nmr->nr_flags & NR_REG_MASK;
/* first, try to find the parent adapter */
bzero(&pnmr, sizeof(pnmr));
memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ);
/* pass to parent the requested number of pipes */
pnmr.nr_arg1 = nmr->nr_arg1;
error = netmap_get_na(&pnmr, &pna, create);
if (error) {
ND("parent lookup failed: %d", error);
return error;
}
ND("found parent: %s", na->name);
if (NETMAP_OWNED_BY_KERN(pna)) {
ND("parent busy");
error = EBUSY;
goto put_out;
}
/* next, lookup the pipe id in the parent list */
req = NULL;
pipe_id = nmr->nr_ringid & NETMAP_RING_MASK;
mna = netmap_pipe_find(pna, pipe_id);
if (mna) {
if (mna->role == role) {
ND("found %d directly at %d", pipe_id, mna->parent_slot);
req = mna;
} else {
ND("found %d indirectly at %d", pipe_id, mna->parent_slot);
req = mna->peer;
}
/* the pipe we have found already holds a ref to the parent,
* so we need to drop the one we got from netmap_get_na()
*/
netmap_adapter_put(pna);
goto found;
}
ND("pipe %d not found, create %d", pipe_id, create);
if (!create) {
error = ENODEV;
goto put_out;
}
/* we create both master and slave.
* The endpoint we were asked for holds a reference to
* the other one.
*/
mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
if (mna == NULL) {
error = ENOMEM;
goto put_out;
}
snprintf(mna->up.name, sizeof(mna->up.name), "%s{%d", pna->name, pipe_id);
mna->id = pipe_id;
mna->role = NR_REG_PIPE_MASTER;
mna->parent = pna;
mna->up.nm_txsync = netmap_pipe_txsync;
mna->up.nm_rxsync = netmap_pipe_rxsync;
mna->up.nm_register = netmap_pipe_reg;
mna->up.nm_dtor = netmap_pipe_dtor;
mna->up.nm_krings_create = netmap_pipe_krings_create;
mna->up.nm_krings_delete = netmap_pipe_krings_delete;
mna->up.nm_mem = pna->nm_mem;
mna->up.na_lut = pna->na_lut;
mna->up.na_lut_objtotal = pna->na_lut_objtotal;
mna->up.na_lut_objsize = pna->na_lut_objsize;
mna->up.num_tx_rings = 1;
mna->up.num_rx_rings = 1;
mna->up.num_tx_desc = nmr->nr_tx_slots;
nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc,
1, NM_PIPE_MAXSLOTS, NULL);
mna->up.num_rx_desc = nmr->nr_rx_slots;
nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc,
1, NM_PIPE_MAXSLOTS, NULL);
error = netmap_attach_common(&mna->up);
if (error)
goto free_mna;
/* register the master with the parent */
error = netmap_pipe_add(pna, mna);
if (error)
goto free_mna;
/* create the slave */
sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
if (sna == NULL) {
error = ENOMEM;
goto free_mna;
}
/* most fields are the same, copy from master and then fix */
*sna = *mna;
snprintf(sna->up.name, sizeof(sna->up.name), "%s}%d", pna->name, pipe_id);
sna->role = NR_REG_PIPE_SLAVE;
error = netmap_attach_common(&sna->up);
if (error)
goto free_sna;
/* join the two endpoints */
mna->peer = sna;
sna->peer = mna;
/* we already have a reference to the parent, but we
* need another one for the other endpoint we created
*/
netmap_adapter_get(pna);
if (role == NR_REG_PIPE_MASTER) {
req = mna;
mna->peer_ref = 1;
netmap_adapter_get(&sna->up);
} else {
req = sna;
sna->peer_ref = 1;
netmap_adapter_get(&mna->up);
}
ND("created master %p and slave %p", mna, sna);
found:
ND("pipe %d %s at %p", pipe_id,
(req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req);
*na = &req->up;
netmap_adapter_get(*na);
/* write the configuration back */
nmr->nr_tx_rings = req->up.num_tx_rings;
nmr->nr_rx_rings = req->up.num_rx_rings;
nmr->nr_tx_slots = req->up.num_tx_desc;
nmr->nr_rx_slots = req->up.num_rx_desc;
/* keep the reference to the parent.
* It will be released by the req destructor
*/
return 0;
free_sna:
free(sna, M_DEVBUF);
free_mna:
free(mna, M_DEVBUF);
put_out:
netmap_adapter_put(pna);
return error;
}
#endif /* WITH_PIPES */