305 lines
9.9 KiB
Groff
305 lines
9.9 KiB
Groff
.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa
|
|
.\" All rights reserved.
|
|
.\"
|
|
.\" Redistribution and use in source and binary forms, with or without
|
|
.\" modification, are permitted provided that the following conditions
|
|
.\" are met:
|
|
.\" 1. Redistributions of source code must retain the above copyright
|
|
.\" notice, this list of conditions and the following disclaimer.
|
|
.\" 2. Redistributions in binary form must reproduce the above copyright
|
|
.\" notice, this list of conditions and the following disclaimer in the
|
|
.\" documentation and/or other materials provided with the distribution.
|
|
.\"
|
|
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
.\" SUCH DAMAGE.
|
|
.\"
|
|
.\" This document is derived in part from the enet man page (enet.4)
|
|
.\" distributed with 4.3BSD Unix.
|
|
.\"
|
|
.\" $FreeBSD$
|
|
.\" $Id: netmap.4 9662 2011-11-16 13:18:06Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $
|
|
.\"
|
|
.Dd February 27, 2012
|
|
.Dt NETMAP 4
|
|
.Os
|
|
.Sh NAME
|
|
.Nm netmap
|
|
.Nd a framework for fast packet I/O
|
|
.Sh SYNOPSIS
|
|
.Cd device netmap
|
|
.Sh DESCRIPTION
|
|
.Nm
|
|
is a framework for fast and safe access to network devices
|
|
(reaching 14.88 Mpps at less than 1 GHz).
|
|
.Nm
|
|
uses memory mapped buffers and metadata
|
|
(buffer indexes and lengths) to communicate with the kernel,
|
|
which is in charge of validating information through
|
|
.Pa ioctl()
|
|
and
|
|
.Pa select()/poll().
|
|
.Nm
|
|
can exploit the parallelism in multiqueue devices and
|
|
multicore systems.
|
|
.Pp
|
|
.Nm
|
|
requires explicit support in device drivers.
|
|
For a list of supported devices, see the end of this manual page.
|
|
.Sh OPERATION
|
|
.Nm
|
|
clients must first call
|
|
.Pa open("/dev/netmap") ,
|
|
and then issue an
|
|
.Pa ioctl(...,NIOCREGIF,...)
|
|
to bind the file descriptor to a network device.
|
|
.Pp
|
|
When a device is put in
|
|
.Nm
|
|
mode, its data path is disconnected from the host stack.
|
|
The processes owning the file descriptor
|
|
can exchange packets with the device, or with the host stack,
|
|
through an mmapped memory region that contains pre-allocated
|
|
buffers and metadata.
|
|
.Pp
|
|
Non-blocking I/O is done with special
|
|
.Pa ioctl()'s ,
|
|
whereas the file descriptor can be passed to
|
|
.Pa select()/poll()
|
|
to be notified about incoming packets or available transmit buffers.
|
|
.Ss Data structures
|
|
All data structures for all devices in
|
|
.Nm
|
|
mode are in a memory
|
|
region shared by the kernel and all processes
|
|
who open
|
|
.Pa /dev/netmap
|
|
(NOTE: visibility may be restricted in future implementations).
|
|
All references between the shared data structures
|
|
are relative (offsets or indexes). Some macros help converting
|
|
them into actual pointers.
|
|
.Pp
|
|
The data structures in shared memory are the following:
|
|
.Bl -tag -width XXX
|
|
.It Dv struct netmap_if (one per interface)
|
|
indicates the number of rings supported by an interface, their
|
|
sizes, and the offsets of the
|
|
.Pa netmap_rings
|
|
associated to the interface.
|
|
The offset of a
|
|
.Pa struct netmap_if
|
|
in the shared memory region is indicated by the
|
|
.Pa nr_offset
|
|
field in the structure returned by the
|
|
.Pa NIOCREGIF
|
|
(see below).
|
|
.Bd -literal
|
|
struct netmap_if {
|
|
char ni_name[IFNAMSIZ]; /* name of the interface. */
|
|
const u_int ni_num_queues; /* number of hw ring pairs */
|
|
const ssize_t ring_ofs[]; /* offset of tx and rx rings */
|
|
};
|
|
.Ed
|
|
.It Dv struct netmap_ring (one per ring)
|
|
contains the index of the current read or write slot (cur),
|
|
the number of slots available for reception or transmission (avail),
|
|
and an array of
|
|
.Pa slots
|
|
describing the buffers.
|
|
There is one ring pair for each of the N hardware ring pairs
|
|
supported by the card (numbered 0..N-1), plus
|
|
one ring pair (numbered N) for packets from/to the host stack.
|
|
.Bd -literal
|
|
struct netmap_ring {
|
|
const ssize_t buf_ofs;
|
|
const uint32_t num_slots; /* number of slots in the ring. */
|
|
uint32_t avail; /* number of usable slots */
|
|
uint32_t cur; /* 'current' index for the user side */
|
|
uint32_t reserved; /* not refilled before current */
|
|
|
|
const uint16_t nr_buf_size;
|
|
uint16_t flags;
|
|
struct netmap_slot slot[0]; /* array of slots. */
|
|
};
|
|
.Ed
|
|
.It Dv struct netmap_slot (one per packet)
|
|
contains the metadata for a packet: a buffer index (buf_idx),
|
|
a buffer length (len), and some flags.
|
|
.Bd -literal
|
|
struct netmap_slot {
|
|
uint32_t buf_idx; /* buffer index */
|
|
uint16_t len; /* packet length */
|
|
uint16_t flags; /* buf changed, etc. */
|
|
#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
|
|
#define NS_REPORT 0x0002 /* tell hw to report results
|
|
* e.g. by generating an interrupt
|
|
*/
|
|
};
|
|
.Ed
|
|
.It Dv packet buffers
|
|
are fixed size (approximately 2k) buffers allocated by the kernel
|
|
that contain packet data. Buffer addresses are computed through
|
|
macros.
|
|
.El
|
|
.Pp
|
|
Some macros support the access to objects in the shared memory
|
|
region. In particular:
|
|
.Bd -literal
|
|
struct netmap_if *nifp;
|
|
struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
|
|
struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
|
|
int i = txring->slot[txring->cur].buf_idx;
|
|
char *buf = NETMAP_BUF(txring, i);
|
|
.Ed
|
|
.Ss IOCTLS
|
|
.Nm
|
|
supports some ioctl() to synchronize the state of the rings
|
|
between the kernel and the user processes, plus some
|
|
to query and configure the interface.
|
|
The former do not require any argument, whereas the latter
|
|
use a
|
|
.Pa struct nmreq
|
|
defined as follows:
|
|
.Bd -literal
|
|
struct nmreq {
|
|
char nr_name[IFNAMSIZ];
|
|
uint32_t nr_version; /* API version */
|
|
#define NETMAP_API 2 /* current version */
|
|
uint32_t nr_offset; /* nifp offset in the shared region */
|
|
uint32_t nr_memsize; /* size of the shared region */
|
|
uint32_t nr_tx_slots; /* slots in tx rings */
|
|
uint32_t nr_rx_slots; /* slots in rx rings */
|
|
uint16_t nr_tx_rings; /* number of tx rings */
|
|
uint16_t nr_rx_rings; /* number of rx rings */
|
|
uint16_t nr_ringid; /* ring(s) we care about */
|
|
#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */
|
|
#define NETMAP_SW_RING 0x2000 /* we process the sw ring */
|
|
#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
|
|
#define NETMAP_RING_MASK 0xfff /* the actual ring number */
|
|
};
|
|
|
|
.Ed
|
|
A device descriptor obtained through
|
|
.Pa /dev/netmap
|
|
also supports the ioctls supported by network devices.
|
|
.Pp
|
|
The netmap-specific
|
|
.Xr ioctl 2
|
|
command codes below are defined in
|
|
.In net/netmap.h
|
|
and are:
|
|
.Bl -tag -width XXXX
|
|
.It Dv NIOCGINFO
|
|
returns information about the interface named in nr_name.
|
|
On return, nr_memsize indicates the size of the shared netmap
|
|
memory region (this is device-independent),
|
|
nr_tx_slots and nr_rx_slots indicate how many buffers are in a
|
|
transmit and receive ring,
|
|
nr_tx_rings and nr_rx_rings indicate the number of transmit
|
|
and receive rings supported by the hardware.
|
|
.Pp
|
|
If the device does not support netmap, the ioctl returns EINVAL.
|
|
.It Dv NIOCREGIF
|
|
puts the interface named in nr_name into netmap mode, disconnecting
|
|
it from the host stack, and/or defines which rings are controlled
|
|
through this file descriptor.
|
|
On return, it gives the same info as NIOCGINFO, and nr_ringid
|
|
indicates the identity of the rings controlled through the file
|
|
descriptor.
|
|
.Pp
|
|
Possible values for nr_ringid are
|
|
.Bl -tag -width XXXXX
|
|
.It 0
|
|
default, all hardware rings
|
|
.It NETMAP_SW_RING
|
|
the ``host rings'' connecting to the host stack
|
|
.It NETMAP_HW_RING + i
|
|
the i-th hardware ring
|
|
.El
|
|
By default, a
|
|
.Nm poll
|
|
or
|
|
.Nm select
|
|
call pushes out any pending packets on the transmit ring, even if
|
|
no write events are specified.
|
|
The feature can be disabled by or-ing
|
|
.Nm NETMAP_NO_TX_POLL
|
|
to nr_ringid.
|
|
But normally you should keep this feature unless you are using
|
|
separate file descriptors for the send and receive rings, because
|
|
otherwise packets are pushed out only if NIOCTXSYNC is called,
|
|
or the send queue is full.
|
|
.Pp
|
|
.Pa NIOCREGIF
|
|
can be used multiple times to change the association of a
|
|
file descriptor to a ring pair, always within the same device.
|
|
.It Dv NIOCUNREGIF
|
|
brings an interface back to normal mode.
|
|
.It Dv NIOCTXSYNC
|
|
tells the hardware of new packets to transmit, and updates the
|
|
number of slots available for transmission.
|
|
.It Dv NIOCRXSYNC
|
|
tells the hardware of consumed packets, and asks for newly available
|
|
packets.
|
|
.El
|
|
.Ss SYSTEM CALLS
|
|
.Nm
|
|
uses
|
|
.Nm select
|
|
and
|
|
.Nm poll
|
|
to wake up processes when significant events occur.
|
|
.Sh EXAMPLES
|
|
The following code implements a traffic generator:
|
|
.Pp
|
|
.Bd -literal -compact
|
|
#include <net/netmap.h>
|
|
#include <net/netmap_user.h>
|
|
struct netmap_if *nifp;
|
|
struct netmap_ring *ring;
|
|
struct nmreq nmr;
|
|
|
|
fd = open("/dev/netmap", O_RDWR);
|
|
bzero(&nmr, sizeof(nmr));
|
|
strcpy(nmr.nr_name, "ix0");
|
|
nmr.nr_version = NETMAP_API;
|
|
ioctl(fd, NIOCREGIF, &nmr);
|
|
p = mmap(0, nmr.nr_memsize, fd);
|
|
nifp = NETMAP_IF(p, nmr.nr_offset);
|
|
ring = NETMAP_TXRING(nifp, 0);
|
|
fds.fd = fd;
|
|
fds.events = POLLOUT;
|
|
for (;;) {
|
|
poll(&fds, 1, -1);
|
|
while (ring->avail-- > 0) {
|
|
i = ring->cur;
|
|
buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
|
|
... prepare packet in buf ...
|
|
ring->slot[i].len = ... packet length ...
|
|
ring->cur = NETMAP_RING_NEXT(ring, i);
|
|
}
|
|
}
|
|
.Ed
|
|
.Sh SUPPORTED INTERFACES
|
|
.Nm
|
|
supports the following interfaces:
|
|
.Xr em 4 ,
|
|
.Xr ixgbe 4 ,
|
|
.Xr re 4 .
|
|
.Sh AUTHORS
|
|
The
|
|
.Nm
|
|
framework has been designed and implemented by
|
|
.An Luigi Rizzo
|
|
and
|
|
.An Matteo Landi
|
|
in 2011 at the Universita` di Pisa.
|