netmap: add support for multiple host rings

Some applications forward from/to host rings most or all of the
traffic received or sent on a physical interface. In these
cases it is desirable to have more than one pair of RX/TX host
rings, and to use multiple threads to speed up forwarding.
This change adds support for multiple host rings. On registering
a netmap port, the user can specify the number of desired receive
and transmit host rings in the nr_host_tx_rings and nr_host_rx_rings
fields of the nmreq_register structure.

MFC after:	2 weeks
This commit is contained in:
Vincenzo Maffione 2019-03-18 12:22:23 +00:00
parent 5c04f73e07
commit d12354a56c
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=345269
6 changed files with 161 additions and 110 deletions

View File

@ -1035,6 +1035,10 @@ netmap_do_unregif(struct netmap_priv_d *priv)
}
na->nm_krings_delete(na);
/* restore the default number of host tx and rx rings */
na->num_host_tx_rings = 1;
na->num_host_rx_rings = 1;
}
/* possibly decrement counter of tx_si/rx_si users */
@ -1575,6 +1579,19 @@ netmap_get_na(struct nmreq_header *hdr,
*na = ret;
netmap_adapter_get(ret);
/*
* if the adapter supports the host rings and it is not already open,
* try to set the number of host rings as requested by the user
*/
if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
if (req->nr_host_tx_rings)
(*na)->num_host_tx_rings = req->nr_host_tx_rings;
if (req->nr_host_rx_rings)
(*na)->num_host_rx_rings = req->nr_host_rx_rings;
}
nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
(*na)->num_host_rx_rings);
out:
if (error) {
if (ret)
@ -1856,6 +1873,25 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
priv->np_qfirst[t], priv->np_qlast[t]);
break;
case NR_REG_ONE_SW:
if (!(na->na_flags & NAF_HOST_RINGS)) {
nm_prerr("host rings not supported");
return EINVAL;
}
if (nr_ringid >= na->num_host_tx_rings &&
nr_ringid >= na->num_host_rx_rings) {
nm_prerr("invalid ring id %d", nr_ringid);
return EINVAL;
}
/* if not enough rings, use the first one */
j = nr_ringid;
if (j >= nma_get_host_nrings(na, t))
j = 0;
priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
priv->np_qfirst[t], priv->np_qlast[t]);
break;
default:
nm_prerr("invalid regif type %d", nr_mode);
return EINVAL;
@ -2546,6 +2582,8 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
req->nr_tx_rings = na->num_tx_rings;
req->nr_rx_slots = na->num_rx_desc;
req->nr_tx_slots = na->num_tx_desc;
req->nr_host_tx_rings = na->num_host_tx_rings;
req->nr_host_rx_rings = na->num_host_rx_rings;
error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
&req->nr_mem_id);
if (error) {
@ -2610,6 +2648,8 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
regreq.nr_rx_slots = req->nr_rx_slots;
regreq.nr_tx_rings = req->nr_tx_rings;
regreq.nr_rx_rings = req->nr_rx_rings;
regreq.nr_host_tx_rings = req->nr_host_tx_rings;
regreq.nr_host_rx_rings = req->nr_host_rx_rings;
regreq.nr_mem_id = req->nr_mem_id;
/* get a refcount */
@ -2647,6 +2687,8 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
req->nr_tx_rings = na->num_tx_rings;
req->nr_rx_slots = na->num_rx_desc;
req->nr_tx_slots = na->num_tx_desc;
req->nr_host_tx_rings = na->num_host_tx_rings;
req->nr_host_rx_rings = na->num_host_rx_rings;
} while (0);
netmap_unget_na(na, ifp);
NMG_UNLOCK();

View File

@ -68,6 +68,8 @@ nmreq_register_from_legacy(struct nmreq *nmr, struct nmreq_header *hdr,
req->nr_rx_slots = nmr->nr_rx_slots;
req->nr_tx_rings = nmr->nr_tx_rings;
req->nr_rx_rings = nmr->nr_rx_rings;
req->nr_host_tx_rings = 0;
req->nr_host_rx_rings = 0;
req->nr_mem_id = nmr->nr_arg2;
req->nr_ringid = nmr->nr_ringid & NETMAP_RING_MASK;
if ((nmr->nr_flags & NR_REG_MASK) == NR_REG_DEFAULT) {
@ -249,6 +251,8 @@ nmreq_from_legacy(struct nmreq *nmr, u_long ioctl_cmd)
req->nr_rx_slots = nmr->nr_rx_slots;
req->nr_tx_rings = nmr->nr_tx_rings;
req->nr_rx_rings = nmr->nr_rx_rings;
req->nr_host_tx_rings = 0;
req->nr_host_rx_rings = 0;
req->nr_mem_id = nmr->nr_arg2;
}
break;
@ -367,8 +371,8 @@ netmap_ioctl_legacy(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
struct nmreq *nmr = (struct nmreq *) data;
struct nmreq_header *hdr;
if (nmr->nr_version < 11) {
nm_prerr("Minimum supported API is 11 (requested %u)",
if (nmr->nr_version < 14) {
nm_prerr("Minimum supported API is 14 (requested %u)",
nmr->nr_version);
return EINVAL;
}

View File

@ -2012,6 +2012,10 @@ netmap_mem2_if_new(struct netmap_adapter *na, struct netmap_priv_d *priv)
/* initialize base fields -- override const */
*(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings;
*(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings;
*(u_int *)(uintptr_t)&nifp->ni_host_tx_rings =
(na->num_host_tx_rings ? na->num_host_tx_rings : 1);
*(u_int *)(uintptr_t)&nifp->ni_host_rx_rings =
(na->num_host_rx_rings ? na->num_host_rx_rings : 1);
strlcpy(nifp->ni_name, na->name, sizeof(nifp->ni_name));
/*

View File

@ -41,9 +41,9 @@
#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_
#define NETMAP_API 13 /* current API version */
#define NETMAP_API 14 /* current API version */
#define NETMAP_MIN_API 13 /* min and max versions accepted */
#define NETMAP_MIN_API 14 /* min and max versions accepted */
#define NETMAP_MAX_API 15
/*
* Some fields should be cache-aligned to reduce contention.
@ -64,34 +64,34 @@
KERNEL (opaque, obviously)
====================================================================
|
USERSPACE | struct netmap_ring
+---->+---------------+
/ | head,cur,tail |
struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+---------------+ / | other fields |
| ni_tx_rings | / +===============+
| ni_rx_rings | / | buf_idx, len | slot[0]
| | / | flags, ptr |
| | / +---------------+
+===============+ / | buf_idx, len | slot[1]
| txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
| txring_ofs[1] | +---------------+
(tx+1 entries) (num_slots entries)
| txring_ofs[t] | | buf_idx, len | slot[n-1]
+---------------+ | flags, ptr |
| rxring_ofs[0] | +---------------+
| rxring_ofs[1] |
(rx+1 entries)
| rxring_ofs[r] |
+---------------+
|
USERSPACE | struct netmap_ring
+---->+---------------+
/ | head,cur,tail |
struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+----------------+ / | other fields |
| ni_tx_rings | / +===============+
| ni_rx_rings | / | buf_idx, len | slot[0]
| | / | flags, ptr |
| | / +---------------+
+================+ / | buf_idx, len | slot[1]
| txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
| txring_ofs[1] | +---------------+
(tx+htx entries) (num_slots entries)
| txring_ofs[t] | | buf_idx, len | slot[n-1]
+----------------+ | flags, ptr |
| rxring_ofs[0] | +---------------+
| rxring_ofs[1] |
(rx+hrx entries)
| rxring_ofs[r] |
+----------------+
* For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to
* a file descriptor, the mmap()ed region contains a (logically readonly)
* struct netmap_if pointing to struct netmap_ring's.
*
* There is one netmap_ring per physical NIC ring, plus one tx/rx ring
* pair attached to the host stack (this pair is unused for non-NIC ports).
* There is one netmap_ring per physical NIC ring, plus at least one tx/rx ring
* pair attached to the host stack (these pairs are unused for non-NIC ports).
*
* All physical/host stack ports share the same memory region,
* so that zero-copy can be implemented between them.
@ -117,11 +117,6 @@
* as the index. On close, ni_bufs_head must point to the list of
* buffers to be released.
*
* + NIOCREGIF can request space for extra rings (and buffers)
* allocated in the same memory space. The number of extra rings
* is in nr_arg1, and is advisory. This is a no-op on NICs where
* the size of the memory space is fixed.
*
* + NIOCREGIF can attach to PIPE rings sharing the same memory
* space with a parent device. The ifname indicates the parent device,
* which must already exist. Flags in nr_flags indicate if we want to
@ -133,21 +128,22 @@
*
* Extra flags in nr_flags support the above functions.
* Application libraries may use the following naming scheme:
* netmap:foo all NIC ring pairs
* netmap:foo^ only host ring pair
* netmap:foo+ all NIC ring + host ring pairs
* netmap:foo-k the k-th NIC ring pair
* netmap:foo{k PIPE ring pair k, master side
* netmap:foo}k PIPE ring pair k, slave side
* netmap:foo all NIC ring pairs
* netmap:foo^ only host ring pairs
* netmap:foo^k the k-th host ring pair
* netmap:foo+ all NIC ring + host ring pairs
* netmap:foo-k the k-th NIC ring pair
* netmap:foo{k PIPE ring pair k, master side
* netmap:foo}k PIPE ring pair k, slave side
*
* Some notes about host rings:
*
* + The RX host ring is used to store those packets that the host network
* + The RX host rings are used to store those packets that the host network
* stack is trying to transmit through a NIC queue, but only if that queue
* is currently in netmap mode. Netmap will not intercept host stack mbufs
* designated to NIC queues that are not in netmap mode. As a consequence,
* registering a netmap port with netmap:foo^ is not enough to intercept
* mbufs in the RX host ring; the netmap port should be registered with
* mbufs in the RX host rings; the netmap port should be registered with
* netmap:foo*, or another registration should be done to open at least a
* NIC TX queue in netmap mode.
*
@ -157,7 +153,7 @@
* ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being
* used in netmap mode. If the offloadings are not disabled, GSO and/or
* unchecksummed packets may be dropped immediately or end up in the host RX
* ring, and will be dropped as soon as the packet reaches another netmap
* rings, and will be dropped as soon as the packet reaches another netmap
* adapter.
*/
@ -366,7 +362,7 @@ struct netmap_if {
/*
* The number of packet rings available in netmap mode.
* Physical NICs can have different numbers of tx and rx rings.
* Physical NICs also have a 'host' ring pair.
* Physical NICs also have at least one 'host' ring pair.
* Additionally, clients can request additional ring pairs to
* be used for internal communication.
*/
@ -374,14 +370,18 @@ struct netmap_if {
const uint32_t ni_rx_rings; /* number of HW rx rings */
uint32_t ni_bufs_head; /* head index for extra bufs */
uint32_t ni_spare1[5];
const uint32_t ni_host_tx_rings; /* number of SW tx rings */
const uint32_t ni_host_rx_rings; /* number of SW rx rings */
uint32_t ni_spare1[3];
/*
* The following array contains the offset of each netmap ring
* from this structure, in the following order:
* NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
* NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings.
* - NIC tx rings (ni_tx_rings);
* - host tx rings (ni_host_tx_rings);
* - NIC rx rings (ni_rx_rings);
* - host rx rings (ni_host_rx_rings);
*
* The area is filled up by the kernel on NIOCREGIF,
* The area is filled up by the kernel on NETMAP_REQ_REGISTER,
* and then only read by userspace code.
*/
const ssize_t ring_ofs[0];
@ -422,7 +422,8 @@ struct netmap_if {
* The request body (struct nmreq_register) has several arguments to
* specify how the port is to be registered.
*
* nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out)
* nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings,
* nr_host_tx_rings, nr_host_rx_rings (in/out)
* On input, non-zero values may be used to reconfigure the port
* according to the requested values, but this is not guaranteed.
* On output the actual values in use are reported.
@ -574,6 +575,8 @@ struct nmreq_register {
uint32_t nr_rx_slots; /* slots in rx rings */
uint16_t nr_tx_rings; /* number of tx rings */
uint16_t nr_rx_rings; /* number of rx rings */
uint16_t nr_host_tx_rings; /* number of host tx rings */
uint16_t nr_host_rx_rings; /* number of host rx rings */
uint16_t nr_mem_id; /* id of the memory allocator */
uint16_t nr_ringid; /* ring(s) we care about */
@ -592,9 +595,9 @@ struct nmreq_register {
#define NR_TX_RINGS_ONLY 0x4000
/* Applications set this flag if they are able to deal with virtio-net headers,
* that is send/receive frames that start with a virtio-net header.
* If not set, NIOCREGIF will fail with netmap ports that require applications
* to use those headers. If the flag is set, the application can use the
* NETMAP_VNET_HDR_GET command to figure out the header length. */
* If not set, NETMAP_REQ_REGISTER will fail with netmap ports that require
* applications to use those headers. If the flag is set, the application can
* use the NETMAP_VNET_HDR_GET command to figure out the header length. */
#define NR_ACCEPT_VNET_HDR 0x8000
/* The following two have the same meaning of NETMAP_NO_TX_POLL and
* NETMAP_DO_RX_POLL. */
@ -611,6 +614,7 @@ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */
NR_REG_PIPE_MASTER = 5, /* deprecated, use "x{y" port name syntax */
NR_REG_PIPE_SLAVE = 6, /* deprecated, use "x}y" port name syntax */
NR_REG_NULL = 7,
NR_REG_ONE_SW = 8,
};
/* A single ioctl number is shared by all the new API commands.
@ -622,7 +626,7 @@ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */
/* The ioctl commands to sync TX/RX netmap rings.
* NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
* whose identity is set in NIOCREGIF through nr_ringid.
* whose identity is set in NETMAP_REQ_REGISTER through nr_ringid.
* These are non blocking and take no argument. */
#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
@ -640,8 +644,10 @@ struct nmreq_port_info_get {
uint32_t nr_rx_slots; /* slots in rx rings */
uint16_t nr_tx_rings; /* number of tx rings */
uint16_t nr_rx_rings; /* number of rx rings */
uint16_t nr_host_tx_rings; /* number of host tx rings */
uint16_t nr_host_rx_rings; /* number of host rx rings */
uint16_t nr_mem_id; /* memory allocator id (in/out) */
uint16_t pad1;
uint16_t pad[3];
};
#define NM_BDG_NAME "vale" /* prefix for bridge port name */

View File

@ -99,14 +99,7 @@
* nr_flags is the recommended mode to indicate which rings should
* be bound to a file descriptor. Values are NR_REG_*
*
* nr_arg1 (in) The number of extra rings to be reserved.
* Especially when allocating a VALE port the system only
* allocates the amount of memory needed for the port.
* If more shared memory rings are desired (e.g. for pipes),
* the first invocation for the same basename/allocator
* should specify a suitable number. Memory cannot be
* extended after the first allocation without closing
* all ports on the same region.
* nr_arg1 (in) Reserved.
*
* nr_arg2 (in/out) The identity of the memory region used.
* On input, 0 means the system decides autonomously,
@ -188,7 +181,7 @@ struct nmreq {
#define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */
#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */
#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */
uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */
uint16_t nr_arg1; /* extra arguments */
#define NETMAP_BDG_HOST 1 /* nr_arg1 value for NETMAP_BDG_ATTACH */
uint16_t nr_arg2; /* id of the memory allocator */

View File

@ -93,6 +93,8 @@
#include <sys/socket.h> /* apple needs sockaddr */
#include <net/if.h> /* IFNAMSIZ */
#include <ctype.h>
#include <string.h> /* memset */
#include <sys/time.h> /* gettimeofday */
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
@ -111,7 +113,8 @@
nifp, (nifp)->ring_ofs[index] )
#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] )
nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + \
(nifp)->ni_host_tx_rings] )
#define NETMAP_BUF(ring, index) \
((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size))
@ -149,27 +152,6 @@ nm_ring_space(struct netmap_ring *ring)
return ret;
}
#ifdef NETMAP_WITH_LIBS
/*
* Support for simple I/O libraries.
* Include other system headers required for compiling this.
*/
#ifndef HAVE_NETMAP_WITH_LIBS
#define HAVE_NETMAP_WITH_LIBS
#include <stdio.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <string.h> /* memset */
#include <sys/ioctl.h>
#include <sys/errno.h> /* EINVAL */
#include <fcntl.h> /* O_RDWR */
#include <unistd.h> /* close() */
#include <signal.h>
#include <stdlib.h>
#ifndef ND /* debug macros */
/* debug support */
#define ND(_fmt, ...) do {} while(0)
@ -198,6 +180,53 @@ nm_ring_space(struct netmap_ring *ring)
} while (0)
#endif
/*
 * Copy l bytes of packet data from _src to _dst.
 *
 * This is a slightly optimized copy routine: when l is a positive
 * multiple of 64 smaller than 1024, the data is moved as eight 64-bit
 * words per iteration, which is often faster than dealing with other
 * odd sizes. Any other positive length (1024 bytes or more, or not a
 * multiple of 64) is handed to memcpy().
 *
 * A zero or negative l copies nothing. Without that check a negative
 * length that is not a multiple of 64 would be passed to memcpy() and
 * converted to a huge size_t.
 *
 * We assume there is enough room in the source and destination buffers.
 * NOTE(review): the word loop accesses both buffers through uint64_t
 * pointers, so it presumably relies on them being suitably aligned --
 * confirm against callers.
 */
static inline void
nm_pkt_copy(const void *_src, void *_dst, int l)
{
	const uint64_t *src = (const uint64_t *)_src;
	uint64_t *dst = (uint64_t *)_dst;

	/*
	 * Slow path: lengths the word loop cannot handle. Non-positive
	 * lengths are folded in here so the fast path keeps a single test.
	 */
	if (unlikely(l <= 0 || l >= 1024 || l % 64)) {
		if (l > 0)
			memcpy(dst, src, (size_t)l);
		return;
	}
	/* l is a positive multiple of 64: move one 64-byte chunk per pass */
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
#ifdef NETMAP_WITH_LIBS
/*
* Support for simple I/O libraries.
* Include other system headers required for compiling this.
*/
#ifndef HAVE_NETMAP_WITH_LIBS
#define HAVE_NETMAP_WITH_LIBS
#include <stdio.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/errno.h> /* EINVAL */
#include <fcntl.h> /* O_RDWR */
#include <unistd.h> /* close() */
#include <signal.h>
#include <stdlib.h>
struct nm_pkthdr { /* first part is the same as pcap_pkthdr */
struct timeval ts;
uint32_t caplen;
@ -268,33 +297,6 @@ struct nm_desc {
#define NETMAP_FD(d) (P2NMD(d)->fd)
/*
 * Copy l bytes of packet data from _src to _dst.
 *
 * This is a slightly optimized copy routine: when l is a positive
 * multiple of 64 smaller than 1024, the data is moved as eight 64-bit
 * words per iteration, which is often faster than dealing with other
 * odd sizes. Any other positive length (1024 bytes or more, or not a
 * multiple of 64) is handed to memcpy().
 *
 * A zero or negative l copies nothing. Without that check a negative
 * length that is not a multiple of 64 would be passed to memcpy() and
 * converted to a huge size_t.
 *
 * We assume there is enough room in the source and destination buffers.
 * NOTE(review): the word loop accesses both buffers through uint64_t
 * pointers, so it presumably relies on them being suitably aligned --
 * confirm against callers.
 */
static inline void
nm_pkt_copy(const void *_src, void *_dst, int l)
{
	const uint64_t *src = (const uint64_t *)_src;
	uint64_t *dst = (uint64_t *)_dst;

	/*
	 * Slow path: lengths the word loop cannot handle. Non-positive
	 * lengths are folded in here so the fast path keeps a single test.
	 */
	if (unlikely(l <= 0 || l >= 1024 || l % 64)) {
		if (l > 0)
			memcpy(dst, src, (size_t)l);
		return;
	}
	/* l is a positive multiple of 64: move one 64-byte chunk per pass */
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
/*