From d12354a56c82993a353903ada310952bda231409 Mon Sep 17 00:00:00 2001 From: Vincenzo Maffione Date: Mon, 18 Mar 2019 12:22:23 +0000 Subject: [PATCH] netmap: add support for multiple host rings Some applications forward from/to host rings most or all the traffic received or sent on a physical interface. In this cases it is desirable to have more than a pair of RX/TX host rings, and use multiple threads to speed up forwarding. This change adds support for multiple host rings. On registering a netmap port, the user can specify the number of desired receive and transmit host rings in the nr_host_tx_rings and nr_host_rx_rings fields of the nmreq_register structure. MFC after: 2 weeks --- sys/dev/netmap/netmap.c | 42 +++++++++++++ sys/dev/netmap/netmap_legacy.c | 8 ++- sys/dev/netmap/netmap_mem2.c | 4 ++ sys/net/netmap.h | 106 +++++++++++++++++---------------- sys/net/netmap_legacy.h | 11 +--- sys/net/netmap_user.h | 100 ++++++++++++++++--------------- 6 files changed, 161 insertions(+), 110 deletions(-) diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index be7e636ae04c..43b383496a2e 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1035,6 +1035,10 @@ netmap_do_unregif(struct netmap_priv_d *priv) } na->nm_krings_delete(na); + + /* restore the default number of host tx and rx rings */ + na->num_host_tx_rings = 1; + na->num_host_rx_rings = 1; } /* possibily decrement counter of tx_si/rx_si users */ @@ -1575,6 +1579,19 @@ netmap_get_na(struct nmreq_header *hdr, *na = ret; netmap_adapter_get(ret); + /* + * if the adapter supports the host rings and it is not alread open, + * try to set the number of host rings as requested by the user + */ + if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) { + if (req->nr_host_tx_rings) + (*na)->num_host_tx_rings = req->nr_host_tx_rings; + if (req->nr_host_rx_rings) + (*na)->num_host_rx_rings = req->nr_host_rx_rings; + } + nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings, + (*na)->num_host_rx_rings); + out: if (error) { if (ret) @@ -1856,6 +1873,25 @@ netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode, nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t), priv->np_qfirst[t], priv->np_qlast[t]); break; + case NR_REG_ONE_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + nm_prerr("host rings not supported"); + return EINVAL; + } + if (nr_ringid >= na->num_host_tx_rings && + nr_ringid >= na->num_host_rx_rings) { + nm_prerr("invalid ring id %d", nr_ringid); + return EINVAL; + } + /* if not enough rings, use the first one */ + j = nr_ringid; + if (j >= nma_get_host_nrings(na, t)) + j = 0; + priv->np_qfirst[t] = nma_get_nrings(na, t) + j; + priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1; + nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t), + priv->np_qfirst[t], priv->np_qlast[t]); + break; default: nm_prerr("invalid regif type %d", nr_mode); return EINVAL; @@ -2546,6 +2582,8 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, req->nr_tx_rings = na->num_tx_rings; req->nr_rx_slots = na->num_rx_desc; req->nr_tx_slots = na->num_tx_desc; + req->nr_host_tx_rings = na->num_host_tx_rings; + req->nr_host_rx_rings = na->num_host_rx_rings; error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags, &req->nr_mem_id); if (error) { @@ -2610,6 +2648,8 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, regreq.nr_rx_slots = req->nr_rx_slots; regreq.nr_tx_rings = req->nr_tx_rings; regreq.nr_rx_rings = req->nr_rx_rings; + regreq.nr_host_tx_rings = req->nr_host_tx_rings; + regreq.nr_host_rx_rings = req->nr_host_rx_rings; regreq.nr_mem_id = req->nr_mem_id; /* get a refcount */ @@ -2647,6 +2687,8 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, req->nr_tx_rings = na->num_tx_rings; req->nr_rx_slots = na->num_rx_desc; req->nr_tx_slots = na->num_tx_desc; + req->nr_host_tx_rings = na->num_host_tx_rings; + req->nr_host_rx_rings = na->num_host_rx_rings; } while (0); netmap_unget_na(na, ifp); NMG_UNLOCK(); diff --git a/sys/dev/netmap/netmap_legacy.c b/sys/dev/netmap/netmap_legacy.c index afbd5ced8510..dd875c80e6aa 100644 --- a/sys/dev/netmap/netmap_legacy.c +++ b/sys/dev/netmap/netmap_legacy.c @@ -68,6 +68,8 @@ nmreq_register_from_legacy(struct nmreq *nmr, struct nmreq_header *hdr, req->nr_rx_slots = nmr->nr_rx_slots; req->nr_tx_rings = nmr->nr_tx_rings; req->nr_rx_rings = nmr->nr_rx_rings; + req->nr_host_tx_rings = 0; + req->nr_host_rx_rings = 0; req->nr_mem_id = nmr->nr_arg2; req->nr_ringid = nmr->nr_ringid & NETMAP_RING_MASK; if ((nmr->nr_flags & NR_REG_MASK) == NR_REG_DEFAULT) { @@ -249,6 +251,8 @@ nmreq_from_legacy(struct nmreq *nmr, u_long ioctl_cmd) req->nr_rx_slots = nmr->nr_rx_slots; req->nr_tx_rings = nmr->nr_tx_rings; req->nr_rx_rings = nmr->nr_rx_rings; + req->nr_host_tx_rings = 0; + req->nr_host_rx_rings = 0; req->nr_mem_id = nmr->nr_arg2; } break; @@ -367,8 +371,8 @@ netmap_ioctl_legacy(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct nmreq *nmr = (struct nmreq *) data; struct nmreq_header *hdr; - if (nmr->nr_version < 11) { - nm_prerr("Minimum supported API is 11 (requested %u)", + if (nmr->nr_version < 14) { + nm_prerr("Minimum supported API is 14 (requested %u)", nmr->nr_version); return EINVAL; } diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index 8aa7e3f46409..fc0b62d2abff 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -2012,6 +2012,10 @@ netmap_mem2_if_new(struct netmap_adapter *na, struct netmap_priv_d *priv) /* initialize base fields -- override const */ *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; + *(u_int *)(uintptr_t)&nifp->ni_host_tx_rings = + (na->num_host_tx_rings ? na->num_host_tx_rings : 1); + *(u_int *)(uintptr_t)&nifp->ni_host_rx_rings = + (na->num_host_rx_rings ? na->num_host_rx_rings : 1); strlcpy(nifp->ni_name, na->name, sizeof(nifp->ni_name)); /* diff --git a/sys/net/netmap.h b/sys/net/netmap.h index bb38c748f840..dd68953ff36c 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -41,9 +41,9 @@ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ -#define NETMAP_API 13 /* current API version */ +#define NETMAP_API 14 /* current API version */ -#define NETMAP_MIN_API 13 /* min and max versions accepted */ +#define NETMAP_MIN_API 14 /* min and max versions accepted */ #define NETMAP_MAX_API 15 /* * Some fields should be cache-aligned to reduce contention. @@ -64,34 +64,34 @@ KERNEL (opaque, obviously) ==================================================================== - | - USERSPACE | struct netmap_ring - +---->+---------------+ - / | head,cur,tail | - struct netmap_if (nifp, 1 per fd) / | buf_ofs | - +---------------+ / | other fields | - | ni_tx_rings | / +===============+ - | ni_rx_rings | / | buf_idx, len | slot[0] - | | / | flags, ptr | - | | / +---------------+ - +===============+ / | buf_idx, len | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | - | txring_ofs[1] | +---------------+ - (tx+1 entries) (num_slots entries) - | txring_ofs[t] | | buf_idx, len | slot[n-1] - +---------------+ | flags, ptr | - | rxring_ofs[0] | +---------------+ - | rxring_ofs[1] | - (rx+1 entries) - | rxring_ofs[r] | - +---------------+ + | + USERSPACE | struct netmap_ring + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +----------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +================+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+htx entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +----------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ + | rxring_ofs[1] | + (rx+hrx entries) + | rxring_ofs[r] | + +----------------+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. * - * There is one netmap_ring per physical NIC ring, plus one tx/rx ring - * pair attached to the host stack (this pair is unused for non-NIC ports). + * There is one netmap_ring per physical NIC ring, plus at least one tx/rx ring + * pair attached to the host stack (these pairs are unused for non-NIC ports). * * All physical/host stack ports share the same memory region, * so that zero-copy can be implemented between them. @@ -117,11 +117,6 @@ * as the index. On close, ni_bufs_head must point to the list of * buffers to be released. * - * + NIOCREGIF can request space for extra rings (and buffers) - * allocated in the same memory space. The number of extra rings - * is in nr_arg1, and is advisory. This is a no-op on NICs where - * the size of the memory space is fixed. - * * + NIOCREGIF can attach to PIPE rings sharing the same memory * space with a parent device. The ifname indicates the parent device, * which must already exist. Flags in nr_flags indicate if we want to @@ -133,21 +128,22 @@ * * Extra flags in nr_flags support the above functions. * Application libraries may use the following naming scheme: - * netmap:foo all NIC ring pairs - * netmap:foo^ only host ring pair - * netmap:foo+ all NIC ring + host ring pairs - * netmap:foo-k the k-th NIC ring pair - * netmap:foo{k PIPE ring pair k, master side - * netmap:foo}k PIPE ring pair k, slave side + * netmap:foo all NIC rings pairs + * netmap:foo^ only host rings pairs + * netmap:foo^k the k-th host rings pair + * netmap:foo+ all NIC rings + host rings pairs + * netmap:foo-k the k-th NIC rings pair + * netmap:foo{k PIPE rings pair k, master side + * netmap:foo}k PIPE rings pair k, slave side * * Some notes about host rings: * - * + The RX host ring is used to store those packets that the host network + * + The RX host rings are used to store those packets that the host network * stack is trying to transmit through a NIC queue, but only if that queue * is currently in netmap mode. Netmap will not intercept host stack mbufs * designated to NIC queues that are not in netmap mode. As a consequence, * registering a netmap port with netmap:foo^ is not enough to intercept - * mbufs in the RX host ring; the netmap port should be registered with + * mbufs in the RX host rings; the netmap port should be registered with * netmap:foo*, or another registration should be done to open at least a * NIC TX queue in netmap mode. * @@ -157,7 +153,7 @@ * ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being * used in netmap mode. If the offloadings are not disabled, GSO and/or * unchecksummed packets may be dropped immediately or end up in the host RX - * ring, and will be dropped as soon as the packet reaches another netmap + * rings, and will be dropped as soon as the packet reaches another netmap * adapter. */ @@ -366,7 +362,7 @@ struct netmap_if { /* * The number of packet rings available in netmap mode. * Physical NICs can have different numbers of tx and rx rings. - * Physical NICs also have a 'host' ring pair. + * Physical NICs also have at least a 'host' rings pair. * Additionally, clients can request additional ring pairs to * be used for internal communication. */ @@ -374,14 +370,18 @@ struct netmap_if { const uint32_t ni_rx_rings; /* number of HW rx rings */ uint32_t ni_bufs_head; /* head index for extra bufs */ - uint32_t ni_spare1[5]; + const uint32_t ni_host_tx_rings; /* number of SW tx rings */ + const uint32_t ni_host_rx_rings; /* number of SW rx rings */ + uint32_t ni_spare1[3]; /* * The following array contains the offset of each netmap ring * from this structure, in the following order: - * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; - * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. + * - NIC tx rings (ni_tx_rings); + * - host tx rings (ni_host_tx_rings); + * - NIC rx rings (ni_rx_rings); + * - host rx ring (ni_host_rx_rings); * - * The area is filled up by the kernel on NIOCREGIF, + * The area is filled up by the kernel on NETMAP_REQ_REGISTER, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; @@ -422,7 +422,8 @@ struct netmap_if { * The request body (struct nmreq_register) has several arguments to * specify how the port is to be registered. * - * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) + * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings, + * nr_host_tx_rings, nr_host_rx_rings (in/out) * On input, non-zero values may be used to reconfigure the port * according to the requested values, but this is not guaranteed. * On output the actual values in use are reported. @@ -574,6 +575,8 @@ struct nmreq_register { uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ + uint16_t nr_host_tx_rings; /* number of host tx rings */ + uint16_t nr_host_rx_rings; /* number of host rx rings */ uint16_t nr_mem_id; /* id of the memory allocator */ uint16_t nr_ringid; /* ring(s) we care about */ @@ -592,9 +595,9 @@ struct nmreq_register { #define NR_TX_RINGS_ONLY 0x4000 /* Applications set this flag if they are able to deal with virtio-net headers, * that is send/receive frames that start with a virtio-net header. - * If not set, NIOCREGIF will fail with netmap ports that require applications - * to use those headers. If the flag is set, the application can use the - * NETMAP_VNET_HDR_GET command to figure out the header length. */ + * If not set, NETMAP_REQ_REGISTER will fail with netmap ports that require + * applications to use those headers. If the flag is set, the application can + * use the NETMAP_VNET_HDR_GET command to figure out the header length. */ #define NR_ACCEPT_VNET_HDR 0x8000 /* The following two have the same meaning of NETMAP_NO_TX_POLL and * NETMAP_DO_RX_POLL. */ @@ -611,6 +614,7 @@ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ NR_REG_PIPE_MASTER = 5, /* deprecated, use "x{y" port name syntax */ NR_REG_PIPE_SLAVE = 6, /* deprecated, use "x}y" port name syntax */ NR_REG_NULL = 7, + NR_REG_ONE_SW = 8, }; /* A single ioctl number is shared by all the new API command. @@ -622,7 +626,7 @@ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ /* The ioctl commands to sync TX/RX netmap rings. * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, - * whose identity is set in NIOCREGIF through nr_ringid. + * whose identity is set in NETMAP_REQ_REGISTER through nr_ringid. * These are non blocking and take no argument. */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ @@ -640,8 +644,10 @@ struct nmreq_port_info_get { uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ + uint16_t nr_host_tx_rings; /* number of host tx rings */ + uint16_t nr_host_rx_rings; /* number of host rx rings */ uint16_t nr_mem_id; /* memory allocator id (in/out) */ - uint16_t pad1; + uint16_t pad[3]; }; #define NM_BDG_NAME "vale" /* prefix for bridge port name */ diff --git a/sys/net/netmap_legacy.h b/sys/net/netmap_legacy.h index c7b0dffde173..f1dd7d625893 100644 --- a/sys/net/netmap_legacy.h +++ b/sys/net/netmap_legacy.h @@ -99,14 +99,7 @@ * nr_flags is the recommended mode to indicate which rings should * be bound to a file descriptor. Values are NR_REG_* * - * nr_arg1 (in) The number of extra rings to be reserved. - * Especially when allocating a VALE port the system only - * allocates the amount of memory needed for the port. - * If more shared memory rings are desired (e.g. for pipes), - * the first invocation for the same basename/allocator - * should specify a suitable number. Memory cannot be - * extended after the first allocation without closing - * all ports on the same region. + * nr_arg1 (in) Reserved. * * nr_arg2 (in/out) The identity of the memory region used. * On input, 0 means the system decides autonomously, @@ -188,7 +181,7 @@ struct nmreq { #define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */ #define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ #define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ - uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ + uint16_t nr_arg1; /* extra arguments */ #define NETMAP_BDG_HOST 1 /* nr_arg1 value for NETMAP_BDG_ATTACH */ uint16_t nr_arg2; /* id of the memory allocator */ diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h index e9ce9c43e278..0ed785158c09 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -93,6 +93,8 @@ #include /* apple needs sockaddr */ #include /* IFNAMSIZ */ #include +#include /* memset */ +#include /* gettimeofday */ #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) @@ -111,7 +113,8 @@ nifp, (nifp)->ring_ofs[index] ) #define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ - nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] ) + nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + \ + (nifp)->ni_host_tx_rings] ) #define NETMAP_BUF(ring, index) \ ((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size)) @@ -149,27 +152,6 @@ nm_ring_space(struct netmap_ring *ring) return ret; } - -#ifdef NETMAP_WITH_LIBS -/* - * Support for simple I/O libraries. - * Include other system headers required for compiling this. - */ - -#ifndef HAVE_NETMAP_WITH_LIBS -#define HAVE_NETMAP_WITH_LIBS - -#include -#include -#include -#include /* memset */ -#include -#include /* EINVAL */ -#include /* O_RDWR */ -#include /* close() */ -#include -#include - #ifndef ND /* debug macros */ /* debug support */ #define ND(_fmt, ...) do {} while(0) @@ -198,6 +180,53 @@ nm_ring_space(struct netmap_ring *ring) } while (0) #endif +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + */ +static inline void +nm_pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = (const uint64_t *)_src; + uint64_t *dst = (uint64_t *)_dst; + + if (unlikely(l >= 1024 || l % 64)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + +#ifdef NETMAP_WITH_LIBS +/* + * Support for simple I/O libraries. + * Include other system headers required for compiling this. + */ + +#ifndef HAVE_NETMAP_WITH_LIBS +#define HAVE_NETMAP_WITH_LIBS + +#include +#include +#include +#include +#include /* EINVAL */ +#include /* O_RDWR */ +#include /* close() */ +#include +#include + struct nm_pkthdr { /* first part is the same as pcap_pkthdr */ struct timeval ts; uint32_t caplen; @@ -268,33 +297,6 @@ struct nm_desc { #define NETMAP_FD(d) (P2NMD(d)->fd) -/* - * this is a slightly optimized copy routine which rounds - * to multiple of 64 bytes and is often faster than dealing - * with other odd sizes. We assume there is enough room - * in the source and destination buffers. - */ -static inline void -nm_pkt_copy(const void *_src, void *_dst, int l) -{ - const uint64_t *src = (const uint64_t *)_src; - uint64_t *dst = (uint64_t *)_dst; - - if (unlikely(l >= 1024 || l % 64)) { - memcpy(dst, src, l); - return; - } - for (; likely(l > 0); l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } -} /*