diff --git a/lib/Makefile b/lib/Makefile
index 841018406328..3c0f42544032 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -89,6 +89,7 @@ SUBDIR=	${SUBDIR_BOOTSTRAP} \
 	libprocstat \
 	${_libradius} \
 	librpcsvc \
+	librss \
 	librt \
 	${_librtld_db} \
 	libsbuf \
diff --git a/lib/librss/Makefile b/lib/librss/Makefile
new file mode 100644
index 000000000000..384a205d85bd
--- /dev/null
+++ b/lib/librss/Makefile
@@ -0,0 +1,13 @@
+# $FreeBSD$
+
+PACKAGE=	lib${LIB}
+SHLIBDIR?=	/lib
+
+.include <src.opts.mk>
+
+LIB=	rss
+SHLIB_MAJOR=	1
+
+SRCS=librss.c
+
+.include <bsd.lib.mk>
diff --git a/lib/librss/librss.3 b/lib/librss/librss.3
new file mode 100644
index 000000000000..02a50f89eed7
--- /dev/null
+++ b/lib/librss/librss.3
@@ -0,0 +1,155 @@
+.\" $FreeBSD$
+.\"
+.Dd September 29, 2016
+.Dt LIBRSS 3
+.Os
+.Sh NAME
+.Nm librss
+.Nd Provide Receive-side scaling awareness to userland applications
+.Sh LIBRARY
+.Lb librss
+.Sh SYNOPSIS
+.In librss.h
+.Ft struct rss_config *
+.Fn rss_config_get "void"
+.Ft void
+.Fn rss_config_free "struct rss_config *cfg"
+.Ft int
+.Fn rss_config_get_bucket_count "struct rss_config *cfg"
+.Ft int
+.Fn rss_get_bucket_cpuset "struct rss_config *rc" "rss_bucket_type_t btype" "int bucket" "cpuset_t *cs"
+.Ft int
+.Fn rss_set_bucket_rebalance_cb "rss_bucket_rebalance_cb_t *cb" "void *cbdata"
+.Ft int
+.Fn rss_sock_set_bindmulti "int fd" "int af" "int val"
+.Ft int
+.Fn rss_sock_set_rss_bucket "int fd" "int af" "int rss_bucket"
+.Ft int
+.Fn rss_sock_set_recvrss "int fd" "int af" "int val"
+.Sh DESCRIPTION
+The
+.Nm
+library and the functions it provides are used for both fetching
+the system RSS configuration and interacting with RSS aware
+sockets.
+.Pp
+Applications will typically call
+.Fn rss_config_get
+to fetch the current RSS configuration from the system and perform
+initial setup.
+This typically involves spawning worker threads, one per RSS bucket,
+and optionally binding them to the per-bucket CPU set.
+.Pp
+The
+.Vt rss_config
+struct is defined as:
+.Bd -literal
+struct rss_config {
+	int rss_ncpus;
+	int rss_nbuckets;
+	int rss_basecpu;
+	int *rss_bucket_map;
+};
+.Ed
+.Pp
+Applications will typically use the
+.Fn rss_config_get_bucket_count
+function to fetch the number of RSS buckets, create one thread
+per RSS bucket for RSS aware work, then create one RSS aware socket to
+receive UDP datagrams or TCP connections
+in each particular RSS bucket / thread.
+.Pp
+The
+.Fn rss_get_bucket_cpuset
+function fills in the given cpuset for the given
+RSS bucket and bucket type.
+Typically applications will wish to just query for
+.Dv RSS_BUCKET_TYPE_KERNEL_ALL
+unless they wish to set up separate
+worker threads for transmit and receive.
+.Pp
+The
+.Vt rss_bucket_type_t
+enum is defined as:
+.Bd -literal
+typedef enum {
+	RSS_BUCKET_TYPE_NONE = 0,
+	RSS_BUCKET_TYPE_KERNEL_ALL = 1,
+	RSS_BUCKET_TYPE_KERNEL_TX = 2,
+	RSS_BUCKET_TYPE_KERNEL_RX = 3,
+	RSS_BUCKET_TYPE_MAX = 3,
+} rss_bucket_type_t;
+.Ed
+.Pp
+The rebalance callback
+.Vt rss_bucket_rebalance_cb_t
+is defined as:
+.Bd -literal
+typedef void rss_bucket_rebalance_cb_t(void *arg);
+.Ed
+.Pp
+The
+.Fn rss_set_bucket_rebalance_cb
+function sets an optional callback that will be called if the kernel
+rebalances RSS buckets.
+This is intended as a future expansion to rebalance buckets rather than
+reprogram the RSS key, so typically the only work to be performed
+is to rebind worker threads to an updated cpuset.
+.Pp
+Once RSS setup is completed,
+.Fn rss_config_free
+is called to free the RSS configuration structure.
+.Pp
+To make a
+.Xr bind 2
+socket RSS aware, the
+.Fn rss_sock_set_bindmulti
+function is used to enable or disable per-RSS bucket
+behaviour.
+The socket file descriptor, address family and enable flag
+.Fa val
+are passed in.
+.Pp
+If
+.Fa val
+is set to 1, the socket can be placed in an RSS bucket and will only accept
+datagrams (for UDP) or connections (for TCP) that are received for that
+RSS bucket.
+If set to 0, the socket is placed in the default PCB and will see
+datagrams/connections that are not initially consumed by a PCB aware
+socket.
+.Pp
+The
+.Fn rss_sock_set_rss_bucket
+function configures the RSS bucket which a socket belongs in.
+Note that TCP sockets created by
+.Xr accept 2
+will automatically be assigned to the RSS bucket.
+.Pp
+The
+.Fn rss_sock_set_recvrss
+function enables or disables receiving RSS related information
+as socket options in
+.Xr recvmsg 2
+calls.
+.Pp
+When enabled, UDP datagrams will have a message with the
+.Dv IP_RECVFLOWID
+option indicating the 32-bit receive flowid as a uint32_t,
+and the
+.Dv IP_RECVRSSBUCKETID
+option indicating the 32-bit RSS bucket id as a uint32_t.
+.Sh ERRORS
+The functions return either a negative value or NULL, as appropriate, upon error.
+.Sh SEE ALSO
+.Xr PCBGROUP 9
+.Sh HISTORY
+The
+.Nm
+library first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+.An Adrian Chadd Aq Mt adrian@FreeBSD.org
+.Sh BUGS
+There is currently no kernel mechanism to rebalance the RSS bucket to CPU
+mapping, and so the callback mechanism is a no-op.
diff --git a/lib/librss/librss.c b/lib/librss/librss.c
new file mode 100644
index 000000000000..215687e2a331
--- /dev/null
+++ b/lib/librss/librss.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2016 Adrian Chadd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
+ *    Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * NOTE(review): the header names on the #include lines were missing from
+ * the patch text as received.  The list below is reconstructed from the
+ * symbols this file uses (setsockopt, sysctlbyname, cpuset_t/CPU_SET,
+ * warn, errno, stdio/stdlib/string routines, IPPROTO_* and IP_* options);
+ * confirm against the author's tree.
+ */
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/cpuset.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <err.h>
+#include <errno.h>
+
+#include <netinet/in.h>
+
+#include "librss.h"
+
+/*
+ * Set the BINDMULTI socket option on fd to val.
+ *
+ * af selects the option: AF_INET uses IPPROTO_IP/IP_BINDMULTI and
+ * AF_INET6 uses IPPROTO_IPV6/IPV6_BINDMULTI.  Any other address family
+ * is rejected with errno set to EAFNOSUPPORT, matching the behaviour of
+ * the other rss_sock_set_*() routines instead of silently treating it
+ * as IPv6.
+ *
+ * Returns 0 on success and -1 on failure; a warning is printed when
+ * setsockopt() itself fails.
+ */
+int
+rss_sock_set_bindmulti(int fd, int af, int val)
+{
+	int level, optname;
+
+	switch (af) {
+	case AF_INET:
+		level = IPPROTO_IP;
+		optname = IP_BINDMULTI;
+		break;
+	case AF_INET6:
+		level = IPPROTO_IPV6;
+		optname = IPV6_BINDMULTI;
+		break;
+	default:
+		errno = EAFNOSUPPORT;
+		return (-1);
+	}
+
+	if (setsockopt(fd, level, optname, &val, sizeof(val)) < 0) {
+		warn("%s: setsockopt(IP_BINDMULTI)", __func__);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Set the RSS listen bucket of fd to rss_bucket.
+ *
+ * af must be AF_INET (IP_RSS_LISTEN_BUCKET) or AF_INET6
+ * (IPV6_RSS_LISTEN_BUCKET); any other value fails with errno set to
+ * EAFNOSUPPORT.
+ *
+ * Returns 0 on success and -1 on failure; a warning is printed when
+ * setsockopt() itself fails.
+ */
+int
+rss_sock_set_rss_bucket(int fd, int af, int rss_bucket)
+{
+	int level, optname;
+
+	switch (af) {
+	case AF_INET:
+		level = IPPROTO_IP;
+		optname = IP_RSS_LISTEN_BUCKET;
+		break;
+	case AF_INET6:
+		level = IPPROTO_IPV6;
+		optname = IPV6_RSS_LISTEN_BUCKET;
+		break;
+	default:
+		errno = EAFNOSUPPORT;
+		return (-1);
+	}
+
+	if (setsockopt(fd, level, optname, &rss_bucket,
+	    sizeof(rss_bucket)) < 0) {
+		warn("%s: setsockopt(IP_RSS_LISTEN_BUCKET)", __func__);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Set both the RECVFLOWID and RECVRSSBUCKETID socket options on fd to
+ * val.
+ *
+ * af must be AF_INET or AF_INET6; any other value fails with errno set
+ * to EAFNOSUPPORT.  The two options are set in order and the function
+ * stops at the first failure, so on error the flowid option may already
+ * have been changed.
+ *
+ * Returns 0 on success and -1 on failure; a warning is printed when
+ * setsockopt() itself fails.
+ */
+int
+rss_sock_set_recvrss(int fd, int af, int val)
+{
+	int level, flowid_opt, bucketid_opt;
+
+	switch (af) {
+	case AF_INET:
+		level = IPPROTO_IP;
+		flowid_opt = IP_RECVFLOWID;
+		bucketid_opt = IP_RECVRSSBUCKETID;
+		break;
+	case AF_INET6:
+		level = IPPROTO_IPV6;
+		flowid_opt = IPV6_RECVFLOWID;
+		bucketid_opt = IPV6_RECVRSSBUCKETID;
+		break;
+	default:
+		errno = EAFNOSUPPORT;
+		return (-1);
+	}
+
+	/* Enable/disable flowid */
+	if (setsockopt(fd, level, flowid_opt, &val, sizeof(val)) < 0) {
+		warn("%s: setsockopt(IP_RECVFLOWID)", __func__);
+		return (-1);
+	}
+
+	/* Enable/disable RSS bucket reception */
+	if (setsockopt(fd, level, bucketid_opt, &val, sizeof(val)) < 0) {
+		warn("%s: setsockopt(IP_RECVRSSBUCKETID)", __func__);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Read the integer sysctl named s.
+ *
+ * Returns the value, or -1 (after printing a warning) if the sysctl
+ * cannot be read or does not return exactly sizeof(int) bytes.  Callers
+ * in this file treat any negative result as failure.
+ */
+static int
+rss_getsysctlint(const char *s)
+{
+	int val;
+	size_t rlen;
+
+	rlen = sizeof(val);
+	if (sysctlbyname(s, &val, &rlen, NULL, 0) < 0) {
+		warn("sysctlbyname (%s)", s);
+		return (-1);
+	}
+	/* A short read would leave val partly uninitialised. */
+	if (rlen != sizeof(val)) {
+		warnx("sysctlbyname (%s): unexpected length %zu", s, rlen);
+		return (-1);
+	}
+
+	return (val);
+}
+
+/*
+ * Fill bucket_map[0..nbuckets-1] from the net.inet.rss.bucket_mapping
+ * sysctl, which is parsed as space-separated "bucket:cpu" pairs.
+ *
+ * Every bucket index must lie in [0, nbuckets) and every CPU id in
+ * [0, CPU_SETSIZE); the latter is required because the value is later
+ * handed to CPU_SET() by rss_get_bucket_cpuset().
+ *
+ * Returns 0 on success, -1 on any read, parse or range error.
+ */
+static int
+rss_getbucketmap(int *bucket_map, int nbuckets)
+{
+	/* XXX I'm lazy; so static string it is */
+	char bstr[2048];
+	size_t rlen;
+	char *s, *ss;
+	int b, c;
+
+	/*
+	 * Zero the buffer and offer the kernel one byte less than its
+	 * size, so the result is always NUL-terminated.
+	 */
+	memset(bstr, '\0', sizeof(bstr));
+	rlen = sizeof(bstr) - 1;
+	if (sysctlbyname("net.inet.rss.bucket_mapping", bstr, &rlen,
+	    NULL, 0) < 0) {
+		warn("sysctlbyname (net.inet.rss.bucket_mapping)");
+		return (-1);
+	}
+
+	ss = bstr;
+	while ((s = strsep(&ss, " ")) != NULL) {
+		if (sscanf(s, "%d:%d", &b, &c) != 2) {
+			fprintf(stderr, "%s: string (%s) not parsable\n",
+			    __func__,
+			    s);
+			return (-1);
+		}
+		/* bucket_map has exactly nbuckets entries. */
+		if (b < 0 || b >= nbuckets) {
+			fprintf(stderr,
+			    "%s: bucket %d out of range (nbuckets %d)\n",
+			    __func__,
+			    b,
+			    nbuckets);
+			return (-1);
+		}
+		if (c < 0 || c >= CPU_SETSIZE) {
+			fprintf(stderr,
+			    "%s: bucket %d: cpu %d out of range\n",
+			    __func__,
+			    b,
+			    c);
+			return (-1);
+		}
+		bucket_map[b] = c;
+	}
+	return (0);
+}
+
+/*
+ * Allocate an rss_config and populate it from the net.inet.rss.ncpus,
+ * net.inet.rss.buckets, net.inet.rss.basecpu and
+ * net.inet.rss.bucket_mapping sysctls.
+ *
+ * Returns the new configuration, which the caller releases with
+ * rss_config_free(), or NULL on failure (a diagnostic is printed and
+ * nothing is leaked).
+ */
+struct rss_config *
+rss_config_get(void)
+{
+	struct rss_config *rc;
+
+	rc = calloc(1, sizeof(*rc));
+	if (rc == NULL) {
+		warn("%s: calloc", __func__);
+		return (NULL);
+	}
+
+	rc->rss_ncpus = rss_getsysctlint("net.inet.rss.ncpus");
+	if (rc->rss_ncpus < 0) {
+		fprintf(stderr, "%s: couldn't fetch net.inet.rss.ncpus\n",
+		    __func__);
+		goto error;
+	}
+
+	rc->rss_nbuckets = rss_getsysctlint("net.inet.rss.buckets");
+	if (rc->rss_nbuckets < 0) {
+		fprintf(stderr, "%s: couldn't fetch net.inet.rss.buckets\n",
+		    __func__);
+		goto error;
+	}
+
+	rc->rss_basecpu = rss_getsysctlint("net.inet.rss.basecpu");
+	if (rc->rss_basecpu < 0) {
+		fprintf(stderr, "%s: couldn't fetch net.inet.rss.basecpu\n",
+		    __func__);
+		goto error;
+	}
+
+	rc->rss_bucket_map = calloc(rc->rss_nbuckets,
+	    sizeof(*rc->rss_bucket_map));
+	if (rc->rss_bucket_map == NULL) {
+		warn("%s: calloc (rss buckets; %d entries)", __func__,
+		    rc->rss_nbuckets);
+		goto error;
+	}
+
+	if (rss_getbucketmap(rc->rss_bucket_map, rc->rss_nbuckets) != 0) {
+		fprintf(stderr, "%s: rss_getbucketmap failed\n", __func__);
+		goto error;
+	}
+
+	return (rc);
+
+error:
+	/* rc came from calloc(), so an unset rss_bucket_map is NULL. */
+	rss_config_free(rc);
+	return (NULL);
+}
+
+/*
+ * Release a configuration returned by rss_config_get(), including its
+ * bucket map.  Passing NULL is a no-op.
+ */
+void
+rss_config_free(struct rss_config *rc)
+{
+
+	if (rc == NULL)
+		return;
+	free(rc->rss_bucket_map);
+	free(rc);
+}
+
+/*
+ * Return the number of RSS buckets recorded in rc, or -1 if rc is NULL.
+ */
+int
+rss_config_get_bucket_count(struct rss_config *rc)
+{
+
+	if (rc == NULL)
+		return (-1);
+	return (rc->rss_nbuckets);
+}
+
+/*
+ * Set *cs to the single CPU that rc maps the given bucket to.
+ *
+ * Returns 0 on success.  Returns -1 with errno set to EINVAL if rc or
+ * cs is NULL or bucket is outside [0, rc->rss_nbuckets), and -1 with
+ * errno set to ENOTSUP if btype is not one of the RSS_BUCKET_TYPE_KERNEL_*
+ * values.  *cs is left untouched on failure.
+ */
+int
+rss_get_bucket_cpuset(struct rss_config *rc, rss_bucket_type_t btype,
+    int bucket, cpuset_t *cs)
+{
+
+	if (rc == NULL || cs == NULL) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	if (bucket < 0 || bucket >= rc->rss_nbuckets) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/*
+	 * For now all buckets are the same, but eventually we'll want
+	 * to allow administrators to set separate RSS cpusets for
+	 * {kernel,user} {tx, rx} combinations.
+	 */
+	if (btype <= RSS_BUCKET_TYPE_NONE || btype > RSS_BUCKET_TYPE_MAX) {
+		errno = ENOTSUP;
+		return (-1);
+	}
+
+	CPU_ZERO(cs);
+	CPU_SET(rc->rss_bucket_map[bucket], cs);
+
+	return (0);
+}
+
+/*
+ * Register a bucket rebalance callback.
+ *
+ * Both arguments are currently ignored and the function always
+ * returns 0.
+ */
+int
+rss_set_bucket_rebalance_cb(rss_bucket_rebalance_cb_t *cb, void *cbdata)
+{
+
+	(void) cb;
+	(void) cbdata;
+
+	/*
+	 * For now there's no rebalance callback, so
+	 * just return 0 and ignore it.
+	 */
+	return (0);
+}
diff --git a/lib/librss/librss.h b/lib/librss/librss.h
new file mode 100644
index 000000000000..d0d03df9d5ea
--- /dev/null
+++ b/lib/librss/librss.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016 Adrian Chadd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __LIBRSS_H__
+#define __LIBRSS_H__
+
+/*
+ * cpuset_t appears in the rss_get_bucket_cpuset() prototype below, so
+ * pull in its declaration here rather than relying on every includer
+ * to have done so first.
+ */
+#include <sys/param.h>
+#include <sys/cpuset.h>
+
+/*
+ * Snapshot of the system RSS configuration, as filled in by
+ * rss_config_get().
+ */
+struct rss_config {
+	int rss_ncpus;		/* value of net.inet.rss.ncpus */
+	int rss_nbuckets;	/* value of net.inet.rss.buckets */
+	int rss_basecpu;	/* value of net.inet.rss.basecpu */
+	int *rss_bucket_map;	/* rss_nbuckets entries: bucket -> CPU id */
+};
+
+/*
+ * Which kind of per-bucket CPU set to query.  rss_get_bucket_cpuset()
+ * accepts the KERNEL_* values and currently answers all of them
+ * identically.
+ */
+typedef enum {
+	RSS_BUCKET_TYPE_NONE = 0,
+	RSS_BUCKET_TYPE_KERNEL_ALL = 1,
+	RSS_BUCKET_TYPE_KERNEL_TX = 2,
+	RSS_BUCKET_TYPE_KERNEL_RX = 3,
+	RSS_BUCKET_TYPE_MAX = 3,
+} rss_bucket_type_t;
+
+/* Signature of the bucket rebalance callback. */
+typedef void rss_bucket_rebalance_cb_t(void *arg);
+
+/*
+ * Enable/disable whether to allow for multiple bind()s to the
+ * given PCB entry.
+ *
+ * This must be done before bind().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+extern int rss_sock_set_bindmulti(int fd, int af, int val);
+
+/*
+ * Set the RSS bucket for the given file descriptor.
+ *
+ * This must be done before bind().
+ *
+ * Returns 0 on success, -1 on failure (including an address family
+ * other than AF_INET or AF_INET6).
+ */
+extern int rss_sock_set_rss_bucket(int fd, int af, int rss_bucket);
+
+/*
+ * Enable or disable receiving RSS/flowid information on
+ * received UDP frames.
+ *
+ * Returns 0 on success, -1 on failure (including an address family
+ * other than AF_INET or AF_INET6).
+ */
+extern int rss_sock_set_recvrss(int fd, int af, int val);
+
+/*
+ * Fetch RSS configuration information.
+ *
+ * Returns a newly allocated configuration that must be released with
+ * rss_config_free(), or NULL on failure.
+ */
+extern struct rss_config * rss_config_get(void);
+
+/*
+ * Free an RSS configuration structure.  NULL is accepted.
+ */
+extern void rss_config_free(struct rss_config *rc);
+
+/*
+ * Return how many RSS buckets there are, or -1 if rc is NULL.
+ */
+extern int rss_config_get_bucket_count(struct rss_config *rc);
+
+/*
+ * Fetch the cpuset configuration for the given RSS bucket and
+ * type.
+ *
+ * Returns 0 on success.  Returns -1 with errno set to EINVAL for an
+ * out-of-range bucket, or ENOTSUP for an unsupported bucket type.
+ */
+extern int rss_get_bucket_cpuset(struct rss_config *rc,
+    rss_bucket_type_t btype, int bucket, cpuset_t *cs);
+
+/*
+ * Set a callback for bucket rebalancing.
+ *
+ * This will occur in a separate thread context rather than
+ * a signal handler.
+ *
+ * The current implementation ignores its arguments and returns 0.
+ */
+extern int rss_set_bucket_rebalance_cb(rss_bucket_rebalance_cb_t *cb,
+    void *cbdata);
+
+#endif /* __LIBRSS_H__ */