Implement kernel support for hardware rate limited sockets.

- Add RATELIMIT kernel configuration keyword which must be set to
enable the new functionality.

- Add support for hardware driven, Receive Side Scaling, RSS aware, rate
limited sendqueues and expose the functionality through the already
established SO_MAX_PACING_RATE setsockopt(). The API supports rates in
the range from 1 byte/s to 4 Gbytes/s, which is suitable for regular TCP
and UDP streams. The setsockopt(2) manual page has been updated.

- Add rate limit function callback API to "struct ifnet" which supports
the following operations: if_snd_tag_alloc(), if_snd_tag_modify(),
if_snd_tag_query() and if_snd_tag_free().

- Add support to ifconfig to view, set and clear the IFCAP_TXRTLMT
flag, which tells if a network driver supports rate limiting or not.

- This patch also adds support for rate limiting through VLAN and LAGG
intermediate network devices.

- How rate limiting works:

1) The userspace application calls setsockopt() after accepting or
making a new connection to set the rate which is then stored in the
socket structure in the kernel. Later on when packets are transmitted
a check is made in the transmit path for rate changes. A rate change
implies a non-blocking ifp->if_snd_tag_alloc() call will be made to the
destination network interface, which then sets up a custom sendqueue
with the given rate limitation parameter. A "struct m_snd_tag" pointer is
returned which serves as a "snd_tag" hint in the m_pkthdr for the
subsequently transmitted mbufs.

2) When the network driver sees the "m->m_pkthdr.snd_tag" different
from NULL, it will move the packets into a designated rate limited sendqueue
given by the snd_tag pointer. It is up to the individual drivers how the rate
limited traffic will be rate limited.

3) Route changes are detected by the NIC drivers in the ifp->if_transmit()
routine when the ifnet pointer in the incoming snd_tag mismatches the
one of the network interface. The network adapter frees the mbuf and
returns EAGAIN, which causes ip_output() to release and clear the send
tag. On the next ip_output() call, allocation of a new "snd_tag" is attempted.

4) When the PCB is detached the custom sendqueue will be released by a
non-blocking ifp->if_snd_tag_free() call to the currently bound network
interface.

Reviewed by:		wblock (manpages), adrian, gallatin, scottl (network)
Differential Revision:	https://reviews.freebsd.org/D3687
Sponsored by:		Mellanox Technologies
MFC after:		3 months
This commit is contained in:
Hans Petter Selasky 2017-01-18 13:31:17 +00:00
parent ae69172343
commit f3e7afe2d7
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=312379
24 changed files with 606 additions and 13 deletions

View File

@ -28,7 +28,7 @@
.\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95
.\" $FreeBSD$
.\"
.Dd April 5, 2013
.Dd January 18, 2017
.Dt GETSOCKOPT 2
.Os
.Sh NAME
@ -188,6 +188,7 @@ The following options are recognized in
.It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)"
.It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)"
.It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by SO_TIMESTAMP"
.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket"
.El
.Pp
.Dv SO_DEBUG
@ -515,6 +516,10 @@ returns the maximal number of queued connections, as set by
returns the number of unaccepted complete connections.
.Dv SO_LISTENINCQLEN
returns the number of unaccepted incomplete connections.
.Pp
.Dv SO_MAX_PACING_RATE
instructs the socket and underlying network adapter layers to limit the
transfer rate to the given unsigned 32-bit value in bytes per second.
.Sh RETURN VALUES
.Rv -std
.Sh ERRORS

View File

@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
.Dd September 17, 2016
.Dd January 18, 2017
.Dt IFCONFIG 8
.Os
.Sh NAME
@ -460,6 +460,8 @@ this directive is used to select between 802.11a
and 802.11g
.Pq Cm 11g
operating modes.
.It Cm txrtlmt
Set if the driver supports TX rate limiting.
.It Cm inst Ar minst , Cm instance Ar minst
Set the media instance to
.Ar minst .

View File

@ -1145,7 +1145,7 @@ unsetifdescr(const char *val, int value, int s, const struct afswtch *afp)
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
"\26RXCSUM_IPV6\27TXCSUM_IPV6"
"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
/*
* Print the status of the interface. If an address family was
@ -1453,6 +1453,8 @@ static struct cmd basic_cmds[] = {
DEF_CMD("-wol_mcast", -IFCAP_WOL_MCAST, setifcap),
DEF_CMD("wol_magic", IFCAP_WOL_MAGIC, setifcap),
DEF_CMD("-wol_magic", -IFCAP_WOL_MAGIC, setifcap),
DEF_CMD("txrtlmt", IFCAP_TXRTLMT, setifcap),
DEF_CMD("-txrtlmt", -IFCAP_TXRTLMT, setifcap),
DEF_CMD("normal", -IFF_LINK0, setifflags),
DEF_CMD("compress", IFF_LINK0, setifflags),
DEF_CMD("noicmp", IFF_LINK1, setifflags),

View File

@ -619,6 +619,8 @@ options HWPMC_HOOKS # Other necessary kernel hooks
options INET #Internet communications protocols
options INET6 #IPv6 communications protocols
options RATELIMIT # TX rate limiting support
options ROUTETABLES=2 # allocated fibs up to 65536. default is 1.
# but that would be a bad idea as they are large.

View File

@ -19,6 +19,10 @@ opt_inet.h:
opt_inet6.h:
@echo "#define INET6 1" > ${.TARGET}
.endif
.if ${MK_RATELIMIT} != "no"
opt_ratelimit.h:
@echo "#define RATELIMIT 1" > ${.TARGET}
.endif
.if ${MK_EISA} != "no"
opt_eisa.h:
@echo "#define DEV_EISA 1" > ${.TARGET}

View File

@ -48,6 +48,7 @@ __DEFAULT_NO_OPTIONS = \
EXTRA_TCP_STACKS \
NAND \
OFED \
RATELIMIT \
REPRODUCIBLE_BUILD
# Some options are totally broken on some architectures. We disable

View File

@ -412,6 +412,7 @@ BOOTP_NFSV3 opt_bootp.h
BOOTP_WIRED_TO opt_bootp.h
DEVICE_POLLING
DUMMYNET opt_ipdn.h
RATELIMIT opt_ratelimit.h
INET opt_inet.h
INET6 opt_inet6.h
IPDIVERT

View File

@ -2699,6 +2699,14 @@ sosetopt(struct socket *so, struct sockopt *sopt)
so->so_ts_clock = optval;
break;
case SO_MAX_PACING_RATE:
error = sooptcopyin(sopt, &val32, sizeof(val32),
sizeof(val32));
if (error)
goto bad;
so->so_max_pacing_rate = val32;
break;
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
@ -2890,6 +2898,10 @@ sogetopt(struct socket *so, struct sockopt *sopt)
optval = so->so_ts_clock;
goto integer;
case SO_MAX_PACING_RATE:
optval = so->so_max_pacing_rate;
goto integer;
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,

View File

@ -2,6 +2,6 @@
.PATH: ${.CURDIR}/../../net
KMOD= if_lagg
SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h
SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h opt_ratelimit.h
.include <bsd.kmod.mk>

View File

@ -4,6 +4,6 @@
KMOD= if_vlan
SRCS= if_vlan.c
SRCS+= opt_inet.h opt_vlan.h
SRCS+= opt_inet.h opt_vlan.h opt_ratelimit.h
.include <bsd.kmod.mk>

View File

@ -30,6 +30,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/eventhandler.h>
@ -853,6 +855,35 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m)
return (lp->lp_lagg);
}
#ifdef RATELIMIT
/*
 * Select the LACP transmit port for a flow from its flow ID alone,
 * without needing an mbuf.
 *
 * Returns NULL while distribution is suppressed or when the active
 * port map is empty; otherwise returns the lagg port that the shifted
 * flow ID maps to in the currently active port map.
 */
struct lagg_port *
lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid)
{
	struct lacp_softc *lsc = LACP_SOFTC(sc);
	struct lacp_portmap *pm;
	uint32_t idx;

	if (__predict_false(lsc->lsc_suppress_distributing)) {
		LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
		return (NULL);
	}

	pm = &lsc->lsc_pmap[lsc->lsc_activemap];
	if (pm->pm_count == 0) {
		LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
		return (NULL);
	}

	/* Reduce the shifted flow ID to an index into the port map. */
	idx = (flowid >> sc->flowid_shift) % pm->pm_count;

	return (pm->pm_map[idx]->lp_lagg);
}
#endif
/*
* lacp_suppress_distributing: drop transmit packets for a while
* to preserve packet ordering.

View File

@ -284,6 +284,9 @@ struct lacp_softc {
struct mbuf *lacp_input(struct lagg_port *, struct mbuf *);
struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
#ifdef RATELIMIT
struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t);
#endif
void lacp_attach(struct lagg_softc *);
void lacp_detach(void *);
void lacp_init(struct lagg_softc *);

View File

@ -239,6 +239,7 @@ struct if_data {
#define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */
#define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)

View File

@ -100,6 +100,30 @@ ifdead_get_counter(struct ifnet *ifp, ift_counter cnt)
return (0);
}
/*
 * Send tag allocation stub installed by if_dead(): always refuses the
 * request with EOPNOTSUPP and never touches its arguments.
 */
static int
ifdead_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{

	return (EOPNOTSUPP);
}
}
/*
 * Send tag modify stub installed by if_dead(): always fails with
 * EOPNOTSUPP and never touches its arguments.
 */
static int
ifdead_snd_tag_modify(struct m_snd_tag *pmt,
    union if_snd_tag_modify_params *params)
{

	return (EOPNOTSUPP);
}
/*
 * Send tag query stub installed by if_dead(): always fails with
 * EOPNOTSUPP and never touches its arguments.
 */
static int
ifdead_snd_tag_query(struct m_snd_tag *pmt,
    union if_snd_tag_query_params *params)
{

	return (EOPNOTSUPP);
}
/*
 * Send tag free stub installed by if_dead(): intentionally does
 * nothing, so callers may invoke it unconditionally.
 */
static void
ifdead_snd_tag_free(struct m_snd_tag *pmt)
{

	/* Nothing to release. */
}
void
if_dead(struct ifnet *ifp)
{
@ -112,4 +136,8 @@ if_dead(struct ifnet *ifp)
ifp->if_qflush = ifdead_qflush;
ifp->if_transmit = ifdead_transmit;
ifp->if_get_counter = ifdead_get_counter;
ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
ifp->if_snd_tag_query = ifdead_snd_tag_query;
ifp->if_snd_tag_free = ifdead_snd_tag_free;
}

View File

@ -23,6 +23,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
@ -118,6 +119,11 @@ static void lagg_port2req(struct lagg_port *, struct lagg_reqport *);
static void lagg_init(void *);
static void lagg_stop(struct lagg_softc *);
static int lagg_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef RATELIMIT
static int lagg_snd_tag_alloc(struct ifnet *,
union if_snd_tag_alloc_params *,
struct m_snd_tag **);
#endif
static int lagg_ether_setmulti(struct lagg_softc *);
static int lagg_ether_cmdmulti(struct lagg_port *, int);
static int lagg_setflag(struct lagg_port *, int, int,
@ -503,7 +509,12 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
ifp->if_ioctl = lagg_ioctl;
ifp->if_get_counter = lagg_get_counter;
ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
#ifdef RATELIMIT
ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT;
#else
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
#endif
/*
* Attach as an ordinary ethernet device, children will be attached
@ -1549,6 +1560,52 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
return (error);
}
#ifdef RATELIMIT
/*
 * Forward a send tag allocation request to the lagg member port that
 * would carry the given flow.
 *
 * The member port is chosen according to the aggregation protocol:
 *  - FAILOVER:    the active link, starting from the primary port.
 *  - LOADBALANCE: the port indexed by the shifted flow ID.
 *  - LACP:        the port chosen by lacp_select_tx_port_by_hash().
 * The hash based protocols require LAGG_OPT_USE_FLOWID and a valid
 * flow type, since there is no mbuf to hash here.
 *
 * Returns EOPNOTSUPP when no suitable port can be found, when the
 * chosen port has no if_snd_tag_alloc() method or does not have
 * IFCAP_TXRTLMT enabled; otherwise returns whatever the member port's
 * if_snd_tag_alloc() returns.
 *
 * NOTE(review): no lagg lock is taken here; confirm that callers
 * serialize against port add/remove.
 */
static int
lagg_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;
	struct lagg_lb *lb;
	uint32_t p;

	switch (sc->sc_proto) {
	case LAGG_PROTO_FAILOVER:
		lp = lagg_link_active(sc, sc->sc_primary);
		break;
	case LAGG_PROTO_LOADBALANCE:
		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
		    params->hdr.flowtype == M_HASHTYPE_NONE)
			return (EOPNOTSUPP);
		/*
		 * A lagg without member ports has nothing to forward
		 * to; bail out before the modulo below divides by zero.
		 */
		if (sc->sc_count == 0)
			return (EOPNOTSUPP);
		p = params->hdr.flowid >> sc->flowid_shift;
		p %= sc->sc_count;
		lb = (struct lagg_lb *)sc->sc_psc;
		lp = lb->lb_ports[p];
		lp = lagg_link_active(sc, lp);
		break;
	case LAGG_PROTO_LACP:
		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
		    params->hdr.flowtype == M_HASHTYPE_NONE)
			return (EOPNOTSUPP);
		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
		break;
	default:
		return (EOPNOTSUPP);
	}
	if (lp == NULL)
		return (EOPNOTSUPP);

	/* From here on "ifp" refers to the selected member port. */
	ifp = lp->lp_ifp;
	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
		return (EOPNOTSUPP);

	/* forward allocation request */
	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
}
#endif
static int
lagg_ether_setmulti(struct lagg_softc *sc)
{

View File

@ -175,6 +175,49 @@ struct if_encap_req {
#define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */
/*
* Network interface send tag support. The storage of "struct
* m_snd_tag" comes from the network driver and it is free to allocate
* as much additional space as it wants for its own use.
*/
struct m_snd_tag;
/* Send tag types, stored in if_snd_tag_alloc_header.type. */
#define IF_SND_TAG_TYPE_RATE_LIMIT 0
/* NOTE(review): presumably one past the last valid type -- confirm. */
#define IF_SND_TAG_TYPE_MAX 1
/* Common leading part of every send tag allocation request. */
struct if_snd_tag_alloc_header {
uint32_t type; /* send tag type, see IF_SND_TAG_XXX */
uint32_t flowid; /* mbuf hash value */
uint32_t flowtype; /* mbuf hash type */
};
/* Allocation request for a rate limit send tag. */
struct if_snd_tag_alloc_rate_limit {
struct if_snd_tag_alloc_header hdr;
uint64_t max_rate; /* in bytes/s */
};
/* Rate limit value carried by the modify and query requests. */
struct if_snd_tag_rate_limit_params {
uint64_t max_rate; /* in bytes/s */
};
/*
 * Argument of if_snd_tag_alloc(). "hdr" is also the first member of
 * "rate_limit", so the header fields can be read through either.
 */
union if_snd_tag_alloc_params {
struct if_snd_tag_alloc_header hdr;
struct if_snd_tag_alloc_rate_limit rate_limit;
};
/* Argument of if_snd_tag_modify(). */
union if_snd_tag_modify_params {
struct if_snd_tag_rate_limit_params rate_limit;
};
/* Argument of if_snd_tag_query(). */
union if_snd_tag_query_params {
struct if_snd_tag_rate_limit_params rate_limit;
};
/*
 * Send tag method signatures. The alloc method takes the interface,
 * the request and a location for the resulting tag pointer; the
 * others take an existing tag. All but the free method return an int
 * status.
 */
typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
struct m_snd_tag **);
typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
/*
* Structure defining a network interface.
@ -303,13 +346,20 @@ struct ifnet {
u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */
u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */
/*
* Network adapter send tag support:
*/
if_snd_tag_alloc_t *if_snd_tag_alloc;
if_snd_tag_modify_t *if_snd_tag_modify;
if_snd_tag_query_t *if_snd_tag_query;
if_snd_tag_free_t *if_snd_tag_free;
/*
* Spare fields to be added before branching a stable branch, so
* that structure can be enhanced without changing the kernel
* binary interface.
*/
void *if_pspare[4]; /* packet pacing / general use */
int if_ispare[4]; /* packet pacing / general use */
int if_ispare[4]; /* general use */
};
/* for compatibility with other BSDs */

View File

@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_vlan.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/eventhandler.h>
@ -212,6 +213,10 @@ static void trunk_destroy(struct ifvlantrunk *trunk);
static void vlan_init(void *foo);
static void vlan_input(struct ifnet *ifp, struct mbuf *m);
static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
#ifdef RATELIMIT
static int vlan_snd_tag_alloc(struct ifnet *,
union if_snd_tag_alloc_params *, struct m_snd_tag **);
#endif
static void vlan_qflush(struct ifnet *ifp);
static int vlan_setflag(struct ifnet *ifp, int flag, int status,
int (*func)(struct ifnet *, int));
@ -971,6 +976,9 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
ifp->if_transmit = vlan_transmit;
ifp->if_qflush = vlan_qflush;
ifp->if_ioctl = vlan_ioctl;
#ifdef RATELIMIT
ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
#endif
ifp->if_flags = VLAN_IFFLAGS;
ether_ifattach(ifp, eaddr);
/* Now undo some of the damage... */
@ -1591,6 +1599,15 @@ vlan_capabilities(struct ifvlan *ifv)
TOEDEV(ifp) = TOEDEV(p);
ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
}
#ifdef RATELIMIT
/*
* If the parent interface supports ratelimiting, so does the
* VLAN interface.
*/
ifp->if_capabilities |= (p->if_capabilities & IFCAP_TXRTLMT);
ifp->if_capenable |= (p->if_capenable & IFCAP_TXRTLMT);
#endif
}
static void
@ -1801,3 +1818,19 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
return (error);
}
#ifdef RATELIMIT
/*
 * Forward a send tag allocation request from a VLAN interface to its
 * trunk (parent) device.
 *
 * Returns EOPNOTSUPP when there is no trunk device, when the trunk
 * device has no if_snd_tag_alloc() method or does not have
 * IFCAP_TXRTLMT enabled; otherwise returns whatever the trunk
 * device's if_snd_tag_alloc() returns.
 */
static int
vlan_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{

	/* get trunk device */
	ifp = vlan_trunkdev(ifp);

	/*
	 * Check the method pointer as well as the capability bit, as
	 * the other send tag allocation paths do, so that a parent
	 * without the method is never called through a NULL pointer.
	 */
	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
		return (EOPNOTSUPP);

	/* forward allocation request */
	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
}
#endif

View File

@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_pcbgroup.h"
#include "opt_rss.h"
@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
@ -1140,6 +1142,10 @@ in_pcbdetach(struct inpcb *inp)
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
#ifdef RATELIMIT
if (inp->inp_snd_tag != NULL)
in_pcbdetach_txrtlmt(inp);
#endif
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
@ -2677,3 +2683,253 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb)
db_print_inpcb(inp, "inpcb", 0);
}
#endif /* DDB */
#ifdef RATELIMIT
/*
* Modify TX rate limit based on the existing "inp->inp_snd_tag",
* if any.
*/
int
in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
{
	union if_snd_tag_modify_params params = {
		.rate_limit.max_rate = max_pacing_rate,
	};
	struct m_snd_tag *tag;
	struct ifnet *tag_ifp;

	/* There must be a send tag bound to an interface. */
	tag = inp->inp_snd_tag;
	if (tag == NULL)
		return (EINVAL);

	tag_ifp = tag->ifp;
	if (tag_ifp == NULL)
		return (EINVAL);

	/* The owning interface must implement the modify method. */
	if (tag_ifp->if_snd_tag_modify == NULL)
		return (EOPNOTSUPP);

	return (tag_ifp->if_snd_tag_modify(tag, &params));
}
/*
* Query existing TX rate limit based on the existing
* "inp->inp_snd_tag", if any.
*/
int
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *tag;
	struct ifnet *tag_ifp;
	int error;

	/* There must be a send tag bound to an interface. */
	tag = inp->inp_snd_tag;
	if (tag == NULL)
		return (EINVAL);

	tag_ifp = tag->ifp;
	if (tag_ifp == NULL)
		return (EINVAL);

	/* The owning interface must implement the query method. */
	if (tag_ifp->if_snd_tag_query == NULL)
		return (EOPNOTSUPP);

	error = tag_ifp->if_snd_tag_query(tag, &params);

	/* Only hand back the rate on success, and only if requested. */
	if (error == 0 && p_max_pacing_rate != NULL)
		*p_max_pacing_rate = params.rate_limit.max_rate;

	return (error);
}
/*
* Allocate a new TX rate limit send tag from the network interface
* given by the "ifp" argument and save it in "inp->inp_snd_tag":
*/
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
{
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = max_pacing_rate,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/* Refuse to overwrite an already attached send tag. */
	if (inp->inp_snd_tag != NULL)
		return (EINVAL);

	if (ifp->if_snd_tag_alloc == NULL)
		return (EOPNOTSUPP);

	error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
	if (error != 0)
		return (error);

	/*
	 * The tag is attached; take a reference on the network
	 * interface recorded in the tag.
	 */
	if_ref(inp->inp_snd_tag->ifp);

	return (0);
}
/*
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
* if any:
*/
void
in_pcbdetach_txrtlmt(struct inpcb *inp)
{
	struct m_snd_tag *tag;
	struct ifnet *tag_ifp;

	INP_WLOCK_ASSERT(inp);

	/* Unhook the tag from the PCB before releasing it. */
	tag = inp->inp_snd_tag;
	inp->inp_snd_tag = NULL;

	if (tag == NULL || (tag_ifp = tag->ifp) == NULL)
		return;

	/*
	 * The free method is called unconditionally: if the device
	 * was detached while references on the ifp were still held,
	 * it is assumed that if_snd_tag_free() was replaced with a
	 * stub.
	 */
	tag_ifp->if_snd_tag_free(tag);

	/* Drop the reference held on the network interface. */
	if_rele(tag_ifp);
}
/*
* This function should be called when the INP_RATE_LIMIT_CHANGED flag
* is set in the fast path and will attach/detach/modify the TX rate
* limit send tag based on the socket's so_max_pacing_rate value.
*/
void
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
{
struct socket *socket;
uint32_t max_pacing_rate;
bool did_upgrade;
int error;
if (inp == NULL)
return;
socket = inp->inp_socket;
if (socket == NULL)
return;
if (!INP_WLOCKED(inp)) {
/*
* NOTE: If the write locking fails, we need to bail
* out and use the non-ratelimited ring for the
* transmit until there is a new chance to get the
* write lock.
*/
if (!INP_TRY_UPGRADE(inp))
return;
did_upgrade = 1;
} else {
did_upgrade = 0;
}
/*
* NOTE: The so_max_pacing_rate value is read unlocked,
* because atomic updates are not required since the variable
* is checked at every mbuf we send. It is assumed that the
* variable read itself will be atomic.
*/
max_pacing_rate = socket->so_max_pacing_rate;
/*
* NOTE: When attaching to a network interface a reference is
* made to ensure the network interface doesn't go away until
* all ratelimit connections are gone. The network interface
* pointers compared below represent valid network interfaces,
* except when comparing towards NULL.
*/
if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
error = 0;
} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
if (inp->inp_snd_tag != NULL)
in_pcbdetach_txrtlmt(inp);
error = 0;
} else if (inp->inp_snd_tag == NULL) {
/*
* In order to utilize packet pacing with RSS, we need
* to wait until there is a valid RSS hash before we
* can proceed:
*/
if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
error = EAGAIN;
} else {
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
mb->m_pkthdr.flowid, max_pacing_rate);
}
} else {
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
}
if (error == 0 || error == EOPNOTSUPP)
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
if (did_upgrade)
INP_DOWNGRADE(inp);
}
/*
* Track route changes for TX rate limiting.
*/
void
in_pcboutput_eagain(struct inpcb *inp)
{
struct socket *socket;
bool did_upgrade;
if (inp == NULL)
return;
socket = inp->inp_socket;
if (socket == NULL)
return;
if (inp->inp_snd_tag == NULL)
return;
if (!INP_WLOCKED(inp)) {
/*
* NOTE: If the write locking fails, we need to bail
* out and use the non-ratelimited ring for the
* transmit until there is a new chance to get the
* write lock.
*/
if (!INP_TRY_UPGRADE(inp))
return;
did_upgrade = 1;
} else {
did_upgrade = 0;
}
/* detach rate limiting */
in_pcbdetach_txrtlmt(inp);
/* make sure new mbuf send tag allocation is made */
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
if (did_upgrade)
INP_DOWNGRADE(inp);
}
#endif /* RATELIMIT */

View File

@ -181,6 +181,7 @@ struct icmp6_filter;
* read-lock usage during modification, this model can be applied to other
* protocols (especially SCTP).
*/
struct m_snd_tag;
struct inpcb {
LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
@ -202,11 +203,11 @@ struct inpcb {
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
u_int inp_refcount; /* (i) refcount */
void *inp_pspare[5]; /* (x) packet pacing / general use */
struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */
void *inp_pspare[4]; /* (x) general use */
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
u_int inp_ispare[4]; /* (x) packet pacing / user cookie /
* general use */
u_int inp_ispare[4]; /* (x) user cookie / general use */
/* Local and foreign ports, local and foreign addr. */
struct in_conninfo inp_inc; /* (i) list for PCB's local port */
@ -616,6 +617,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
/*
* Flags passed to in_pcblookup*() functions.
@ -736,6 +738,14 @@ int in_getsockaddr(struct socket *so, struct sockaddr **nam);
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
#ifdef RATELIMIT
int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
void in_pcbdetach_txrtlmt(struct inpcb *);
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
void in_pcboutput_eagain(struct inpcb *);
#endif
#endif /* _KERNEL */
#endif /* !_NETINET_IN_PCB_H_ */

View File

@ -33,6 +33,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
@ -661,8 +662,23 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
*/
m_clrprotoflags(m);
IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
#ifdef RATELIMIT
if (inp != NULL) {
if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
in_pcboutput_txrtlmt(inp, ifp, m);
/* stamp send tag on mbuf */
m->m_pkthdr.snd_tag = inp->inp_snd_tag;
} else {
m->m_pkthdr.snd_tag = NULL;
}
#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
#ifdef RATELIMIT
/* check for route change */
if (error == EAGAIN)
in_pcboutput_eagain(inp);
#endif
goto done;
}
@ -698,8 +714,23 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
mtod(m, struct ip *), NULL);
#ifdef RATELIMIT
if (inp != NULL) {
if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
in_pcboutput_txrtlmt(inp, ifp, m);
/* stamp send tag on mbuf */
m->m_pkthdr.snd_tag = inp->inp_snd_tag;
} else {
m->m_pkthdr.snd_tag = NULL;
}
#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
#ifdef RATELIMIT
/* check for route change */
if (error == EAGAIN)
in_pcboutput_eagain(inp);
#endif
} else
m_freem(m);
}
@ -974,6 +1005,16 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp);
error = 0;
break;
case SO_MAX_PACING_RATE:
#ifdef RATELIMIT
INP_WLOCK(inp);
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
INP_WUNLOCK(inp);
error = 0;
#else
error = EOPNOTSUPP;
#endif
break;
default:
break;
}

View File

@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_sctp.h"
#include "opt_route.h"
@ -954,8 +955,23 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
m->m_pkthdr.len);
ifa_free(&ia6->ia_ifa);
}
#ifdef RATELIMIT
if (inp != NULL) {
if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
in_pcboutput_txrtlmt(inp, ifp, m);
/* stamp send tag on mbuf */
m->m_pkthdr.snd_tag = inp->inp_snd_tag;
} else {
m->m_pkthdr.snd_tag = NULL;
}
#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
#ifdef RATELIMIT
/* check for route change */
if (error == EAGAIN)
in_pcboutput_eagain(inp);
#endif
goto done;
}
@ -1054,8 +1070,23 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
counter_u64_add(ia->ia_ifa.ifa_obytes,
m->m_pkthdr.len);
}
#ifdef RATELIMIT
if (inp != NULL) {
if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
in_pcboutput_txrtlmt(inp, ifp, m);
/* stamp send tag on mbuf */
m->m_pkthdr.snd_tag = inp->inp_snd_tag;
} else {
m->m_pkthdr.snd_tag = NULL;
}
#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
#ifdef RATELIMIT
/* check for route change */
if (error == EAGAIN)
in_pcboutput_eagain(inp);
#endif
} else
m_freem(m);
}
@ -1441,6 +1472,16 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(in6p);
error = 0;
break;
case SO_MAX_PACING_RATE:
#ifdef RATELIMIT
INP_WLOCK(in6p);
in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
INP_WUNLOCK(in6p);
error = 0;
#else
error = EOPNOTSUPP;
#endif
break;
default:
break;
}

View File

@ -129,6 +129,14 @@ struct m_tag {
void (*m_tag_free)(struct m_tag *);
};
/*
 * Static network interface owned tag.
 * Allocated through ifp->if_snd_tag_alloc().
 *
 * The "ifp" member records the network interface the tag was
 * allocated from.
 */
struct m_snd_tag {
struct ifnet *ifp; /* network interface tag belongs to */
};
/*
* Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
* Size ILP32: 48
@ -137,7 +145,10 @@ struct m_tag {
* they are correct.
*/
struct pkthdr {
struct ifnet *rcvif; /* rcv interface */
union {
struct m_snd_tag *snd_tag; /* send tag, if any */
struct ifnet *rcvif; /* rcv interface */
};
SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
int32_t len; /* total packet length */

View File

@ -159,6 +159,7 @@ typedef __uintptr_t uintptr_t;
#define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */
#define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */
#define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */
#define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */
#endif
#if __BSD_VISIBLE

View File

@ -128,9 +128,10 @@ struct socket {
uint32_t so_user_cookie;
int so_ts_clock; /* type of the clock used for timestamps */
uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */
void *so_pspare[2]; /* packet pacing / general use */
int so_ispare[2]; /* packet pacing / general use */
void *so_pspare[2]; /* general use */
int so_ispare[2]; /* general use */
};
/*