This adds the third step in getting BBR into the tree. BBR and
an updated RACK depend on having access to the new ratelimit API in this commit. Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D20953
This commit is contained in:
parent
ecb2bbc081
commit
20abea6663
@ -4276,6 +4276,7 @@ netinet/tcp_lro.c optional inet | inet6
|
|||||||
netinet/tcp_output.c optional inet | inet6
|
netinet/tcp_output.c optional inet | inet6
|
||||||
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
|
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
|
||||||
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
|
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
|
||||||
|
netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6
|
||||||
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
|
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
|
||||||
compile-with "${NORMAL_C} ${NO_WNONNULL}"
|
compile-with "${NORMAL_C} ${NO_WNONNULL}"
|
||||||
netinet/tcp_reass.c optional inet | inet6
|
netinet/tcp_reass.c optional inet | inet6
|
||||||
|
@ -1247,6 +1247,7 @@ int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *);
|
|||||||
int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
|
int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
|
||||||
void cxgbe_snd_tag_free(struct m_snd_tag *);
|
void cxgbe_snd_tag_free(struct m_snd_tag *);
|
||||||
void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
|
void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
|
||||||
|
void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* t4_filter.c */
|
/* t4_filter.c */
|
||||||
|
@ -1658,6 +1658,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
|
|||||||
ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
|
ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
|
||||||
ifp->if_snd_tag_query = cxgbe_snd_tag_query;
|
ifp->if_snd_tag_query = cxgbe_snd_tag_query;
|
||||||
ifp->if_snd_tag_free = cxgbe_snd_tag_free;
|
ifp->if_snd_tag_free = cxgbe_snd_tag_free;
|
||||||
|
ifp->if_ratelimit_query = cxgbe_ratelimit_query;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ifp->if_capabilities = T4_CAP;
|
ifp->if_capabilities = T4_CAP;
|
||||||
|
@ -903,4 +903,35 @@ cxgbe_snd_tag_free(struct m_snd_tag *mst)
|
|||||||
}
|
}
|
||||||
mtx_unlock(&cst->lock);
|
mtx_unlock(&cst->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define CXGBE_MAX_FLOWS 4000 /* Testing so far shows this is all the adapter can do */
|
||||||
|
#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be set up */
|
||||||
|
|
||||||
|
void
|
||||||
|
cxgbe_ratelimit_query(struct ifnet *ifp __unused,
|
||||||
|
struct if_ratelimit_query_results *q)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* This is a skeleton and needs future work
|
||||||
|
* by the driver supporters. It should be
|
||||||
|
* enhanced to look at the specific type of
|
||||||
|
* interface and select approprate values
|
||||||
|
* for these settings. This example goes
|
||||||
|
* with an earlier card (t5), it has a maximum
|
||||||
|
* number of 16 rates that the first guys in
|
||||||
|
* select (thus the flags value RT_IS_SELECTABLE).
|
||||||
|
* If it was a fixed table then we would setup a
|
||||||
|
* const array (example mlx5). Note the card tested
|
||||||
|
* can only support reasonably 4000 flows before
|
||||||
|
* the adapter has issues with sending so here
|
||||||
|
* we limit the number of flows using hardware
|
||||||
|
* pacing to that number, other cards may
|
||||||
|
* be able to raise or eliminate this limit.
|
||||||
|
*/
|
||||||
|
q->rate_table = NULL;
|
||||||
|
q->flags = RT_IS_SELECTABLE;
|
||||||
|
q->max_flows = CXGBE_MAX_FLOWS;
|
||||||
|
q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
|
||||||
|
q->min_segment_burst = 4; /* Driver emits 4 in a burst */
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -4070,6 +4070,48 @@ mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define NUM_HDWR_RATES_MLX 13

/*
 * Fixed pacing-rate table handed out by mlx5e_ratelimit_query().
 *
 * NOTE(review): the values look like bytes per second; each trailing
 * comment equals the value multiplied by 8, i.e. presumably the same
 * rate expressed in bits per second.  Entries are listed in ascending
 * order.
 */
static const uint64_t adapter_rates_mlx[] = {
	135375,		/* 1,083,000 */
	180500,		/* 1,444,000 */
	270750,		/* 2,166,000 */
	361000,		/* 2,888,000 */
	541500,		/* 4,332,000 */
	721875,		/* 5,775,000 */
	1082875,	/* 8,663,000 */
	1443875,	/* 11,551,000 */
	2165750,	/* 17,326,000 */
	2887750,	/* 23,102,000 */
	4331625,	/* 34,653,000 */
	5775500,	/* 46,204,000 */
	8663125		/* 69,305,000 */
};

/*
 * The array is deliberately unsized: with an explicit bound a short
 * initializer list would be zero-padded silently and the padding would
 * be advertised as valid rates.  Fail the build instead whenever the
 * table and the advertised count disagree.
 */
_Static_assert(sizeof(adapter_rates_mlx) / sizeof(adapter_rates_mlx[0]) ==
    NUM_HDWR_RATES_MLX,
    "adapter_rates_mlx must contain NUM_HDWR_RATES_MLX entries");
|
||||||
|
|
||||||
|
static void
|
||||||
|
mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* This function needs updating by the driver maintainer!
|
||||||
|
* For the MLX card there are currently (ConectX-4?) 13
|
||||||
|
* pre-set rates and others i.e. ConnectX-5, 6, 7??
|
||||||
|
*
|
||||||
|
* This will change based on later adapters
|
||||||
|
* and this code should be updated to look at ifp
|
||||||
|
* and figure out the specific adapter type
|
||||||
|
* settings i.e. how many rates as well
|
||||||
|
* as if they are fixed (as is shown here) or
|
||||||
|
* if they are dynamic (example chelsio t4). Also if there
|
||||||
|
* is a maximum number of flows that the adapter
|
||||||
|
* can handle that too needs to be updated in
|
||||||
|
* the max_flows field.
|
||||||
|
*/
|
||||||
|
q->rate_table = adapter_rates_mlx;
|
||||||
|
q->flags = RT_IS_FIXED_TABLE;
|
||||||
|
q->max_flows = 0; /* mlx has no limit */
|
||||||
|
q->number_of_rates = NUM_HDWR_RATES_MLX;
|
||||||
|
q->min_segment_burst = 1;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
mlx5e_snd_tag_free(struct m_snd_tag *pmt)
|
mlx5e_snd_tag_free(struct m_snd_tag *pmt)
|
||||||
{
|
{
|
||||||
@ -4155,7 +4197,9 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
|
|||||||
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
|
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
|
||||||
ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
|
ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
|
||||||
ifp->if_snd_tag_query = mlx5e_snd_tag_query;
|
ifp->if_snd_tag_query = mlx5e_snd_tag_query;
|
||||||
|
#ifdef RATELIMIT
|
||||||
|
ifp->if_ratelimit_query = mlx5e_ratelimit_query;
|
||||||
|
#endif
|
||||||
/* set TSO limits so that we don't have to drop TX packets */
|
/* set TSO limits so that we don't have to drop TX packets */
|
||||||
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
|
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
|
||||||
ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
|
ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
|
||||||
|
@ -126,6 +126,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt)
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ifdead_ratelimit_query(struct ifnet *ifp __unused,
|
||||||
|
struct if_ratelimit_query_results *q)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* This guy does not support
|
||||||
|
* this interface. Not sure
|
||||||
|
* why we would specify a
|
||||||
|
* flag on the interface
|
||||||
|
* that says we do.
|
||||||
|
*/
|
||||||
|
q->rate_table = NULL;
|
||||||
|
q->flags = RT_NOSUPPORT;
|
||||||
|
q->max_flows = 0;
|
||||||
|
q->number_of_rates = 0;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
if_dead(struct ifnet *ifp)
|
if_dead(struct ifnet *ifp)
|
||||||
{
|
{
|
||||||
@ -142,4 +159,5 @@ if_dead(struct ifnet *ifp)
|
|||||||
ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
|
ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
|
||||||
ifp->if_snd_tag_query = ifdead_snd_tag_query;
|
ifp->if_snd_tag_query = ifdead_snd_tag_query;
|
||||||
ifp->if_snd_tag_free = ifdead_snd_tag_free;
|
ifp->if_snd_tag_free = ifdead_snd_tag_free;
|
||||||
|
ifp->if_ratelimit_query = ifdead_ratelimit_query;
|
||||||
}
|
}
|
||||||
|
@ -144,6 +144,8 @@ static int lagg_snd_tag_modify(struct m_snd_tag *,
|
|||||||
static int lagg_snd_tag_query(struct m_snd_tag *,
|
static int lagg_snd_tag_query(struct m_snd_tag *,
|
||||||
union if_snd_tag_query_params *);
|
union if_snd_tag_query_params *);
|
||||||
static void lagg_snd_tag_free(struct m_snd_tag *);
|
static void lagg_snd_tag_free(struct m_snd_tag *);
|
||||||
|
static void lagg_ratelimit_query(struct ifnet *,
|
||||||
|
struct if_ratelimit_query_results *);
|
||||||
#endif
|
#endif
|
||||||
static int lagg_setmulti(struct lagg_port *);
|
static int lagg_setmulti(struct lagg_port *);
|
||||||
static int lagg_clrmulti(struct lagg_port *);
|
static int lagg_clrmulti(struct lagg_port *);
|
||||||
@ -537,6 +539,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
|
|||||||
ifp->if_snd_tag_modify = lagg_snd_tag_modify;
|
ifp->if_snd_tag_modify = lagg_snd_tag_modify;
|
||||||
ifp->if_snd_tag_query = lagg_snd_tag_query;
|
ifp->if_snd_tag_query = lagg_snd_tag_query;
|
||||||
ifp->if_snd_tag_free = lagg_snd_tag_free;
|
ifp->if_snd_tag_free = lagg_snd_tag_free;
|
||||||
|
ifp->if_ratelimit_query = lagg_ratelimit_query;
|
||||||
#endif
|
#endif
|
||||||
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
|
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
|
||||||
|
|
||||||
@ -1670,6 +1673,20 @@ lagg_snd_tag_free(struct m_snd_tag *mst)
|
|||||||
free(lst, M_LAGG);
|
free(lst, M_LAGG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* For lagg, we have an indirect
|
||||||
|
* interface. The caller needs to
|
||||||
|
* get a ratelimit tag on the actual
|
||||||
|
* interface the flow will go on.
|
||||||
|
*/
|
||||||
|
q->rate_table = NULL;
|
||||||
|
q->flags = RT_IS_INDIRECT;
|
||||||
|
q->max_flows = 0;
|
||||||
|
q->number_of_rates = 0;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -203,6 +203,8 @@ struct if_snd_tag_alloc_header {
|
|||||||
struct if_snd_tag_alloc_rate_limit {
|
struct if_snd_tag_alloc_rate_limit {
|
||||||
struct if_snd_tag_alloc_header hdr;
|
struct if_snd_tag_alloc_header hdr;
|
||||||
uint64_t max_rate; /* in bytes/s */
|
uint64_t max_rate; /* in bytes/s */
|
||||||
|
uint32_t flags; /* M_NOWAIT or M_WAITOK */
|
||||||
|
uint32_t reserved; /* alignment */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct if_snd_tag_rate_limit_params {
|
struct if_snd_tag_rate_limit_params {
|
||||||
@ -210,7 +212,7 @@ struct if_snd_tag_rate_limit_params {
|
|||||||
uint32_t queue_level; /* 0 (empty) .. 65535 (full) */
|
uint32_t queue_level; /* 0 (empty) .. 65535 (full) */
|
||||||
#define IF_SND_QUEUE_LEVEL_MIN 0
|
#define IF_SND_QUEUE_LEVEL_MIN 0
|
||||||
#define IF_SND_QUEUE_LEVEL_MAX 65535
|
#define IF_SND_QUEUE_LEVEL_MAX 65535
|
||||||
uint32_t reserved; /* padding */
|
uint32_t flags; /* M_NOWAIT or M_WAITOK */
|
||||||
};
|
};
|
||||||
|
|
||||||
union if_snd_tag_alloc_params {
|
union if_snd_tag_alloc_params {
|
||||||
@ -229,11 +231,37 @@ union if_snd_tag_query_params {
|
|||||||
struct if_snd_tag_rate_limit_params unlimited;
|
struct if_snd_tag_rate_limit_params unlimited;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Query return flags, stored in if_ratelimit_query_results.flags */
|
||||||
|
#define RT_NOSUPPORT 0x00000000 /* Not supported */
|
||||||
|
#define RT_IS_INDIRECT 0x00000001 /*
 * Indirect interface such as a lagg;
 * select the actual interface to
 * learn the capabilities.
 */
|
||||||
|
#define RT_IS_SELECTABLE 0x00000002 /*
 * No rate table; the caller selects
 * rates and the first number_of_rates
 * of them are created.
 */
|
||||||
|
#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */
|
||||||
|
#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */
|
||||||
|
|
||||||
|
/*
 * Result record filled in by an interface's if_ratelimit_query method.
 */
struct if_ratelimit_query_results {
|
||||||
|
const uint64_t *rate_table; /* Pointer to the rate table if present, else NULL */
|
||||||
|
uint32_t flags; /* RT_* flags indicating results */
|
||||||
|
uint32_t max_flows; /* Max flows using rate limiting, 0=unlimited */
|
||||||
|
uint32_t number_of_rates; /* How many unique rates can be created */
|
||||||
|
uint32_t min_segment_burst; /* The amount the adapter bursts at each send */
|
||||||
|
};
|
||||||
|
|
||||||
typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
|
typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
|
||||||
struct m_snd_tag **);
|
struct m_snd_tag **);
|
||||||
typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
|
typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
|
||||||
typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
|
typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
|
||||||
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
|
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
|
||||||
|
typedef void (if_ratelimit_query_t)(struct ifnet *,
|
||||||
|
struct if_ratelimit_query_results *);
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Structure defining a network interface.
|
* Structure defining a network interface.
|
||||||
@ -374,6 +402,7 @@ struct ifnet {
|
|||||||
if_snd_tag_modify_t *if_snd_tag_modify;
|
if_snd_tag_modify_t *if_snd_tag_modify;
|
||||||
if_snd_tag_query_t *if_snd_tag_query;
|
if_snd_tag_query_t *if_snd_tag_query;
|
||||||
if_snd_tag_free_t *if_snd_tag_free;
|
if_snd_tag_free_t *if_snd_tag_free;
|
||||||
|
if_ratelimit_query_t *if_ratelimit_query;
|
||||||
|
|
||||||
/* Ethernet PCP */
|
/* Ethernet PCP */
|
||||||
uint8_t if_pcp;
|
uint8_t if_pcp;
|
||||||
|
@ -210,6 +210,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
|
|||||||
&VNET_NAME(ipport_randomtime), 0,
|
&VNET_NAME(ipport_randomtime), 0,
|
||||||
"Minimum time to keep sequental port "
|
"Minimum time to keep sequental port "
|
||||||
"allocation before switching to a random one");
|
"allocation before switching to a random one");
|
||||||
|
|
||||||
|
#ifdef RATELIMIT
|
||||||
|
/*
 * Rate-limit send tag statistics, allocated by rl_init() and exported
 * read-only through the "rl" sysctl node declared below.
 * in_pcbattach_txrtlmt() adds one to set_ok and active when a tag is
 * allocated and one to alloc_fail when the allocation fails;
 * in_pcbdetach_tag() subtracts one from active.
 */
counter_u64_t rate_limit_active;
|
||||||
|
counter_u64_t rate_limit_alloc_fail;
|
||||||
|
counter_u64_t rate_limit_set_ok;
|
||||||
|
|
||||||
|
/* Parent node for the three counters; child of _net_inet_ip. */
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
|
||||||
|
"IP Rate Limiting");
|
||||||
|
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
|
||||||
|
&rate_limit_active, "Active rate limited connections");
|
||||||
|
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
|
||||||
|
&rate_limit_alloc_fail, "Rate limited connection failures");
|
||||||
|
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
|
||||||
|
&rate_limit_set_ok, "Rate limited setting succeeded");
|
||||||
|
#endif /* RATELIMIT */
|
||||||
|
|
||||||
#endif /* INET */
|
#endif /* INET */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -3170,6 +3186,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
|
|||||||
{
|
{
|
||||||
union if_snd_tag_modify_params params = {
|
union if_snd_tag_modify_params params = {
|
||||||
.rate_limit.max_rate = max_pacing_rate,
|
.rate_limit.max_rate = max_pacing_rate,
|
||||||
|
.rate_limit.flags = M_NOWAIT,
|
||||||
};
|
};
|
||||||
struct m_snd_tag *mst;
|
struct m_snd_tag *mst;
|
||||||
struct ifnet *ifp;
|
struct ifnet *ifp;
|
||||||
@ -3256,7 +3273,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
|
|||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
|
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
|
||||||
uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
|
uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
|
||||||
|
|
||||||
{
|
{
|
||||||
union if_snd_tag_alloc_params params = {
|
union if_snd_tag_alloc_params params = {
|
||||||
.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
|
.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
|
||||||
@ -3264,22 +3282,47 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
|
|||||||
.rate_limit.hdr.flowid = flowid,
|
.rate_limit.hdr.flowid = flowid,
|
||||||
.rate_limit.hdr.flowtype = flowtype,
|
.rate_limit.hdr.flowtype = flowtype,
|
||||||
.rate_limit.max_rate = max_pacing_rate,
|
.rate_limit.max_rate = max_pacing_rate,
|
||||||
|
.rate_limit.flags = M_NOWAIT,
|
||||||
};
|
};
|
||||||
int error;
|
int error;
|
||||||
|
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_WLOCK_ASSERT(inp);
|
||||||
|
|
||||||
if (inp->inp_snd_tag != NULL)
|
if (*st != NULL)
|
||||||
return (EINVAL);
|
return (EINVAL);
|
||||||
|
|
||||||
if (ifp->if_snd_tag_alloc == NULL) {
|
if (ifp->if_snd_tag_alloc == NULL) {
|
||||||
error = EOPNOTSUPP;
|
error = EOPNOTSUPP;
|
||||||
} else {
|
} else {
|
||||||
error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
|
error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
|
||||||
|
|
||||||
|
if (error == 0) {
|
||||||
|
counter_u64_add(rate_limit_set_ok, 1);
|
||||||
|
counter_u64_add(rate_limit_active, 1);
|
||||||
|
} else
|
||||||
|
counter_u64_add(rate_limit_alloc_fail, 1);
|
||||||
}
|
}
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
|
||||||
|
{
|
||||||
|
if (ifp == NULL)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the device was detached while we still had reference(s)
|
||||||
|
* on the ifp, we assume if_snd_tag_free() was replaced with
|
||||||
|
* stubs.
|
||||||
|
*/
|
||||||
|
ifp->if_snd_tag_free(mst);
|
||||||
|
|
||||||
|
/* release reference count on network interface */
|
||||||
|
if_rele(ifp);
|
||||||
|
counter_u64_add(rate_limit_active, -1);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
|
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
|
||||||
* if any:
|
* if any:
|
||||||
@ -3300,6 +3343,56 @@ in_pcbdetach_txrtlmt(struct inpcb *inp)
|
|||||||
m_snd_tag_rele(mst);
|
m_snd_tag_rele(mst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
|
||||||
|
{
|
||||||
|
int error;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the existing send tag is for the wrong interface due to
|
||||||
|
* a route change, first drop the existing tag. Set the
|
||||||
|
* CHANGED flag so that we will keep trying to allocate a new
|
||||||
|
* tag if we fail to allocate one this time.
|
||||||
|
*/
|
||||||
|
if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
|
||||||
|
in_pcbdetach_txrtlmt(inp);
|
||||||
|
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* NOTE: When attaching to a network interface a reference is
|
||||||
|
* made to ensure the network interface doesn't go away until
|
||||||
|
* all ratelimit connections are gone. The network interface
|
||||||
|
* pointers compared below represent valid network interfaces,
|
||||||
|
* except when comparing towards NULL.
|
||||||
|
*/
|
||||||
|
if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
|
||||||
|
error = 0;
|
||||||
|
} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
|
||||||
|
if (inp->inp_snd_tag != NULL)
|
||||||
|
in_pcbdetach_txrtlmt(inp);
|
||||||
|
error = 0;
|
||||||
|
} else if (inp->inp_snd_tag == NULL) {
|
||||||
|
/*
|
||||||
|
* In order to utilize packet pacing with RSS, we need
|
||||||
|
* to wait until there is a valid RSS hash before we
|
||||||
|
* can proceed:
|
||||||
|
*/
|
||||||
|
if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
|
||||||
|
error = EAGAIN;
|
||||||
|
} else {
|
||||||
|
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
|
||||||
|
mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
|
||||||
|
}
|
||||||
|
if (error == 0 || error == EOPNOTSUPP)
|
||||||
|
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
|
||||||
|
|
||||||
|
return (error);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function should be called when the INP_RATE_LIMIT_CHANGED flag
|
* This function should be called when the INP_RATE_LIMIT_CHANGED flag
|
||||||
* is set in the fast path and will attach/detach/modify the TX rate
|
* is set in the fast path and will attach/detach/modify the TX rate
|
||||||
@ -3342,47 +3435,8 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
|
|||||||
*/
|
*/
|
||||||
max_pacing_rate = socket->so_max_pacing_rate;
|
max_pacing_rate = socket->so_max_pacing_rate;
|
||||||
|
|
||||||
/*
|
error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
|
||||||
* If the existing send tag is for the wrong interface due to
|
|
||||||
* a route change, first drop the existing tag. Set the
|
|
||||||
* CHANGED flag so that we will keep trying to allocate a new
|
|
||||||
* tag if we fail to allocate one this time.
|
|
||||||
*/
|
|
||||||
if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
|
|
||||||
in_pcbdetach_txrtlmt(inp);
|
|
||||||
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* NOTE: When attaching to a network interface a reference is
|
|
||||||
* made to ensure the network interface doesn't go away until
|
|
||||||
* all ratelimit connections are gone. The network interface
|
|
||||||
* pointers compared below represent valid network interfaces,
|
|
||||||
* except when comparing towards NULL.
|
|
||||||
*/
|
|
||||||
if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
|
|
||||||
error = 0;
|
|
||||||
} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
|
|
||||||
if (inp->inp_snd_tag != NULL)
|
|
||||||
in_pcbdetach_txrtlmt(inp);
|
|
||||||
error = 0;
|
|
||||||
} else if (inp->inp_snd_tag == NULL) {
|
|
||||||
/*
|
|
||||||
* In order to utilize packet pacing with RSS, we need
|
|
||||||
* to wait until there is a valid RSS hash before we
|
|
||||||
* can proceed:
|
|
||||||
*/
|
|
||||||
if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
|
|
||||||
error = EAGAIN;
|
|
||||||
} else {
|
|
||||||
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
|
|
||||||
mb->m_pkthdr.flowid, max_pacing_rate);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
|
|
||||||
}
|
|
||||||
if (error == 0 || error == EOPNOTSUPP)
|
|
||||||
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
|
|
||||||
if (did_upgrade)
|
if (did_upgrade)
|
||||||
INP_DOWNGRADE(inp);
|
INP_DOWNGRADE(inp);
|
||||||
}
|
}
|
||||||
@ -3424,4 +3478,14 @@ in_pcboutput_eagain(struct inpcb *inp)
|
|||||||
if (did_upgrade)
|
if (did_upgrade)
|
||||||
INP_DOWNGRADE(inp);
|
INP_DOWNGRADE(inp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
rl_init(void *st)
|
||||||
|
{
|
||||||
|
rate_limit_active = counter_u64_alloc(M_WAITOK);
|
||||||
|
rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
|
||||||
|
rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
|
||||||
|
}
|
||||||
|
|
||||||
|
SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
|
||||||
#endif /* RATELIMIT */
|
#endif /* RATELIMIT */
|
||||||
|
@ -883,8 +883,13 @@ struct sockaddr *
|
|||||||
in_sockaddr(in_port_t port, struct in_addr *addr);
|
in_sockaddr(in_port_t port, struct in_addr *addr);
|
||||||
void in_pcbsosetlabel(struct socket *so);
|
void in_pcbsosetlabel(struct socket *so);
|
||||||
#ifdef RATELIMIT
|
#ifdef RATELIMIT
|
||||||
int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
|
int
|
||||||
|
in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
|
||||||
|
struct mbuf *, uint32_t);
|
||||||
|
int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
|
||||||
|
uint32_t, struct m_snd_tag **);
|
||||||
void in_pcbdetach_txrtlmt(struct inpcb *);
|
void in_pcbdetach_txrtlmt(struct inpcb *);
|
||||||
|
void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
|
||||||
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
|
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
|
||||||
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
|
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
|
||||||
int in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
|
int in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
|
||||||
|
1234
sys/netinet/tcp_ratelimit.c
Normal file
1234
sys/netinet/tcp_ratelimit.c
Normal file
File diff suppressed because it is too large
Load Diff
141
sys/netinet/tcp_ratelimit.h
Normal file
141
sys/netinet/tcp_ratelimit.h
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
/*-
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
*
|
||||||
|
* Copyright (c) 2018-2019
|
||||||
|
* Netflix Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* __FBSDID("$FreeBSD$");
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* Author: Randall Stewart <rrs@netflix.com>
|
||||||
|
*/
|
||||||
|
#ifndef __tcp_ratelimit_h__
|
||||||
|
#define __tcp_ratelimit_h__
|
||||||
|
|
||||||
|
struct m_snd_tag;
|
||||||
|
|
||||||
|
/* Flags on an individual rate */
|
||||||
|
#define HDWRPACE_INITED 0x0001
|
||||||
|
#define HDWRPACE_TAGPRESENT 0x0002
|
||||||
|
#define HDWRPACE_IFPDEPARTED 0x0004
|
||||||
|
struct tcp_hwrate_limit_table {
|
||||||
|
const struct tcp_rate_set *ptbl; /* Pointer to parent table */
|
||||||
|
struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
|
||||||
|
uint64_t rate; /* Rate we get in Bytes per second (Bps) */
|
||||||
|
uint32_t time_between; /* Time-Gap between packets at this rate */
|
||||||
|
uint32_t flags;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Rateset flags */
|
||||||
|
#define RS_IS_DEFF 0x0001 /* Its a lagg, do a double lookup */
|
||||||
|
#define RS_IS_INTF 0x0002 /* Its a plain interface */
|
||||||
|
#define RS_NO_PRE 0x0004 /* The interfacd has set rates */
|
||||||
|
#define RS_INT_TBL 0x0010 /*
|
||||||
|
* The table is the internal version
|
||||||
|
* which has special setup requirements.
|
||||||
|
*/
|
||||||
|
#define RS_IS_DEAD 0x0020 /* The RS is dead list */
|
||||||
|
#define RS_FUNERAL_SCHD 0x0040 /* Is a epoch call scheduled to bury this guy?*/
|
||||||
|
#define RS_INTF_NO_SUP 0x0100 /* The interface does not support the ratelimiting */
|
||||||
|
|
||||||
|
struct tcp_rate_set {
|
||||||
|
struct sysctl_ctx_list sysctl_ctx;
|
||||||
|
CK_LIST_ENTRY(tcp_rate_set) next;
|
||||||
|
struct ifnet *rs_ifp;
|
||||||
|
struct tcp_hwrate_limit_table *rs_rlt;
|
||||||
|
uint64_t rs_flows_using;
|
||||||
|
uint64_t rs_flow_limit;
|
||||||
|
uint32_t rs_if_dunit;
|
||||||
|
int rs_rate_cnt;
|
||||||
|
int rs_min_seg;
|
||||||
|
int rs_highest_valid;
|
||||||
|
int rs_lowest_valid;
|
||||||
|
int rs_disable;
|
||||||
|
int rs_flags;
|
||||||
|
struct epoch_context rs_epoch_ctx;
|
||||||
|
};
|
||||||
|
|
||||||
|
CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
|
||||||
|
|
||||||
|
/* Request flags */
|
||||||
|
#define RS_PACING_EXACT_MATCH 0x0001 /* Need an exact match for rate */
|
||||||
|
#define RS_PACING_GT 0x0002 /* Greater than requested */
|
||||||
|
#define RS_PACING_GEQ 0x0004 /* Greater than or equal too */
|
||||||
|
#define RS_PACING_LT 0x0008 /* Less than requested rate */
|
||||||
|
#define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found get the
|
||||||
|
* next best rate (highest or lowest). */
|
||||||
|
#ifdef RATELIMIT
|
||||||
|
#ifdef _KERNEL
|
||||||
|
#define DETAILED_RATELIMIT_SYSCTL 1 /*
|
||||||
|
* Undefine this if you don't want
|
||||||
|
* detailed rates to appear in
|
||||||
|
* net.inet.tcp.rl.
|
||||||
|
* With the defintion each rate
|
||||||
|
* shows up in your sysctl tree
|
||||||
|
* this can be big.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const struct tcp_hwrate_limit_table *
|
||||||
|
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
|
||||||
|
uint64_t bytes_per_sec, int flags, int *error);
|
||||||
|
|
||||||
|
const struct tcp_hwrate_limit_table *
|
||||||
|
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
|
||||||
|
struct tcpcb *tp, struct ifnet *ifp,
|
||||||
|
uint64_t bytes_per_sec, int flags, int *error);
|
||||||
|
void
|
||||||
|
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
|
||||||
|
struct tcpcb *tp);
|
||||||
|
#else
|
||||||
|
static inline const struct tcp_hwrate_limit_table *
|
||||||
|
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
|
||||||
|
uint64_t bytes_per_sec, int flags, int *error)
|
||||||
|
{
|
||||||
|
if (error)
|
||||||
|
*error = EOPNOTSUPP;
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline const struct tcp_hwrate_limit_table *
|
||||||
|
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
|
||||||
|
struct tcpcb *tp, struct ifnet *ifp,
|
||||||
|
uint64_t bytes_per_sec, int flags, int *error)
|
||||||
|
{
|
||||||
|
if (error)
|
||||||
|
*error = EOPNOTSUPP;
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
|
||||||
|
struct tcpcb *tp)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
Loading…
x
Reference in New Issue
Block a user