From 20abea6663c46423800256b0d29efea32d36a10f Mon Sep 17 00:00:00 2001
From: Randall Stewart <rrs@FreeBSD.org>
Date: Thu, 1 Aug 2019 14:17:31 +0000
Subject: [PATCH] This adds the third step in getting BBR into the tree. BBR
 and an updated rack depend on having access to the new ratelimit api in this
 commit.

Sponsored by:	Netflix Inc.
Differential Revision:	https://reviews.freebsd.org/D20953
---
 sys/conf/files                      |    1 +
 sys/dev/cxgbe/adapter.h             |    1 +
 sys/dev/cxgbe/t4_main.c             |    1 +
 sys/dev/cxgbe/t4_sched.c            |   31 +
 sys/dev/mlx5/mlx5_en/mlx5_en_main.c |   46 +-
 sys/net/if_dead.c                   |   18 +
 sys/net/if_lagg.c                   |   17 +
 sys/net/if_var.h                    |   31 +-
 sys/netinet/in_pcb.c                |  148 +++-
 sys/netinet/in_pcb.h                |    7 +-
 sys/netinet/tcp_ratelimit.c         | 1234 +++++++++++++++++++++++++++
 sys/netinet/tcp_ratelimit.h         |  141 +++
 12 files changed, 1631 insertions(+), 45 deletions(-)
 create mode 100644 sys/netinet/tcp_ratelimit.c
 create mode 100644 sys/netinet/tcp_ratelimit.h

diff --git a/sys/conf/files b/sys/conf/files
index 780676362156..bbc66f88bc41 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4276,6 +4276,7 @@ netinet/tcp_lro.c		optional inet | inet6
 netinet/tcp_output.c		optional inet | inet6
 netinet/tcp_offload.c		optional tcp_offload inet | tcp_offload inet6
 netinet/tcp_hpts.c              optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c         optional ratelimit inet | ratelimit inet6
 netinet/tcp_pcap.c		optional inet tcppcap | inet6 tcppcap \
 	compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c		optional inet | inet6
diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index e8eeb3333ac0..07c41904df61 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -1247,6 +1247,7 @@ int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *);
 int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
 void cxgbe_snd_tag_free(struct m_snd_tag *);
 void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
 #endif
 
 /* t4_filter.c */
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index 43ea97598be4..654b984a793b 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -1658,6 +1658,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 	ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
 	ifp->if_snd_tag_query = cxgbe_snd_tag_query;
 	ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+	ifp->if_ratelimit_query = cxgbe_ratelimit_query;
 #endif
 
 	ifp->if_capabilities = T4_CAP;
diff --git a/sys/dev/cxgbe/t4_sched.c b/sys/dev/cxgbe/t4_sched.c
index 31335ae7ce85..b7535912b40d 100644
--- a/sys/dev/cxgbe/t4_sched.c
+++ b/sys/dev/cxgbe/t4_sched.c
@@ -903,4 +903,35 @@ cxgbe_snd_tag_free(struct m_snd_tag *mst)
 	}
 	mtx_unlock(&cst->lock);
 }
+
+#define CXGBE_MAX_FLOWS 4000	/* Testing show so far thats all this adapter can do */
+#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be setup */
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+     struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This is a skeleton and needs future work
+	 * by the driver supporters. It should be
+	 * enhanced to look at the specific type of
+	 * interface and select approprate values
+	 * for these settings. This example goes
+	 * with an earlier card (t5), it has a maximum
+	 * number of 16 rates that the first guys in
+	 * select (thus the flags value RT_IS_SELECTABLE).
+	 * If it was a fixed table then we would setup a
+	 * const array (example mlx5). Note the card tested
+	 * can only support reasonably 4000 flows before
+	 * the adapter has issues with sending so here 
+	 * we limit the number of flows using hardware
+	 * pacing to that number, other cards may
+	 * be able to raise or eliminate this limit.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_IS_SELECTABLE;
+	q->max_flows = CXGBE_MAX_FLOWS;
+	q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+	q->min_segment_burst = 4;	/* Driver emits 4 in a burst */
+}
 #endif
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index edd31392510a..23c91281e8d3 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -4070,6 +4070,48 @@ mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params
 	}
 }
 
+#define NUM_HDWR_RATES_MLX 13
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+	135375,			/* 1,083,000 */
+	180500,			/* 1,444,000 */
+	270750,			/* 2,166,000 */
+	361000,			/* 2,888,000 */
+	541500,			/* 4,332,000 */
+	721875,			/* 5,775,000 */
+	1082875,		/* 8,663,000 */
+	1443875,		/* 11,551,000 */
+	2165750,		/* 17,326,000 */
+	2887750,		/* 23,102,000 */
+	4331625,		/* 34,653,000 */
+	5775500,		/* 46,204,000 */
+	8663125			/* 69,305,000 */
+};
+
+static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This function needs updating by the driver maintainer!
+	 * For the MLX card there are currently (ConectX-4?) 13 
+	 * pre-set rates and others i.e. ConnectX-5, 6, 7??
+	 *
+	 * This will change based on later adapters
+	 * and this code should be updated to look at ifp
+	 * and figure out the specific adapter type
+	 * settings i.e. how many rates as well
+	 * as if they are fixed (as is shown here) or
+	 * if they are dynamic (example chelsio t4). Also if there
+	 * is a maximum number of flows that the adapter
+	 * can handle that too needs to be updated in
+	 * the max_flows field.
+	 */
+	q->rate_table = adapter_rates_mlx;
+	q->flags = RT_IS_FIXED_TABLE;
+	q->max_flows = 0;	/* mlx has no limit */
+	q->number_of_rates = NUM_HDWR_RATES_MLX;
+	q->min_segment_burst = 1;
+}
+
 static void
 mlx5e_snd_tag_free(struct m_snd_tag *pmt)
 {
@@ -4155,7 +4197,9 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 	ifp->if_snd_tag_free = mlx5e_snd_tag_free;
 	ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
 	ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+	ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
diff --git a/sys/net/if_dead.c b/sys/net/if_dead.c
index 82d0888f6e7d..b01d17fe9b1b 100644
--- a/sys/net/if_dead.c
+++ b/sys/net/if_dead.c
@@ -126,6 +126,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt)
 {
 }
 
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+      struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This guy does not support
+	 * this interface. Not sure
+	 * why we would specify a
+	 * flag on the interface
+	 * that says we do.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_NOSUPPORT;
+	q->max_flows = 0;
+	q->number_of_rates = 0;
+}
+
 void
 if_dead(struct ifnet *ifp)
 {
@@ -142,4 +159,5 @@ if_dead(struct ifnet *ifp)
 	ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
 	ifp->if_snd_tag_query = ifdead_snd_tag_query;
 	ifp->if_snd_tag_free = ifdead_snd_tag_free;
+	ifp->if_ratelimit_query = ifdead_ratelimit_query;
 }
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index 3e1cb2043b08..911f9c0cdbb6 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -144,6 +144,8 @@ static int	lagg_snd_tag_modify(struct m_snd_tag *,
 static int	lagg_snd_tag_query(struct m_snd_tag *,
 		    union if_snd_tag_query_params *);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
+static void     lagg_ratelimit_query(struct ifnet *,
+		    struct if_ratelimit_query_results *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
 static int	lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
 	ifp->if_snd_tag_query = lagg_snd_tag_query;
 	ifp->if_snd_tag_free = lagg_snd_tag_free;
+	ifp->if_ratelimit_query = lagg_ratelimit_query;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
@@ -1670,6 +1673,20 @@ lagg_snd_tag_free(struct m_snd_tag *mst)
 	free(lst, M_LAGG);
 }
 
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+	/*
+	 * For lagg, we have an indirect
+	 * interface. The caller needs to
+	 * get a ratelimit tag on the actual
+	 * interface the flow will go on.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_IS_INDIRECT;
+	q->max_flows = 0;
+	q->number_of_rates = 0;
+}
 #endif
 
 static int
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index f6388b147d97..1e81e481f8eb 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -203,6 +203,8 @@ struct if_snd_tag_alloc_header {
 struct if_snd_tag_alloc_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	uint64_t max_rate;	/* in bytes/s */
+	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
+	uint32_t reserved;	/* alignment */
 };
 
 struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@ struct if_snd_tag_rate_limit_params {
 	uint32_t queue_level;	/* 0 (empty) .. 65535 (full) */
 #define	IF_SND_QUEUE_LEVEL_MIN 0
 #define	IF_SND_QUEUE_LEVEL_MAX 65535
-	uint32_t reserved;	/* padding */
+	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 };
 
 union if_snd_tag_alloc_params {
@@ -229,11 +231,37 @@ union if_snd_tag_query_params {
 	struct if_snd_tag_rate_limit_params unlimited;
 };
 
+/* Query return flags */
+#define RT_NOSUPPORT	  0x00000000	/* Not supported */
+#define RT_IS_INDIRECT    0x00000001	/*
+					 * Interface like a lagg, select
+					 * the actual interface for
+					 * capabilities.
+					 */
+#define RT_IS_SELECTABLE  0x00000002	/*
+					 * No rate table, you select
+					 * rates and the first
+					 * number_of_rates are created.
+					 */
+#define RT_IS_FIXED_TABLE 0x00000004	/* A fixed table is attached */
+#define RT_IS_UNUSABLE	  0x00000008	/* It is not usable for this */
+
+struct if_ratelimit_query_results {
+	const uint64_t *rate_table;	/* Pointer to table if present */
+	uint32_t flags;			/* Flags indicating results */
+	uint32_t max_flows;		/* Max flows using, 0=unlimited */
+	uint32_t number_of_rates;	/* How many unique rates can be created */
+	uint32_t min_segment_burst;	/* The amount the adapter bursts at each send */
+};
+
 typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
 typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
 typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+    struct if_ratelimit_query_results *);
+
 
 /*
  * Structure defining a network interface.
@@ -374,6 +402,7 @@ struct ifnet {
 	if_snd_tag_modify_t *if_snd_tag_modify;
 	if_snd_tag_query_t *if_snd_tag_query;
 	if_snd_tag_free_t *if_snd_tag_free;
+	if_ratelimit_query_t *if_ratelimit_query;
 
 	/* Ethernet PCP */
 	uint8_t if_pcp;
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index b68475afa655..f6e93cb953f9 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -210,6 +210,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
+
+#ifdef RATELIMIT
+counter_u64_t rate_limit_active;
+counter_u64_t rate_limit_alloc_fail;
+counter_u64_t rate_limit_set_ok;
+
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
+    "IP Rate Limiting");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
+    &rate_limit_active, "Active rate limited connections");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
+   &rate_limit_alloc_fail, "Rate limited connection failures");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
+   &rate_limit_set_ok, "Rate limited setting succeeded");
+#endif /* RATELIMIT */
+
 #endif /* INET */
 
 /*
@@ -3170,6 +3186,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
+		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
@@ -3256,7 +3273,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
-    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
+
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
@@ -3264,22 +3282,47 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = max_pacing_rate,
+		.rate_limit.flags = M_NOWAIT,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
-	if (inp->inp_snd_tag != NULL)
+	if (*st != NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+		if (error == 0) {
+			counter_u64_add(rate_limit_set_ok, 1);
+			counter_u64_add(rate_limit_active, 1);
+		} else
+			counter_u64_add(rate_limit_alloc_fail, 1);
 	}
 	return (error);
 }
 
+void
+in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
+{
+	if (ifp == NULL)
+		return;
+
+	/*
+	 * If the device was detached while we still had reference(s)
+	 * on the ifp, we assume if_snd_tag_free() was replaced with
+	 * stubs.
+	 */
+	ifp->if_snd_tag_free(mst);
+
+	/* release reference count on network interface */
+	if_rele(ifp);
+	counter_u64_add(rate_limit_active, -1);
+}
+
 /*
  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
  * if any:
@@ -3300,6 +3343,56 @@ in_pcbdetach_txrtlmt(struct inpcb *inp)
 	m_snd_tag_rele(mst);
 }
 
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
+{
+	int error;
+
+	/*
+	 * If the existing send tag is for the wrong interface due to
+	 * a route change, first drop the existing tag.  Set the
+	 * CHANGED flag so that we will keep trying to allocate a new
+	 * tag if we fail to allocate one this time.
+	 */
+	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
+		in_pcbdetach_txrtlmt(inp);
+		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+	}
+
+	/*
+	 * NOTE: When attaching to a network interface a reference is
+	 * made to ensure the network interface doesn't go away until
+	 * all ratelimit connections are gone. The network interface
+	 * pointers compared below represent valid network interfaces,
+	 * except when comparing towards NULL.
+	 */
+	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
+		error = 0;
+	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
+		if (inp->inp_snd_tag != NULL)
+			in_pcbdetach_txrtlmt(inp);
+		error = 0;
+	} else if (inp->inp_snd_tag == NULL) {
+		/*
+		 * In order to utilize packet pacing with RSS, we need
+		 * to wait until there is a valid RSS hash before we
+		 * can proceed:
+		 */
+		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
+			error = EAGAIN;
+		} else {
+			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
+			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
+		}
+	} else {
+		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
+	}
+	if (error == 0 || error == EOPNOTSUPP)
+		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+
+	return (error);
+}
+
 /*
  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
  * is set in the fast path and will attach/detach/modify the TX rate
@@ -3342,47 +3435,8 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 	 */
 	max_pacing_rate = socket->so_max_pacing_rate;
 
-	/*
-	 * If the existing send tag is for the wrong interface due to
-	 * a route change, first drop the existing tag.  Set the
-	 * CHANGED flag so that we will keep trying to allocate a new
-	 * tag if we fail to allocate one this time.
-	 */
-	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
-		in_pcbdetach_txrtlmt(inp);
-		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
-	}
+	error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
 
-	/*
-	 * NOTE: When attaching to a network interface a reference is
-	 * made to ensure the network interface doesn't go away until
-	 * all ratelimit connections are gone. The network interface
-	 * pointers compared below represent valid network interfaces,
-	 * except when comparing towards NULL.
-	 */
-	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
-		error = 0;
-	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
-		if (inp->inp_snd_tag != NULL)
-			in_pcbdetach_txrtlmt(inp);
-		error = 0;
-	} else if (inp->inp_snd_tag == NULL) {
-		/*
-		 * In order to utilize packet pacing with RSS, we need
-		 * to wait until there is a valid RSS hash before we
-		 * can proceed:
-		 */
-		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
-			error = EAGAIN;
-		} else {
-			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
-			    mb->m_pkthdr.flowid, max_pacing_rate);
-		}
-	} else {
-		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
-	}
-	if (error == 0 || error == EOPNOTSUPP)
-		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
@@ -3424,4 +3478,14 @@ in_pcboutput_eagain(struct inpcb *inp)
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
+
+static void
+rl_init(void *st)
+{
+	rate_limit_active = counter_u64_alloc(M_WAITOK);
+	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
+	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
+}
+
+SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 #endif /* RATELIMIT */
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 1d2ee37ec9dc..321f9a96f3bc 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -883,8 +883,13 @@ struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
-int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
+	    struct mbuf *, uint32_t);
+int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
+	    uint32_t, struct m_snd_tag **);
 void	in_pcbdetach_txrtlmt(struct inpcb *);
+void    in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
 int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
 int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
 int	in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
new file mode 100644
index 000000000000..cd11855e46d8
--- /dev/null
+++ b/sys/netinet/tcp_ratelimit.c
@@ -0,0 +1,1234 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ *	Netflix Inc.
+ *      All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#define TCPSTATES		/* for logging */
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * For the purposes of each send, what is the size
+ * of an ethernet frame.
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+#define COMMON_RATE 180500
+uint64_t desired_rates[] = {
+	62500,			/* 500Kbps */
+	180500,			/* 1.44Mpbs */
+	375000,			/* 3Mbps */
+	500000,			/* 4Mbps */
+	625000,			/* 5Mbps */
+	750000,			/* 6Mbps */
+	1000000,		/* 8Mbps */
+	1250000,		/* 10Mbps */
+	2500000,		/* 20Mbps */
+	3750000,		/* 30Mbps */
+	5000000,		/* 40Meg */
+	6250000,		/* 50Mbps */
+	12500000,		/* 100Mbps */
+	25000000,		/* 200Mbps */
+	50000000,		/* 400Mbps */
+	100000000,		/* 800Mbps */
+	12500,			/* 100kbps */
+	25000,			/* 200kbps */
+	875000,			/* 7Mbps */
+	1125000,		/* 9Mbps */
+	1875000,		/* 15Mbps */
+	3125000,		/* 25Mbps */
+	8125000,		/* 65Mbps */
+	10000000,		/* 80Mbps */
+	18750000,		/* 150Mbps */
+	20000000,		/* 250Mbps */
+	37500000,		/* 350Mbps */
+	62500000,		/* 500Mbps */
+	78125000,		/* 625Mbps */
+	125000000,		/* 1Gbps */
+};
+#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
+#define RS_ORDERED_COUNT 16	/*
+				 * Number that are in order
+				 * at the beginning of the table,
+				 * over this a sort is required.
+				 */
+#define RS_NEXT_ORDER_GROUP 16	/*
+				 * The point in our table where
+				 * we come fill in a second ordered
+				 * group (index wise means -1).
+				 */
+#define ALL_HARDWARE_RATES 1004 /*
+				 * 1Meg - 1Gig in 1 Meg steps
+				 * plus 100, 200k  and 500k and
+				 * 10Gig
+				 */
+
+#define RS_ONE_MEGABIT_PERSEC 1000000
+#define RS_ONE_GIGABIT_PERSEC 1000000000
+#define RS_TEN_GIGABIT_PERSEC 10000000000
+
+static struct head_tcp_rate_set int_rs;
+static struct mtx rs_mtx;
+uint32_t rs_number_alive;
+uint32_t rs_number_dead;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
+    "TCP Ratelimit stats");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
+    &rs_number_alive, 0,
+    "Number of interfaces initialized for ratelimiting");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
+    &rs_number_dead, 0,
+    "Number of interfaces departing from ratelimiting");
+
+static void
+rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
+{
+	/*
+	 * Add sysctl entries for thus interface.
+	 */
+	if (rs->rs_flags & RS_INTF_NO_SUP) {
+		SYSCTL_ADD_S32(&rs->sysctl_ctx,
+		   SYSCTL_CHILDREN(rl_sysctl_root),
+		   OID_AUTO, "disable", CTLFLAG_RD,
+		   &rs->rs_disable, 0,
+		   "Disable this interface from new hdwr limiting?");
+	} else {
+		SYSCTL_ADD_S32(&rs->sysctl_ctx,
+		   SYSCTL_CHILDREN(rl_sysctl_root),
+		   OID_AUTO, "disable", CTLFLAG_RW,
+		   &rs->rs_disable, 0,
+		   "Disable this interface from new hdwr limiting?");
+	}
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "minseg", CTLFLAG_RW,
+	    &rs->rs_min_seg, 0,
+	    "What is the minimum we need to send on this interface?");
+	SYSCTL_ADD_U64(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "flow_limit", CTLFLAG_RW,
+	    &rs->rs_flow_limit, 0,
+	    "What is the limit for number of flows (0=unlimited)?");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "highest", CTLFLAG_RD,
+	    &rs->rs_highest_valid, 0,
+	    "Highest valid rate");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "lowest", CTLFLAG_RD,
+	    &rs->rs_lowest_valid, 0,
+	    "Lowest valid rate");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "flags", CTLFLAG_RD,
+	    &rs->rs_flags, 0,
+	    "What lags are on the entry?");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "numrates", CTLFLAG_RD,
+	    &rs->rs_rate_cnt, 0,
+	    "How many rates re there?");
+	SYSCTL_ADD_U64(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "flows_using", CTLFLAG_RD,
+	    &rs->rs_flows_using, 0,
+	    "How many flows are using this interface now?");
+#ifdef DETAILED_RATELIMIT_SYSCTL
+	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
+		/*  Lets display the rates */
+		int i;
+		struct sysctl_oid *rl_rates;
+		struct sysctl_oid *rl_rate_num;
+		char rate_num[16];
+		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+					    SYSCTL_CHILDREN(rl_sysctl_root),
+					    OID_AUTO,
+					    "rate",
+					    CTLFLAG_RW, 0,
+					    "Ratelist");
+		for( i = 0; i < rs->rs_rate_cnt; i++) {
+			sprintf(rate_num, "%d", i);
+			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+					    SYSCTL_CHILDREN(rl_rates),
+					    OID_AUTO,
+					    rate_num,
+					    CTLFLAG_RW, 0,
+					    "Individual Rate");
+			SYSCTL_ADD_U32(&rs->sysctl_ctx,
+				       SYSCTL_CHILDREN(rl_rate_num),
+				       OID_AUTO, "flags", CTLFLAG_RD,
+				       &rs->rs_rlt[i].flags, 0,
+				       "Flags on this rate");
+			SYSCTL_ADD_U32(&rs->sysctl_ctx,
+				       SYSCTL_CHILDREN(rl_rate_num),
+				       OID_AUTO, "pacetime", CTLFLAG_RD,
+				       &rs->rs_rlt[i].time_between, 0,
+				       "Time hardware inserts between 1500 byte sends");
+			SYSCTL_ADD_U64(&rs->sysctl_ctx,
+				       SYSCTL_CHILDREN(rl_rate_num),
+				       OID_AUTO, "rate", CTLFLAG_RD,
+				       &rs->rs_rlt[i].rate, 0,
+				       "Rate in bytes per second");
+		}
+	}
+#endif
+}
+
+static void
+rs_destroy(epoch_context_t ctx)
+{
+	struct tcp_rate_set *rs;
+
+	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+	mtx_lock(&rs_mtx);
+	rs->rs_flags &= ~RS_FUNERAL_SCHD;
+	if (rs->rs_flows_using == 0) {
+		/*
+		 * In theory its possible (but unlikely)
+		 * that while the delete was occuring
+		 * and we were applying the DEAD flag
+		 * someone slipped in and found the
+		 * interface in a lookup. While we
+		 * decided rs_flows_using were 0 and
+		 * scheduling the epoch_call, the other
+		 * thread incremented rs_flow_using. This
+		 * is because users have a pointer and
+		 * we only use the rs_flows_using in an
+		 * atomic fashion, i.e. the other entities
+		 * are not protected. To assure this did
+		 * not occur, we check rs_flows_using here
+		 * before deleteing.
+		 */
+		sysctl_ctx_free(&rs->sysctl_ctx);
+		free(rs->rs_rlt, M_TCPPACE);
+		free(rs, M_TCPPACE);
+		rs_number_dead--;
+	}
+	mtx_unlock(&rs_mtx);
+
+}
+
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+    uint32_t flowtype,
+    int flowid,
+    uint64_t cfg_rate,
+    struct m_snd_tag **tag)
+{
+	int error;
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = flowid,
+		.rate_limit.hdr.flowtype = flowtype,
+		.rate_limit.max_rate = cfg_rate,
+		.rate_limit.flags = M_NOWAIT,
+	};
+
+	if (ifp->if_snd_tag_alloc == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_alloc(ifp, &params, tag);
+		if (error == 0) {
+			if_ref((*tag)->ifp);
+			counter_u64_add(rate_limit_set_ok, 1);
+			counter_u64_add(rate_limit_active, 1);
+		} else
+			counter_u64_add(rate_limit_alloc_fail, 1);
+	}
+	return (error);
+}
+
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+	/*
+	 * The internal table is "special", it
+	 * is two seperate ordered tables that
+	 * must be merged. We get here when the
+	 * adapter specifies a number of rates that
+	 * covers both ranges in the table in some
+	 * form.
+	 */
+	int i, at_low, at_high;
+	uint8_t low_disabled = 0, high_disabled = 0;
+
+	for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
+		rs->rs_rlt[i].flags = 0;
+		rs->rs_rlt[i].time_between = 0;
+		if ((low_disabled == 0) &&
+		    (high_disabled ||
+		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
+			rs->rs_rlt[i].rate = rate_table_act[at_low];
+			at_low++;
+			if (at_low == RS_NEXT_ORDER_GROUP)
+				low_disabled = 1;
+		} else if (high_disabled == 0) {
+			rs->rs_rlt[i].rate = rate_table_act[at_high];
+			at_high++;
+			if (at_high == MAX_HDWR_RATES)
+				high_disabled = 1;
+		}
+	}
+}
+
+static struct tcp_rate_set *
+rt_setup_new_rs(struct ifnet *ifp, int *error)
+{
+	struct tcp_rate_set *rs;
+	const uint64_t *rate_table_act;
+	uint64_t lentim, res;
+	size_t sz;
+	uint32_t hash_type;
+	int i;
+	struct if_ratelimit_query_results rl;
+	struct sysctl_oid *rl_sysctl_root;
+	/*
+	 * We expect to enter with the 
+	 * mutex locked.
+	 */
+
+	if (ifp->if_ratelimit_query == NULL) {
+		/*
+		 * We can do nothing if we cannot
+		 * get a query back from the driver.
+		 */
+		return (NULL);
+	}
+	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
+	if (rs == NULL) {
+		if (error)
+			*error = ENOMEM;
+		return (NULL);
+	}
+	rl.flags = RT_NOSUPPORT;
+	ifp->if_ratelimit_query(ifp, &rl);
+	if (rl.flags & RT_IS_UNUSABLE) {
+		/* 
+		 * The interface does not really support 
+		 * the rate-limiting.
+		 */
+		memset(rs, 0, sizeof(struct tcp_rate_set));
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_flags = RS_INTF_NO_SUP;
+		rs->rs_disable = 1;
+		rs_number_alive++;
+		sysctl_ctx_init(&rs->sysctl_ctx);
+		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+		    OID_AUTO,
+		    rs->rs_ifp->if_xname,
+		    CTLFLAG_RW, 0,
+		    "");
+		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+		/* Unlock to allow the sysctl stuff to allocate */
+		mtx_unlock(&rs_mtx);
+		rl_add_syctl_entries(rl_sysctl_root, rs);
+		/* re-lock for our caller */
+		mtx_lock(&rs_mtx);
+		return (rs);
+	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
+		memset(rs, 0, sizeof(struct tcp_rate_set));
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_flags = RS_IS_DEFF;
+		rs_number_alive++;
+		sysctl_ctx_init(&rs->sysctl_ctx);
+		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+		    OID_AUTO,
+		    rs->rs_ifp->if_xname,
+		    CTLFLAG_RW, 0,
+		    "");
+		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+		/* Unlock to allow the sysctl stuff to allocate */
+		mtx_unlock(&rs_mtx);
+		rl_add_syctl_entries(rl_sysctl_root, rs);
+		/* re-lock for our caller */
+		mtx_lock(&rs_mtx);
+		return (rs);
+	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
+		/* Mellanox most likely */
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_rate_cnt = rl.number_of_rates;
+		rs->rs_min_seg = rl.min_segment_burst;
+		rs->rs_highest_valid = 0;
+		rs->rs_flow_limit = rl.max_flows;
+		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
+		rs->rs_disable = 0;
+		rate_table_act = rl.rate_table;
+	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
+		/* Chelsio */
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_rate_cnt = rl.number_of_rates;
+		rs->rs_min_seg = rl.min_segment_burst;
+		rs->rs_disable = 0;
+		rs->rs_flow_limit = rl.max_flows;
+		rate_table_act = desired_rates;
+		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
+		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
+			/*
+			 * Our desired table is not big
+			 * enough, do what we can.
+			 */
+			rs->rs_rate_cnt = MAX_HDWR_RATES;
+		 }
+		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
+			rs->rs_flags = RS_IS_INTF;
+		else
+			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
+		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
+			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
+	} else {
+		printf("Interface:%s unit:%d not one known to have rate-limits\n",
+		    ifp->if_dname,
+		    ifp->if_dunit);
+		free(rs, M_TCPPACE);
+		return (NULL);
+	}
+	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
+	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
+	if (rs->rs_rlt == NULL) {
+		if (error)
+			*error = ENOMEM;
+bail:
+		free(rs, M_TCPPACE);
+		return (NULL);
+	}
+	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
+		/*
+		 * The interface supports all
+		 * the rates we could possibly want.
+		 */
+		uint64_t rat;
+
+		rs->rs_rlt[0].rate = 12500;	/* 100k */
+		rs->rs_rlt[1].rate = 25000;	/* 200k */
+		rs->rs_rlt[2].rate = 62500;	/* 500k */
+		/* Note 125000 == 1Megabit
+		 * populate 1Meg - 1000meg.
+		 */
+		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
+			rs->rs_rlt[i].rate = rat;
+			rat += 125000;
+		}
+		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
+	} else if (rs->rs_flags & RS_INT_TBL) {
+		/* We populate this in a special way */
+		populate_canned_table(rs, rate_table_act);
+	} else {
+		/*
+		 * Just copy in the rates from
+		 * the table, it is in order.
+		 */
+		for (i=0; i<rs->rs_rate_cnt; i++) {
+			rs->rs_rlt[i].rate = rate_table_act[i];
+			rs->rs_rlt[i].time_between = 0;
+			rs->rs_rlt[i].flags = 0;
+		}
+	}
+	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
+		/*
+		 * We go backwards through the list so that if we can't get
+		 * a rate and fail to init one, we have at least a chance of
+		 * getting the highest one.
+		 */
+		rs->rs_rlt[i].ptbl = rs;
+		rs->rs_rlt[i].tag = NULL;
+		/*
+		 * Calculate the time between.
+		 */
+		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
+		res = lentim / rs->rs_rlt[i].rate;
+		if (res > 0)
+			rs->rs_rlt[i].time_between = res;
+		else
+			rs->rs_rlt[i].time_between = 1;
+		if (rs->rs_flags & RS_NO_PRE) {
+			rs->rs_rlt[i].flags = HDWRPACE_INITED;
+			rs->rs_lowest_valid = i;
+		} else {
+			int err;
+#ifdef RSS
+			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
+#else
+			hash_type = M_HASHTYPE_OPAQUE_HASH;
+#endif
+			err = rl_attach_txrtlmt(ifp,
+			    hash_type,
+			    (i + 1),
+			    rs->rs_rlt[i].rate,
+			    &rs->rs_rlt[i].tag);
+			if (err) {
+				if (i == (rs->rs_rate_cnt - 1)) {
+					/*
+					 * Huh - first rate and we can't get
+					 * it?
+					 */
+					free(rs->rs_rlt, M_TCPPACE);
+					if (error)
+						*error = err;
+					goto bail;
+				} else {
+					if (error)
+						*error = err;
+				}
+				break;
+			} else {
+				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
+				rs->rs_lowest_valid = i;
+			}
+		}
+	}
+	/* Did we get at least 1 rate? */
+	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
+		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
+	else {
+		free(rs->rs_rlt, M_TCPPACE);
+		goto bail;
+	}
+	rs_number_alive++;
+	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+	sysctl_ctx_init(&rs->sysctl_ctx);
+	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+	    OID_AUTO,
+	    rs->rs_ifp->if_xname,
+	    CTLFLAG_RW, 0,
+	    "");
+	/* Unlock to allow the sysctl stuff to allocate */
+	mtx_unlock(&rs_mtx);
+	rl_add_syctl_entries(rl_sysctl_root, rs);
+	/* re-lock for our caller */
+	mtx_lock(&rs_mtx);
+	return (rs);
+}
+
+static const struct tcp_hwrate_limit_table *
+tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
+    uint64_t bytes_per_sec, uint32_t flags)
+{
+	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
+	uint64_t mbits_per_sec, ind_calc;
+	int i;
+
+	mbits_per_sec = (bytes_per_sec * 8);
+	if (flags & RS_PACING_LT) {
+		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
+		    (rs->rs_lowest_valid <= 2)){
+			/*
+			 * Smaller than 1Meg, only
+			 * 3 entries can match it.
+			 */
+			for(i = rs->rs_lowest_valid; i < 3; i++) {
+				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
+					rte = &rs->rs_rlt[i];
+					break;
+				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
+					arte = &rs->rs_rlt[i];
+				}
+			}
+			goto done;
+		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
+			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
+			/*
+			 * Larger than 1G (the majority of
+			 * our table.
+			 */
+			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			else
+				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			goto done;
+		}
+		/*
+		 * If we reach here its in our table (between 1Meg - 1000Meg),
+		 * just take the rounded down mbits per second, and add
+		 * 1Megabit to it, from this we can calculate
+		 * the index in the table.
+		 */
+		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
+		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
+			ind_calc++;
+		/* our table is offset by 3, we add 2 */
+		ind_calc += 2;
+		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+			/* This should not happen */
+			ind_calc = ALL_HARDWARE_RATES-1;
+		}
+		if ((ind_calc >= rs->rs_lowest_valid) &&
+		    (ind_calc <= rs->rs_highest_valid))
+		rte = &rs->rs_rlt[ind_calc];
+	} else if (flags & RS_PACING_EXACT_MATCH) {
+		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
+		    (rs->rs_lowest_valid <= 2)){
+			for(i = rs->rs_lowest_valid; i < 3; i++) {
+				if (bytes_per_sec == rs->rs_rlt[i].rate) {
+					rte = &rs->rs_rlt[i];
+					break;
+				}
+			}
+		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
+			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
+			/* > 1Gbps only one rate */
+			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
+				/* Its 10G wow */
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			}
+		} else {
+			/* Ok it must be a exact meg (its between 1G and 1Meg) */
+			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
+			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
+				/* its an exact Mbps */
+				ind_calc += 2;
+				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+					/* This should not happen */
+					ind_calc = ALL_HARDWARE_RATES-1;
+				}
+				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
+					rte = &rs->rs_rlt[ind_calc];
+			}
+		}
+	} else {
+		/* we want greater than the requested rate */
+		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
+		    (rs->rs_lowest_valid <= 2)){
+			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
+			for (i=2; i>=rs->rs_lowest_valid; i--) {
+				if (bytes_per_sec < rs->rs_rlt[i].rate) {
+					rte = &rs->rs_rlt[i];
+					break;
+				} else if ((flags & RS_PACING_GEQ) &&
+					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
+					rte = &rs->rs_rlt[i];
+					break;
+				} else {
+					arte = &rs->rs_rlt[i]; /* new alternate */
+				}
+			}
+		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
+			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
+			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
+				/* Our top rate is larger than the request */
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			} else if ((flags & RS_PACING_GEQ) &&
+				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
+				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
+				/* It matches our top rate */
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
+				/* The top rate is an alternative */
+				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			}
+		} else {
+			/* Its in our range 1Meg - 1Gig */
+			if (flags & RS_PACING_GEQ) {
+				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
+				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
+					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+						/* This should not happen */
+						ind_calc = (ALL_HARDWARE_RATES-1);
+					}
+					rte = &rs->rs_rlt[ind_calc];
+				}
+				goto done;
+			}
+			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
+			ind_calc += 2;
+			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+				/* This should not happen */
+				ind_calc = ALL_HARDWARE_RATES-1;
+			}
+			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
+				rte = &rs->rs_rlt[ind_calc];
+		}
+	}
+done:
+	if ((rte == NULL) &&
+	    (arte != NULL) &&
+	    (flags & RS_PACING_SUB_OK)) {
+		/* We can use the substitute */
+		rte = arte;
+	}
+	return (rte);
+}
+
+static const struct tcp_hwrate_limit_table *
+tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
+{
+	/**
+	 * Hunt the rate table with the restrictions in flags and find a
+	 * suitable rate if possible.
+	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
+	 * RS_PACING_GT     - must be greater than.
+	 * RS_PACING_GEQ    - must be greater than or equal.
+	 * RS_PACING_LT     - must be less than.
+	 * RS_PACING_SUB_OK - If we don't meet criteria a
+	 *                    substitute is ok.
+	 */
+	int i, matched;
+	struct tcp_hwrate_limit_table *rte = NULL;
+
+
+	if ((rs->rs_flags & RS_INT_TBL) &&
+	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
+		/*
+		 * Here we don't want to paw thru
+		 * a big table, we have everything
+		 * from 1Meg - 1000Meg in 1Meg increments.
+		 * Use an alternate method to "lookup".
+		 */
+		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
+	}
+	if ((flags & RS_PACING_LT) ||
+	    (flags & RS_PACING_EXACT_MATCH)) {
+		/*
+		 * For exact and less than we go forward through the table.
+		 * This way when we find one larger we stop (exact was a
+		 * toss up).
+		 */
+		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
+			if ((flags & RS_PACING_EXACT_MATCH) &&
+			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
+				rte = &rs->rs_rlt[i];
+				matched = 1;
+				break;
+			} else if ((flags & RS_PACING_LT) &&
+			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
+				rte = &rs->rs_rlt[i];
+				matched = 1;
+				break;
+			}
+			if (bytes_per_sec > rs->rs_rlt[i].rate)
+				break;
+		}
+		if ((matched == 0) &&
+		    (flags & RS_PACING_LT) &&
+		    (flags & RS_PACING_SUB_OK)) {
+			/* Kick in a substitute (the lowest) */
+			rte = &rs->rs_rlt[rs->rs_lowest_valid];
+		}
+	} else {
+		/*
+		 * Here we go backward through the table so that we can find
+		 * the one greater in theory faster (but its probably a
+		 * wash).
+		 */
+		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
+			if (rs->rs_rlt[i].rate > bytes_per_sec) {
+				/* A possible candidate */
+				rte = &rs->rs_rlt[i];
+			}
+			if ((flags & RS_PACING_GEQ) &&
+			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
+				/* An exact match and we want equal */
+				matched = 1;
+				rte = &rs->rs_rlt[i];
+				break;
+			} else if (rte) {
+				/*
+				 * Found one that is larger than but don't
+				 * stop, there may be a more closer match.
+				 */
+				matched = 1;
+			}
+			if (rs->rs_rlt[i].rate < bytes_per_sec) {
+				/*
+				 * We found a table entry that is smaller,
+				 * stop there will be none greater or equal.
+				 */
+				break;
+			}
+		}
+		if ((matched == 0) &&
+		    (flags & RS_PACING_SUB_OK)) {
+			/* Kick in a substitute (the highest) */
+			rte = &rs->rs_rlt[rs->rs_highest_valid];
+		}
+	}
+	return (rte);
+}
+
+static struct ifnet *
+rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
+{
+	struct ifnet *tifp;
+	struct m_snd_tag *tag;
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = 1,
+		.rate_limit.max_rate = COMMON_RATE,
+		.rate_limit.flags = M_NOWAIT,
+	};
+	int err;
+#ifdef RSS
+	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
+	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
+#else
+	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
+#endif
+	tag = NULL;
+	if (ifp->if_snd_tag_alloc) {
+		if (error)
+			*error = ENODEV;
+		return (NULL);
+	}
+	err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
+	if (err) {
+		/* Failed to setup a tag? */
+		if (error)
+			*error = err;
+		return (NULL);
+	}
+	tifp = tag->ifp;
+	tifp->if_snd_tag_free(tag);
+	return (tifp);
+}
+
+static const struct tcp_hwrate_limit_table *
+rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
+    uint32_t flags, int *error)
+{
+	/* First lets find the interface if it exists */
+	const struct tcp_hwrate_limit_table *rte;
+	struct tcp_rate_set *rs;
+	struct epoch_tracker et;
+	int err;
+
+	epoch_enter_preempt(net_epoch_preempt, &et);
+use_real_interface:
+	CK_LIST_FOREACH(rs, &int_rs, next) {
+		/*
+		 * Note we don't look with the lock since we either see a
+		 * new entry or will get one when we try to add it.
+		 */
+		if (rs->rs_flags & RS_IS_DEAD) {
+			/* The dead are not looked at */
+			continue;
+		}
+		if ((rs->rs_ifp == ifp) &&
+		    (rs->rs_if_dunit == ifp->if_dunit)) {
+			/* Ok we found it */
+			break;
+		}
+	}
+	if ((rs == NULL) ||
+	    (rs->rs_flags & RS_INTF_NO_SUP) ||
+	    (rs->rs_flags & RS_IS_DEAD)) {
+		/*
+		 * This means we got a packet *before*
+		 * the IF-UP was processed below, <or>
+		 * while or after we already received an interface
+		 * departed event. In either case we really don't
+		 * want to do anything with pacing, in
+		 * the departing case the packet is not
+		 * going to go very far. The new case
+		 * might be arguable, but its impossible
+		 * to tell from the departing case.
+		 */
+		if (rs->rs_disable && error)
+			*error = ENODEV;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+
+	if ((rs == NULL) || (rs->rs_disable != 0)) {
+		if (rs->rs_disable && error)
+			*error = ENOSPC;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+	if (rs->rs_flags & RS_IS_DEFF) {
+		/* We need to find the real interface */
+		struct ifnet *tifp;
+
+		tifp = rt_find_real_interface(ifp, inp, error);
+		if (tifp == NULL) {
+			if (rs->rs_disable && error)
+				*error = ENOTSUP;
+			epoch_exit_preempt(net_epoch_preempt, &et);
+			return (NULL);
+		}
+		goto use_real_interface;
+	}
+	if (rs->rs_flow_limit &&
+	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
+		if (error)
+			*error = ENOSPC;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
+	if (rte) {
+		err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
+		    inp->inp_flowtype,
+		    inp->inp_flowid,
+		    rte->rate,
+		    &inp->inp_snd_tag);
+		if (err) {
+			/* Failed to attach */
+			if (error)
+				*error = err;
+			rte = NULL;
+		}
+	}
+	if (rte) {
+		/*
+		 * We use an atomic here for accounting so we don't have to
+		 * use locks when freeing.
+		 */
+		atomic_add_long(&rs->rs_flows_using, 1);
+	}
+	epoch_exit_preempt(net_epoch_preempt, &et);
+	return (rte);
+}
+
+static void
+tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
+{
+	int error;
+	struct tcp_rate_set *rs;
+
+	if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
+	    (link_state != LINK_STATE_UP)) {
+		/*
+		 * We only care on an interface going up that is rate-limit
+		 * capable.
+		 */
+		return;
+	}
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH(rs, &int_rs, next) {
+		if ((rs->rs_ifp == ifp) &&
+		    (rs->rs_if_dunit == ifp->if_dunit)) {
+			/* We already have initialized this guy */
+			mtx_unlock(&rs_mtx);
+			return;
+		}
+	}
+	rt_setup_new_rs(ifp, &error);
+	mtx_unlock(&rs_mtx);
+}
+
+static void
+tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
+{
+	struct tcp_rate_set *rs, *nrs;
+	struct ifnet *tifp;
+	int i;
+
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
+		if ((rs->rs_ifp == ifp) &&
+		    (rs->rs_if_dunit == ifp->if_dunit)) {
+			CK_LIST_REMOVE(rs, next);
+			rs_number_alive--;
+			rs_number_dead++;
+			rs->rs_flags |= RS_IS_DEAD;
+			for (i = 0; i < rs->rs_rate_cnt; i++) {
+				if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
+					tifp = rs->rs_rlt[i].tag->ifp;
+					in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
+					rs->rs_rlt[i].tag = NULL;
+				}
+				rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
+			}
+			if (rs->rs_flows_using == 0) {
+				/*
+				 * No references left, so we can schedule the
+				 * destruction after the epoch (with a caveat).
+				 */
+				rs->rs_flags |= RS_FUNERAL_SCHD;
+				epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
+			}
+			break;
+		}
+	}
+	mtx_unlock(&rs_mtx);
+}
+
+static void
+tcp_rl_shutdown(void *arg __unused, int howto __unused)
+{
+	struct tcp_rate_set *rs, *nrs;
+	struct ifnet *tifp;
+	int i;
+
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
+		CK_LIST_REMOVE(rs, next);
+		rs_number_alive--;
+		rs_number_dead++;
+		rs->rs_flags |= RS_IS_DEAD;
+		for (i = 0; i < rs->rs_rate_cnt; i++) {
+			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
+				tifp = rs->rs_rlt[i].tag->ifp;
+				in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
+				rs->rs_rlt[i].tag = NULL;
+			}
+			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
+		}
+		if (rs->rs_flows_using != 0) {
+			/*
+			 * We dont hold a reference
+			 * so we have nothing left to
+			 * do.
+			 */
+		} else {
+			/*
+			 * No references left, so we can destroy it
+			 * after the epoch.
+			 */
+			rs->rs_flags |= RS_FUNERAL_SCHD;
+			epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
+		}
+	}
+	mtx_unlock(&rs_mtx);
+}
+
+const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error)
+{
+	const struct tcp_hwrate_limit_table *rte;
+
+	if (tp->t_inpcb->inp_snd_tag == NULL) {
+		/*
+		 * We are setting up a rate for the first time.
+		 */
+		if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
+			/* Not supported by the egress */
+			if (error)
+				*error = ENODEV;
+			return (NULL);
+		}
+#ifdef KERN_TLS
+		if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
+			/*
+			 * We currently can't do both TLS and hardware
+			 * pacing
+			 */
+			if (error)
+				*error = EINVAL;
+			return (NULL);
+		}
+#endif
+		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
+	} else {
+		/*
+		 * We are modifying a rate, wrong interface?
+		 */
+		if (error)
+			*error = EINVAL;
+		rte = NULL;
+	}
+	return (rte);
+}
+
+const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+    struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error)
+{
+	const struct tcp_hwrate_limit_table *nrte;
+	const struct tcp_rate_set *rs;
+	int is_indirect = 0;
+	int err;
+
+
+	if ((tp->t_inpcb->inp_snd_tag == NULL) ||
+	    (crte == NULL)) {
+		/* Wrong interface */
+		if (error)
+			*error = EINVAL;
+		return (NULL);
+	}
+	rs = crte->ptbl;
+	if ((rs->rs_flags & RS_IS_DEAD) ||
+	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
+		/* Release the rate, and try anew */
+re_rate:
+		tcp_rel_pacing_rate(crte, tp);
+		nrte = tcp_set_pacing_rate(tp, ifp,
+		    bytes_per_sec, flags, error);
+		return (nrte);
+	}
+	if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
+		is_indirect = 1;
+	else
+		is_indirect = 0;
+	if ((is_indirect == 0) &&
+	    ((ifp != rs->rs_ifp) ||
+	    (ifp->if_dunit != rs->rs_if_dunit))) {
+		/*
+		 * Something changed, the user is not pointing to the same
+		 * ifp? Maybe a route updated on this guy?
+		 */
+		goto re_rate;
+	} else if (is_indirect) {
+		/*
+		 * For indirect we have to dig in and find the real interface.
+		 */
+		struct ifnet *rifp;
+
+		rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
+		if (rifp == NULL) {
+			/* Can't find it? */
+			goto re_rate;
+		}
+		if ((rifp != rs->rs_ifp) ||
+		    (ifp->if_dunit != rs->rs_if_dunit)) {
+			goto re_rate;
+		}
+	}
+	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
+	if (nrte == crte) {
+		/* No change */
+		if (error)
+			*error = 0;
+		return (crte);
+	}
+	if (nrte == NULL) {
+		/* Release the old rate */
+		tcp_rel_pacing_rate(crte, tp);
+		return (NULL);
+	}
+	/* Change rates to our new entry */
+	err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
+	if (err) {
+		if (error)
+			*error = err;
+		return (NULL);
+	}
+	if (error)
+		*error = 0;
+	return (nrte);
+}
+
+void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
+{
+	const struct tcp_rate_set *crs;
+	struct tcp_rate_set *rs;
+	uint64_t pre;
+
+	crs = crte->ptbl;
+	/*
+	 * Now we must break the const
+	 * in order to release our refcount.
+	 */
+	rs = __DECONST(struct tcp_rate_set *, crs);
+	pre = atomic_fetchadd_long(&rs->rs_flows_using, -1);
+	if (pre == 1) {
+		mtx_lock(&rs_mtx);
+		/*
+		 * Is it dead?
+		 */
+		if ((rs->rs_flags & RS_IS_DEAD) &&
+		    ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){
+			/*
+			 * We were the last,
+			 * and a funeral is not pending, so
+			 * we must schedule it.
+			 */
+			rs->rs_flags |= RS_FUNERAL_SCHD;
+			epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
+		}
+		mtx_unlock(&rs_mtx);
+	}
+	in_pcbdetach_txrtlmt(tp->t_inpcb);
+}
+
+static eventhandler_tag rl_ifnet_departs;
+static eventhandler_tag rl_ifnet_arrives;
+static eventhandler_tag rl_shutdown_start;
+
+static void
+tcp_rs_init(void *st __unused)
+{
+	CK_LIST_INIT(&int_rs);
+	rs_number_alive = 0;
+	rs_number_dead = 0;;
+	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
+	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
+	    tcp_rl_ifnet_departure,
+	    NULL, EVENTHANDLER_PRI_ANY);
+	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
+	    tcp_rl_ifnet_link,
+	    NULL, EVENTHANDLER_PRI_ANY);
+	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
+	    tcp_rl_shutdown, NULL,
+	    SHUTDOWN_PRI_FIRST);
+	printf("TCP_ratelimit: Is now initialized\n");
+}
+
+SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
+#endif
diff --git a/sys/netinet/tcp_ratelimit.h b/sys/netinet/tcp_ratelimit.h
new file mode 100644
index 000000000000..49e407c03928
--- /dev/null
+++ b/sys/netinet/tcp_ratelimit.h
@@ -0,0 +1,141 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ *	Netflix Inc.
+ *      All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * __FBSDID("$FreeBSD$");
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+#ifndef __tcp_ratelimit_h__
+#define __tcp_ratelimit_h__
+
+struct m_snd_tag;
+
+/* Flags on an individual rate */
+#define HDWRPACE_INITED 	0x0001
+#define HDWRPACE_TAGPRESENT	0x0002
+#define HDWRPACE_IFPDEPARTED	0x0004
+struct tcp_hwrate_limit_table {
+	const struct tcp_rate_set *ptbl;	/* Pointer to parent table */
+	struct m_snd_tag *tag;	/* Send tag if needed (chelsio) */
+	uint64_t rate;		/* Rate we get in Bytes per second (Bps) */
+	uint32_t time_between;	/* Time-Gap between packets at this rate */
+	uint32_t flags;
+};
+
+/* Rateset flags */
+#define RS_IS_DEFF      0x0001	/* Its a lagg, do a double lookup */
+#define RS_IS_INTF      0x0002	/* Its a plain interface */
+#define RS_NO_PRE       0x0004	/* The interfacd has set rates */
+#define RS_INT_TBL      0x0010	/*
+				 * The table is the internal version
+				 * which has special setup requirements.
+				 */
+#define RS_IS_DEAD      0x0020	/* The RS is dead list */
+#define RS_FUNERAL_SCHD 0x0040  /* Is a epoch call scheduled to bury this guy?*/
+#define RS_INTF_NO_SUP  0x0100 	/* The interface does not support the ratelimiting */
+
+struct tcp_rate_set {
+	struct sysctl_ctx_list sysctl_ctx;
+	CK_LIST_ENTRY(tcp_rate_set) next;
+	struct ifnet *rs_ifp;
+	struct tcp_hwrate_limit_table *rs_rlt;
+	uint64_t rs_flows_using;
+	uint64_t rs_flow_limit;
+	uint32_t rs_if_dunit;
+	int rs_rate_cnt;
+	int rs_min_seg;
+	int rs_highest_valid;
+	int rs_lowest_valid;
+	int rs_disable;
+	int rs_flags;
+	struct epoch_context rs_epoch_ctx;
+};
+
+CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
+
+/* Request flags */
+#define RS_PACING_EXACT_MATCH	0x0001	/* Need an exact match for rate */
+#define RS_PACING_GT		0x0002	/* Greater than requested */
+#define RS_PACING_GEQ		0x0004	/* Greater than or equal too */
+#define RS_PACING_LT		0x0008	/* Less than requested rate */
+#define RS_PACING_SUB_OK	0x0010	/* If a rate can't be found get the
+					 * next best rate (highest or lowest). */
+#ifdef RATELIMIT
+#ifdef _KERNEL
+#define DETAILED_RATELIMIT_SYSCTL 1	/*
+					 * Undefine this if you don't want
+					 * detailed rates to appear in
+					 * net.inet.tcp.rl.
+					 * With the defintion each rate
+					 * shows up in your sysctl tree
+					 * this can be big.
+					 */
+
+const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error);
+
+const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+    struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error);
+void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+    struct tcpcb *tp);
+#else
+static inline const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error)
+{
+	if (error)
+		*error = EOPNOTSUPP;
+	return (NULL);
+}
+
+static inline const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+    struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error)
+{
+	if (error)
+		*error = EOPNOTSUPP;
+	return (NULL);
+}
+
+static inline void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+    struct tcpcb *tp)
+{
+	return;
+}
+
+#endif
+#endif
+#endif