Update rack to the latest code used at Netflix (NF).

There have been many changes to rack over the last couple of years, including:
     a) The ability, when switching stacks, to have one stack query another.
     b) Internal use of microsecond timers instead of ticks.
     c) Many changes to pacing, in the form of:
        1) Improvements to Dynamic Goodput Pacing (DGP)
        2) Improvements to fixed rate pacing
        3) A new feature called hybrid pacing, where the requestor can
           get a combination of DGP and fixed rate pacing with deadlines
           for delivery that can dynamically speed things up (a sketch
           of the deadline idea follows this list).
     d) Fixes for all kinds of bugs found during extensive testing and
        use of the rack stack for streaming video and, in fact, all data
        transferred by NF.
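
As a rough illustration of the deadline idea in item c.3 (a sketch only, not
code from this commit; the names below are hypothetical), the sender can be
thought of as pacing at whichever is larger: the rate DGP estimates, or the
rate needed to deliver the remaining bytes before the deadline. The real
stack tracks this per request rather than through a single helper like this.

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical sketch of deadline-driven ("hybrid") pacing: pace at the
 * larger of the DGP-estimated rate and the rate required to finish the
 * remaining bytes before the deadline. Standalone, not rack kernel code.
 */
static uint64_t
hybrid_pace_rate(uint64_t dgp_bps, uint64_t bytes_left, uint64_t usecs_left)
{
	uint64_t need_bps;

	if (usecs_left == 0)
		return (UINT64_MAX);	/* at or past the deadline already */
	need_bps = (bytes_left * 1000000) / usecs_left;	/* bytes per second */
	return (need_bps > dgp_bps ? need_bps : dgp_bps);
}

int
main(void)
{
	/* 4 MB still to send, 500 ms to the deadline, DGP estimates 6 MB/s. */
	printf("pace at %ju bytes/sec\n",
	    (uintmax_t)hybrid_pace_rate(6 * 1024 * 1024, 4 * 1024 * 1024, 500000));
	return (0);
}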

Reviewed by: glebius, gallatin, tuexen
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D39402
Randall Stewart 2023-04-04 16:05:46 -04:00
parent 2ff8187efd
commit 030434acaf
8 changed files with 5231 additions and 1428 deletions


@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c
SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h

File diff suppressed because it is too large.


@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
@@ -51,9 +50,6 @@ __FBSDID("$FreeBSD$");
#include <sys/qmath.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tree.h>
@@ -130,36 +126,6 @@ __FBSDID("$FreeBSD$");
* Common TCP Functions - These are shared by both
* rack and BBR.
*/
#ifdef KERN_TLS
uint32_t
ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
{
struct ktls_session *tls;
uint32_t len;
again:
tls = so->so_snd.sb_tls_info;
len = tls->params.max_frame_len; /* max tls payload */
len += tls->params.tls_hlen; /* tls header len */
len += tls->params.tls_tlen; /* tls trailer len */
if ((len * 4) > rwnd) {
/*
* Stroke this will suck counter and what
* else should we do Drew? From the
* TCP perspective I am not sure
* what should be done...
*/
if (tls->params.max_frame_len > 4096) {
tls->params.max_frame_len -= 4096;
if (tls->params.max_frame_len < 4096)
tls->params.max_frame_len = 4096;
goto again;
}
}
return (len);
}
#endif
static int
ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
{


@@ -87,9 +87,6 @@
#ifdef _KERNEL
/* We have only 7 bits in rack so assert its true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
#ifdef KERN_TLS
uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
#endif
int
ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
struct mbuf *m, int has_pkt);


@@ -0,0 +1,344 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
#define TCPSTATES /* for logging */
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>
#include <netipsec/ipsec_support.h>
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
struct rack_sendmap *
tqhash_min(struct tailq_hash *hs)
{
struct rack_sendmap *rsm;
rsm = tqhash_find(hs, hs->min);
return(rsm);
}
struct rack_sendmap *
tqhash_max(struct tailq_hash *hs)
{
struct rack_sendmap *rsm;
rsm = tqhash_find(hs, (hs->max - 1));
return (rsm);
}
int
tqhash_empty(struct tailq_hash *hs)
{
if (hs->count == 0)
return(1);
return(0);
}
struct rack_sendmap *
tqhash_find(struct tailq_hash *hs, uint32_t seq)
{
struct rack_sendmap *e;
int bindex, pbucket, fc = 1;
if ((SEQ_LT(seq, hs->min)) ||
(hs->count == 0) ||
(SEQ_GEQ(seq, hs->max))) {
/* Not here */
return (NULL);
}
bindex = seq / SEQ_BUCKET_SIZE;
bindex %= MAX_HASH_ENTRIES;
/* Lets look through the bucket it belongs to */
if (TAILQ_EMPTY(&hs->ht[bindex])) {
goto look_backwards;
}
TAILQ_FOREACH(e, &hs->ht[bindex], next) {
if (fc == 1) {
/*
* Special check for when a cum-ack
* has moved up over a seq and now it's
* a bucket behind where it belongs. In
* the case of SACKs which create new rsm's
* this won't occur.
*/
if (SEQ_GT(e->r_start, seq)) {
goto look_backwards;
}
fc = 0;
}
if (SEQ_GEQ(seq, e->r_start) &&
(SEQ_LT(seq, e->r_end))) {
/* Its in this block */
return (e);
}
}
/* Did not find it */
return (NULL);
look_backwards:
if (bindex == 0)
pbucket = MAX_HASH_ENTRIES - 1;
else
pbucket = bindex - 1;
TAILQ_FOREACH_REVERSE(e, &hs->ht[pbucket], rack_head, next) {
if (SEQ_GEQ(seq, e->r_start) &&
(SEQ_LT(seq, e->r_end))) {
/* Its in this block */
return (e);
}
if (SEQ_GEQ(e->r_end, seq))
break;
}
return (NULL);
}
struct rack_sendmap *
tqhash_next(struct tailq_hash *hs, struct rack_sendmap *rsm)
{
struct rack_sendmap *e;
e = TAILQ_NEXT(rsm, next);
if (e == NULL) {
/* Move to next bucket */
int nxt;
nxt = rsm->bindex + 1;
if (nxt >= MAX_HASH_ENTRIES)
nxt = 0;
e = TAILQ_FIRST(&hs->ht[nxt]);
}
return(e);
}
struct rack_sendmap *
tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm)
{
struct rack_sendmap *e;
e = TAILQ_PREV(rsm, rack_head, next);
if (e == NULL) {
int prev;
if (rsm->bindex > 0)
prev = rsm->bindex - 1;
else
prev = MAX_HASH_ENTRIES - 1;
e = TAILQ_LAST(&hs->ht[prev], rack_head);
}
return (e);
}
void
tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type)
{
TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
hs->count--;
if (hs->count == 0) {
hs->min = hs->max;
} else if (type == REMOVE_TYPE_CUMACK) {
hs->min = rsm->r_end;
}
}
int
tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm)
{
struct rack_sendmap *e, *l;
int inserted = 0;
uint32_t ebucket;
if (hs->count > 0) {
if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) {
return (-1);
}
e = tqhash_find(hs, rsm->r_start);
if (e) {
return (-2);
}
}
rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE;
rsm->bindex %= MAX_HASH_ENTRIES;
ebucket = rsm->r_end / SEQ_BUCKET_SIZE;
ebucket %= MAX_HASH_ENTRIES;
if (ebucket != rsm->bindex) {
/* This RSM straddles the bucket boundary */
rsm->r_flags |= RACK_STRADDLE;
} else {
rsm->r_flags &= ~RACK_STRADDLE;
}
if (hs->count == 0) {
/* Special case */
hs->min = rsm->r_start;
hs->max = rsm->r_end;
hs->count = 1;
} else {
hs->count++;
if (SEQ_GT(rsm->r_end, hs->max))
hs->max = rsm->r_end;
if (SEQ_LT(rsm->r_start, hs->min))
hs->min = rsm->r_start;
}
/* Check the common case of inserting at the end */
l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head);
if ((l == NULL) || (SEQ_GT(rsm->r_start, l->r_start))) {
TAILQ_INSERT_TAIL(&hs->ht[rsm->bindex], rsm, next);
return (0);
}
TAILQ_FOREACH(e, &hs->ht[rsm->bindex], next) {
if (SEQ_LEQ(rsm->r_start, e->r_start)) {
inserted = 1;
TAILQ_INSERT_BEFORE(e, rsm, next);
break;
}
}
if (inserted == 0) {
TAILQ_INSERT_TAIL(&hs->ht[rsm->bindex], rsm, next);
}
return (0);
}
void
tqhash_init(struct tailq_hash *hs)
{
int i;
for(i = 0; i < MAX_HASH_ENTRIES; i++) {
TAILQ_INIT(&hs->ht[i]);
}
hs->min = hs->max = 0;
hs->count = 0;
}
int
tqhash_trim(struct tailq_hash *hs, uint32_t th_ack)
{
struct rack_sendmap *rsm;
if (SEQ_LT(th_ack, hs->min)) {
/* It can't be behind our current min */
return (-1);
}
if (SEQ_GEQ(th_ack, hs->max)) {
/* It can't be beyond or at our current max */
return (-2);
}
rsm = tqhash_min(hs);
if (rsm == NULL) {
/* nothing to trim */
return (-3);
}
if (SEQ_GEQ(th_ack, rsm->r_end)) {
/*
* You can't trim all of the bytes; instead
* you need to remove it.
*/
return (-4);
}
if (SEQ_GT(th_ack, hs->min))
hs->min = th_ack;
/*
* Should we trim it for the caller?
* They may have already, which is ok...
*/
if (SEQ_GT(th_ack, rsm->r_start)) {
rsm->r_start = th_ack;
}
return (0);
}
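
A standalone sketch (userspace, simplified, not part of the commit) of what
tqhash_trim() above does when a cumulative ACK lands inside the lowest block:
the block keeps its end, but its start is advanced to th_ack. Plain compares
stand in for the wraparound-safe SEQ_* macros used by the kernel code.

#include <stdint.h>
#include <stdio.h>

/* Simplified model of a sendmap block: it covers [r_start, r_end). */
struct blk {
	uint32_t r_start;
	uint32_t r_end;
};

/* Mirrors the tqhash_trim() idea, ignoring sequence-number wraparound. */
static int
trim_block(struct blk *b, uint32_t th_ack)
{
	if (th_ack >= b->r_end)
		return (-4);		/* fully acked: remove it, don't trim */
	if (th_ack > b->r_start)
		b->r_start = th_ack;	/* partially acked: advance the start */
	return (0);
}

int
main(void)
{
	struct blk b = { 1000, 2448 };	/* one 1448-byte send */

	trim_block(&b, 1724);		/* cumulative ACK lands mid-block */
	printf("block is now [%u, %u)\n", b.r_start, b.r_end);
	return (0);
}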


@@ -0,0 +1,73 @@
#ifndef __tailq_hash__
#define __tailq_hash__
/* Must be powers of 2 */
#define MAX_HASH_ENTRIES 128
#define SEQ_BUCKET_SIZE 262144
/*
* The max seq range that can be stored is
* 127 x 262144 (just under 32 Meg). We have one extra slot
* for fall-over but must keep it so we never have
* wrap in hashing over valid other entries.
*/
#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1))
struct tailq_hash {
struct rack_head ht[MAX_HASH_ENTRIES];
uint32_t min;
uint32_t max;
uint32_t count;
};
struct rack_sendmap *
tqhash_min(struct tailq_hash *hs);
struct rack_sendmap *
tqhash_max(struct tailq_hash *hs);
int
tqhash_empty(struct tailq_hash *hs);
struct rack_sendmap *
tqhash_find(struct tailq_hash *hs, uint32_t seq);
struct rack_sendmap *
tqhash_next(struct tailq_hash *hs, struct rack_sendmap *rsm);
struct rack_sendmap *
tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm);
#define REMOVE_TYPE_CUMACK 1 /* Cumack moved */
#define REMOVE_TYPE_MERGE 2 /* Merging two blocks */
#define REMOVE_TYPE_FINI 3 /* The connection is over */
void
tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type);
int
tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm);
void
tqhash_init(struct tailq_hash *hs);
int
tqhash_trim(struct tailq_hash *hs, uint32_t th_ack);
#define TQHASH_FOREACH(var, head) \
for ((var) = tqhash_min((head)); \
(var); \
(var) = tqhash_next((head), (var)))
#define TQHASH_FOREACH_FROM(var, head, fvar) \
for ((var) = ((fvar) ? (fvar) : tqhash_min((head))); \
(var); \
(var) = tqhash_next((head), (var)))
#define TQHASH_FOREACH_REVERSE_FROM(var, head) \
for ((var) = ((var) ? (var) : tqhash_max((head))); \
(var); \
(var) = tqhash_prev((head), (var)))
#endif
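
A standalone sketch (userspace, not part of the commit) of the bucket
arithmetic behind tqhash_find() and tqhash_insert(): a sequence number maps
to bucket (seq / SEQ_BUCKET_SIZE) % MAX_HASH_ENTRIES using the constants
defined above, and a block whose start and end land in different buckets is
the RACK_STRADDLE case.

#include <stdint.h>
#include <stdio.h>

/* These mirror the values in tailq_hash.h above. */
#define MAX_HASH_ENTRIES	128
#define SEQ_BUCKET_SIZE		262144

static uint32_t
seq_to_bucket(uint32_t seq)
{
	return ((seq / SEQ_BUCKET_SIZE) % MAX_HASH_ENTRIES);
}

int
main(void)
{
	uint32_t r_start = 262000;		/* just below a bucket boundary */
	uint32_t r_end = r_start + 1448;	/* one segment further along */
	uint32_t sb = seq_to_bucket(r_start);
	uint32_t eb = seq_to_bucket(r_end);

	printf("start %u -> bucket %u, end %u -> bucket %u%s\n",
	    r_start, sb, r_end, eb,
	    (sb != eb) ? " (straddles the bucket line)" : "");
	return (0);
}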


@@ -45,6 +45,11 @@
#define RACK_SENT_FP 0x004000/* sent in fast path */
#define RACK_HAD_PUSH 0x008000/* Push was sent on original send */
#define RACK_MUST_RXT 0x010000/* We must retransmit this rsm (non-sack/mtu chg)*/
#define RACK_IN_GP_WIN 0x020000/* Send was in GP window when sent */
#define RACK_SHUFFLED 0x040000/* Some data was shuffled from one RSM to another */
#define RACK_MERGED 0x080000/* The RSM was merged */
#define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */
#define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
@@ -52,7 +57,9 @@
#define RACK_REQ_AVG 3 /* Must be less than 256 */
struct rack_sendmap {
TAILQ_ENTRY(rack_sendmap) next;
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
uint32_t bindex;
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
@@ -60,7 +67,8 @@ struct rack_sendmap {
r_rtr_cnt : 8; /* Retran count, index this -1 to get time */
struct mbuf *m;
uint32_t soff;
uint32_t orig_m_len;
uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */
uint32_t orig_t_space; /* The original trailing space when we sent (can update) */
uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */
uint8_t r_dupack; /* Dup ack count */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
@@ -72,8 +80,8 @@ struct rack_sendmap {
r_avail : 4;
uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
uint32_t r_fas; /* Flight at send */
uint8_t r_bas; /* The burst size (burst at send = bas) */
};
struct deferred_opt_list {
@@ -201,11 +209,11 @@ struct rack_opts_stats {
uint64_t tcp_rack_pace_rate_ss;
uint64_t tcp_rack_pace_rate_rec;
/* Temp counters for dsack */
uint64_t tcp_sack_path_1;
uint64_t tcp_sack_path_2a;
uint64_t tcp_sack_path_2b;
uint64_t tcp_sack_path_3;
uint64_t tcp_sack_path_4;
uint64_t tcp_sack_path_1; /* not used */
uint64_t tcp_sack_path_2a; /* not used */
uint64_t tcp_sack_path_2b; /* not used */
uint64_t tcp_sack_path_3; /* not used */
uint64_t tcp_sack_path_4; /* not used */
/* non temp counters */
uint64_t tcp_rack_scwnd;
uint64_t tcp_rack_noprr;
@@ -227,11 +235,16 @@ struct rack_opts_stats {
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
uint64_t tcp_rack_fastrsm_hack;
uint64_t tcp_rxt_clamp;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
uint64_t tcp_rack_timer_slop;
uint64_t tcp_rack_dsack_opt;
uint64_t tcp_rack_hi_beta;
uint64_t tcp_split_limit;
uint64_t tcp_rack_pacing_divisor;
uint64_t tcp_rack_min_seg;
uint64_t tcp_dgp_in_rec;
};
/* RTT shrink reasons */
@@ -261,38 +274,6 @@ struct rack_opts_stats {
#define RACK_QUALITY_PROBERTT 4 /* A measurement where we went into or exited probe RTT */
#define RACK_QUALITY_ALLACKED 5 /* All data is now acknowledged */
/*********************/
/* Rack Trace points */
/*********************/
/*
* Rack trace points are interesting points within
* the rack code that the author/debugger may want
* to have BB logging enabled if we hit that point.
* In order to enable a trace point you set the
* sysctl var net.inet.tcp.<stack>.tp.number to
* one of the numbers listed below. You also
* must make sure net.inet.tcp.<stack>.tp.bbmode is
* non-zero, the default is 4 for continuous tracing.
* You also set in the number of connections you want
* have get BB logs in net.inet.tcp.<stack>.tp.count.
*
* Count will decrement every time BB logging is assigned
* to a connection that hit your tracepoint.
*
* You can enable all trace points by setting the number
* to 0xffffffff. You can disable all trace points by
* setting number to zero (or count to 0).
*
* Below are the enumerated list of tracepoints that
* have currently been defined in the code. Add more
* as you add a call to rack_trace_point(rack, <name>);
* where <name> is defined below.
*/
#define RACK_TP_HWENOBUF 0x00000001 /* When we are doing hardware pacing and hit enobufs */
#define RACK_TP_ENOBUF 0x00000002 /* When we hit enobufs with software pacing */
#define RACK_TP_COLLAPSED_WND 0x00000003 /* When a peer to collapses its rwnd on us */
#define RACK_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
#define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */
#ifdef _KERNEL
#define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t))
@@ -356,14 +337,17 @@ struct rack_fast_send_blk {
struct udphdr *udp;
struct mbuf *m;
uint32_t o_m_len;
uint32_t o_t_len;
uint32_t rfo_apply_push : 1,
hw_tls : 1,
unused : 30;
};
struct tailq_hash;
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */
struct tailq_hash *tqh; /* Tree of all segments Lock(a) */
struct rack_head rc_tmap; /* List in transmit order Lock(a) */
struct rack_sendmap *rc_tlpsend; /* Remembered place for
* tlp_sending Lock(a) */
@@ -371,8 +355,8 @@ struct rack_control {
* resend */
struct rack_fast_send_blk fsb; /* The fast-send block */
uint32_t timer_slop;
uint32_t input_pkt;
uint32_t saved_input_pkt;
uint16_t pace_len_divisor;
uint16_t rc_user_set_min_segs;
uint32_t rc_hpts_flags;
uint32_t rc_fixed_pacing_rate_ca;
uint32_t rc_fixed_pacing_rate_rec;
@@ -387,6 +371,7 @@ struct rack_control {
uint64_t last_hw_bw_req;
uint64_t crte_prev_rate;
uint64_t bw_rate_cap;
uint64_t last_cumack_advance; /* Last time cumack moved forward */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP
@@ -401,6 +386,7 @@ struct rack_control {
uint32_t last_sent_tlp_seq; /* Last tlp sequence that was retransmitted Lock(a) */
uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */
uint16_t rc_tlp_cnt_out; /* count of times we have sent a TLP without new data */
uint16_t last_sent_tlp_len; /* Number of bytes in the last sent tlp */
@@ -418,6 +404,7 @@ struct rack_control {
* have allocated */
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_num_split_allocs; /* num split map entries allocated */
uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
@@ -462,7 +449,20 @@ struct rack_control {
uint64_t last_max_bw; /* Our calculated max b/w last */
struct time_filter_small rc_gp_min_rtt;
struct def_opt_head opt_list;
uint64_t lt_bw_time; /* Total time with data outstanding (lt_bw = long term bandwidth) */
uint64_t lt_bw_bytes; /* Total bytes acked */
uint64_t lt_timemark; /* 64 bit timestamp when we started sending */
struct http_sendfile_track *rc_last_sft;
uint32_t lt_seq; /* Seq at start of lt_bw gauge */
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
uint64_t last_sndbytes;
uint64_t last_snd_rxt_bytes;
uint64_t rxt_threshold;
uint32_t last_rnd_rxt_clamped;
uint32_t num_of_clamps_applied;
uint32_t clamp_options;
uint32_t max_clamps;
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */
@@ -502,6 +502,10 @@ struct rack_control {
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
uint32_t persist_lost_ends;
uint32_t ack_during_sd;
uint32_t input_pkt;
uint32_t saved_input_pkt;
uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -516,6 +520,8 @@ struct rack_control {
*/
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
uint8_t rack_per_upper_bound_ss;
uint8_t rack_per_upper_bound_ca;
uint8_t dsack_persist;
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
@@ -523,9 +529,55 @@ struct rack_control {
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */
uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */
uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */
uint8_t pacing_discount_amm; /*
* This is a multiplier to the base discount that
* can be used to increase the discount.
*/
uint8_t already_had_a_excess;
};
#endif
/* DGP with no buffer level mitigations */
#define DGP_LEVEL0 0
/*
* DGP with buffer level mitigation where BL:4 caps fillcw and BL:5
* turns off fillcw.
*/
#define DGP_LEVEL1 1
/*
* DGP with buffer level mitigation where BL:3 caps fillcw and BL:4 turns off fillcw
* and BL:5 reduces by 10%
*/
#define DGP_LEVEL2 2
/*
* DGP with buffer level mitigation where BL:2 caps fillcw and BL:3 turns off
* fillcw BL:4 reduces by 10% and BL:5 reduces by 20%
*/
#define DGP_LEVEL3 3
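
Restating the four DGP_LEVEL comments above as a small table (an illustrative
summary only, not code from the commit; BL is the buffer level referenced in
those comments, and 0 means no threshold at that level):

#include <stdio.h>

/* Illustrative summary of the DGP buffer-level mitigations described above. */
struct dgp_policy {
	int cap_fillcw_bl;	/* BL at which fillcw is capped */
	int off_fillcw_bl;	/* BL at which fillcw is turned off */
	int reduce10_bl;	/* BL at which the rate is reduced by 10% */
	int reduce20_bl;	/* BL at which the rate is reduced by 20% */
};

static const struct dgp_policy dgp_levels[4] = {
	[0] = { 0, 0, 0, 0 },	/* DGP_LEVEL0: no mitigations */
	[1] = { 4, 5, 0, 0 },	/* DGP_LEVEL1 */
	[2] = { 3, 4, 5, 0 },	/* DGP_LEVEL2 */
	[3] = { 2, 3, 4, 5 },	/* DGP_LEVEL3 */
};

int
main(void)
{
	for (int i = 0; i < 4; i++)
		printf("DGP_LEVEL%d: cap fillcw@BL%d, off@BL%d, -10%%@BL%d, -20%%@BL%d\n",
		    i, dgp_levels[i].cap_fillcw_bl, dgp_levels[i].off_fillcw_bl,
		    dgp_levels[i].reduce10_bl, dgp_levels[i].reduce20_bl);
	return (0);
}
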
/* Hybrid pacing log defines */
#define HYBRID_LOG_NO_ROOM 0 /* No room for the clients request */
#define HYBRID_LOG_TURNED_OFF 1 /* Turned off hybrid pacing */
#define HYBRID_LOG_NO_PACING 2 /* Failed to set pacing on */
#define HYBRID_LOG_RULES_SET 3 /* Hybrid pacing for this chunk is set */
#define HYBRID_LOG_NO_RANGE 4 /* In DGP mode, no range found */
#define HYBRID_LOG_RULES_APP 5 /* The specified rules were applied */
#define HYBRID_LOG_REQ_COMP 6 /* The request completed */
#define HYBRID_LOG_BW_MEASURE 7 /* Follow up b/w measurements to the previous completed log */
#define HYBRID_LOG_RATE_CAP 8 /* We had a rate cap apply */
#define HYBRID_LOG_CAP_CALC 9 /* How we calculate the cap */
#define HYBRID_LOG_ISSAME 10 /* Same as before -- temp */
#define HYBRID_LOG_ALLSENT 11 /* We sent it all no more rate-cap */
#define HYBRID_LOG_OUTOFTIME 12 /* We are past the deadline DGP */
#define HYBRID_LOG_CAPERROR 13 /* Hit one of the TSNH cases */
#define HYBRID_LOG_EXTEND 14 /* We extended the end */
#define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */
#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */
@@ -558,11 +610,11 @@ struct tcp_rack {
shape_rxt_to_pacing_min : 1,
/* ******************************************************************** */
rc_ack_required: 1,
spare : 1;
r_pacing_discount : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
fast_rsm_hack: 1,
excess_rxt_on: 1, /* Are actions on for excess retransmissions? */
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
@@ -590,7 +642,8 @@ struct tcp_rack {
rc_last_sent_tlp_seq_valid: 1,
rc_last_sent_tlp_past_cumack: 1,
probe_not_answered: 1,
avail_bytes : 2;
rack_hibeta : 1,
lt_bw_up : 1;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
@@ -616,11 +669,15 @@ struct tcp_rack {
r_use_labc_for_rec: 1,
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_dack_mode : 1, /* Mac O/S emulation of d-ack */
rc_dack_toggle : 1, /* For Mac O/S emulation of d-ack */
rc_pace_dnd : 1, /* The pace do not disturb bit */
rc_avali2 : 1,
rc_gp_filled : 1,
rc_is_spare : 1;
uint8_t r_state; /* Current rack state Lock(a) */
rc_hw_nobuf : 1;
uint8_t r_state : 4, /* Current rack state Lock(a) */
rc_catch_up : 1, /* catch up mode in dgp */
rc_hybrid_mode : 1, /* We are in hybrid mode */
rc_suspicious : 1, /* Suspect sacks have been given */
rc_new_rnd_needed: 1;
uint8_t rc_tmr_stopped : 7,
t_timers_stopped : 1;
uint8_t rc_enobuf : 7, /* count of enobufs on connection provides */
@@ -636,8 +693,8 @@ struct tcp_rack {
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,
r_rep_attack : 1,
r_rep_reverse : 1,
r_cwnd_was_clamped : 1,
r_clamped_gets_lower : 1,
rack_hdrw_pacing : 1, /* We are doing Hardware pacing */
rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */
rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */
@@ -660,8 +717,8 @@ struct tcp_rack {
r_wanted_output: 1,
r_rr_config : 2,
r_persist_lt_bw_off : 1,
r_collapse_point_valid : 1,
rc_avail_bit : 2;
r_collapse_point_valid : 1,
dgp_on : 1;
uint16_t rc_init_win : 8,
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,


@@ -154,6 +154,11 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
&tcp_force_detection, 0,
"Do we force detection even if the INP has it off?");
int32_t tcp_sad_limit = 10000;
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
CTLFLAG_RW,
&tcp_sad_limit, 10000,
"If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?");