This brings FreeBSD into sync with the Netflix versions of rack and bbr.

This fixes several breakages (panics) that have been reported since
the tcp_lro code was committed. Quite a few new features are now in
rack (perfecting of DGP -- Dynamic Goodput Pacing -- being among the
largest). There is also support for ack-war prevention. Documents on
rack are coming soon.

Sponsored by:           Netflix
Reviewed by:		rscheff, mtuexen
Differential Revision:	https://reviews.freebsd.org/D30036
Randall Stewart 2021-05-06 11:22:26 -04:00
parent 0ec3e99111
commit 5d8fd932e4
18 changed files with 7282 additions and 1885 deletions

View File

@ -91,15 +91,20 @@ struct cc_var {
struct sctp_nets *sctp;
} ccvc;
uint16_t nsegs; /* # segments coalesced into current chain. */
uint8_t labc; /* Dont use system abc use passed in */
};
/* cc_var flags. */
#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */
#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */
#define CCF_UNUSED1 0x0004 /* unused */
#define CCF_USE_LOCAL_ABC 0x0004 /* Dont use the system l_abc val */
#define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */
#define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */
#define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */
#define CCF_MAX_CWND 0x0040 /* Have we reached maximum cwnd? */
#define CCF_CHG_MAX_CWND 0x0080 /* Cubic max_cwnd changed, for K */
#define CCF_USR_IWND 0x0100 /* User specified initial window */
#define CCF_USR_IWND_INIT_NSEG 0x0200 /* Convert segs to bytes on conn init */
/* ACK types passed to the ack_received() hook. */
#define CC_ACK 0x0001 /* Regular in sequence ACK. */
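
For illustration only, a minimal sketch (hypothetical helper, not part of this commit) of how a stack could use the new labc field and CCF_USE_LOCAL_ABC flag to hand the CC module a per-call ABC multiplier instead of the global V_tcp_abc_l_var:

static void
example_ack_with_local_abc(struct cc_var *ccv, uint16_t acktype, uint8_t abc)
{
	ccv->labc = abc;			/* stack-chosen ABC multiplier */
	ccv->flags |= CCF_USE_LOCAL_ABC;	/* use labc, not V_tcp_abc_l_var */
	if (CC_ALGO(ccv->ccvc.tcp)->ack_received != NULL)
		CC_ALGO(ccv->ccvc.tcp)->ack_received(ccv, acktype);
	ccv->flags &= ~CCF_USE_LOCAL_ABC;
}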

View File

@ -86,8 +86,8 @@ static void newreno_cong_signal(struct cc_var *ccv, uint32_t type);
static void newreno_post_recovery(struct cc_var *ccv);
static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf);
VNET_DEFINE_STATIC(uint32_t, newreno_beta) = 50;
VNET_DEFINE_STATIC(uint32_t, newreno_beta_ecn) = 80;
VNET_DEFINE(uint32_t, newreno_beta) = 50;
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
@ -101,11 +101,6 @@ struct cc_algo newreno_cc_algo = {
.ctl_output = newreno_ctl_output,
};
struct newreno {
uint32_t beta;
uint32_t beta_ecn;
};
static inline struct newreno *
newreno_malloc(struct cc_var *ccv)
{
@ -182,9 +177,15 @@ newreno_ack_received(struct cc_var *ccv, uint16_t type)
* XXXLAS: Find a way to signal SS after RTO that
* doesn't rely on tcpcb vars.
*/
uint16_t abc_val;
if (ccv->flags & CCF_USE_LOCAL_ABC)
abc_val = ccv->labc;
else
abc_val = V_tcp_abc_l_var;
if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
incr = min(ccv->bytes_this_ack,
ccv->nsegs * V_tcp_abc_l_var *
ccv->nsegs * abc_val *
CCV(ccv, t_maxseg));
else
incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
@ -237,11 +238,19 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
u_int mss;
cwin = CCV(ccv, snd_cwnd);
mss = tcp_maxseg(ccv->ccvc.tcp);
mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
nreno = ccv->cc_data;
beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;
beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn;
if (V_cc_do_abe && type == CC_ECN)
/*
* Note that we only change the backoff for ECN if the
* global sysctl V_cc_do_abe is set <or> the stack itself
* has set a flag in our newreno_flags (due to pacing) telling
* us to use the lower valued back-off.
*/
if (V_cc_do_abe ||
(nreno && (nreno->newreno_flags & CC_NEWRENO_BETA_ECN) && (type == CC_ECN)))
factor = beta_ecn;
else
factor = beta;
@ -260,8 +269,7 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
V_cc_do_abe && V_cc_abe_frlossreduce)) {
CCV(ccv, snd_ssthresh) =
((uint64_t)CCV(ccv, snd_ssthresh) *
(uint64_t)beta) /
(100ULL * (uint64_t)beta_ecn);
(uint64_t)beta) / (uint64_t)beta_ecn;
}
if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
CCV(ccv, snd_ssthresh) = cwin;
@ -344,7 +352,7 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
nreno->beta = opt->val;
break;
case CC_NEWRENO_BETA_ECN:
if (!V_cc_do_abe)
if ((!V_cc_do_abe) && ((nreno->newreno_flags & CC_NEWRENO_BETA_ECN) == 0))
return (EACCES);
nreno->beta_ecn = opt->val;
break;

View File

@ -31,12 +31,17 @@
#define CCALGONAME_NEWRENO "newreno"
struct newreno {
uint32_t beta;
uint32_t beta_ecn;
uint32_t newreno_flags;
};
struct cc_newreno_opts {
int name;
int name;
uint32_t val;
};
#define CC_NEWRENO_BETA 1
#define CC_NEWRENO_BETA_ECN 2
#define CC_NEWRENO_BETA 1 /* Beta for normal DUP-ACK/Sack recovery */
#define CC_NEWRENO_BETA_ECN 2 /* ECN Beta for Abe */
#endif /* _CC_NEWRENO_H */
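
As a hedged illustration (hypothetical function name; the real plumbing lives in the rack changes elsewhere in this commit), a stack could feed its ECN beta into newreno through the generic ctl_output hook using the cc_newreno_opts layout above:

static int
example_set_newreno_beta_ecn(struct tcpcb *tp, uint32_t beta_ecn)
{
	struct cc_newreno_opts opt;
	struct sockopt sopt;

	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
		return (EINVAL);	/* only newreno understands these options */
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = beta_ecn;
	return (CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt));
}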

View File

@ -181,13 +181,24 @@ struct tcphdr {
#define TCP_TXTLS_MODE 40 /* Transmit TLS mode */
#define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */
#define TCP_RXTLS_MODE 42 /* Receive TLS mode */
#define TCP_IWND_NB 43 /* Override initial window (units: bytes) */
#define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */
#define TCP_LOGID_CNT 46 /* get number of connections with the same ID */
#define TCP_LOG_TAG 47 /* configure tag for grouping logs */
#define TCP_USER_LOG 48 /* userspace log event */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */
#define TCP_MAXPEAKRATE 69 /* maximum peak rate allowed (kbps) */
#define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */
#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */
#define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */
#define TCP_SHARED_CWND_ALLOWED 75 /* Use of a shared cwnd is allowed */
#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */
#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */
#define TCP_PERF_INFO 78 /* retrieve accounting counters */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
@ -201,7 +212,7 @@ struct tcphdr {
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
@ -284,6 +295,16 @@ struct tcphdr {
#define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */
#define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */
#define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */
#define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */
#define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */
#define TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw */
#define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different then the system default */
#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */
#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
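
A userspace sketch (assuming the connection is already attached to the rack stack) of driving one of the new options, here capping the pacing rate via TCP_PACING_RATE_CAP:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>

static int
example_cap_pacing_rate(int fd)
{
	uint64_t cap = 100000000 / 8;	/* ~100 Mbit/s expressed in bytes per second */

	return (setsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP,
	    &cap, sizeof(cap)));
}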
@ -295,6 +316,7 @@ struct tcphdr {
#define TCPI_OPT_WSCALE 0x04
#define TCPI_OPT_ECN 0x08
#define TCPI_OPT_TOE 0x10
#define TCPI_OPT_TFO 0x20
/* Maximum length of log ID. */
#define TCP_LOG_ID_LEN 64

View File

@ -0,0 +1,39 @@
#ifndef __tcp_accounting_h__
#define __tcp_accounting_h__
/*
* Return values from tcp_do_ack_accounting
* and indexs to the into the tcp_proc_time[]
* array.
*/
#define ACK_BEHIND 0
#define ACK_SACK 1
#define ACK_CUMACK 2
#define ACK_CUMACK_SACK 3
#define ACK_DUPACK 4
#define ACK_RWND 5
/* Added values for tracking output too */
#define SND_BLOCKED 6
#define SND_LIMITED 7
#define SND_OUT_DATA 8
#define SND_OUT_ACK 9
#define SND_OUT_FAIL 10
/* We also count in the counts array two added (MSS sent and ACKS In) */
#define CNT_OF_MSS_OUT 11
#define CNT_OF_ACKS_IN 12
/* for the tcpcb we add two more cycle counters */
#define CYC_HANDLE_MAP 11
#define CYC_HANDLE_ACK 12
/* Should the tp->xxx array's be alloc'ed? */
/* #define TCP_NUM_PROC_COUNTERS 11 defined in tcp_var.h */
/* #define TCP_NUM_CNT_COUNTERS 13 defined in tcp_var.h */
#ifdef _KERNEL
#ifdef TCP_ACCOUNTING
extern counter_u64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
extern counter_u64_t tcp_proc_time[TCP_NUM_PROC_COUNTERS];
#endif
#endif
#endif
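
A small, hypothetical kernel-side reader of these counters (only meaningful when the kernel is built with TCP_ACCOUNTING), shown only to illustrate how the indices above are meant to be used:

#ifdef TCP_ACCOUNTING
static uint64_t
example_acks_seen(void)
{
	/* Total ACKs counted on input across all accounted connections. */
	return (counter_u64_fetch(tcp_cnt_counters[CNT_OF_ACKS_IN]));
}
#endif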

View File

@ -526,7 +526,7 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@ -544,7 +544,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
break;
}
if (th->th_flags & TH_CWR)
if (flags & TH_CWR)
tp->ccv->flags |= CCF_TCPHDR_CWR;
else
tp->ccv->flags &= ~CCF_TCPHDR_CWR;
@ -558,6 +558,12 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
}
}
void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
cc_ecnpkt_handler_flags(tp, th->th_flags, iptos);
}
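
The split exists so callers that no longer hold a struct tcphdr (for example a compressed-ack path) can still drive ECN processing; a hedged one-line sketch, where ae_flags and ae_iptos are hypothetical values saved from the ack entry:

	cc_ecnpkt_handler_flags(tp, ae_flags, ae_iptos);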
/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended

View File

@ -174,7 +174,7 @@ enum tcp_log_events {
TCP_LOG_IN = 1, /* Incoming packet 1 */
TCP_LOG_OUT, /* Transmit (without other event) 2 */
TCP_LOG_RTO, /* Retransmit timeout 3 */
TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */
TCP_LOG_SB_WAKE, /* Awaken socket buffer 4 */
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER, /* Detected reorder 7 */
@ -200,7 +200,7 @@ enum tcp_log_events {
BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
TCP_LOG_MAPCHG, /* Map Changes to the sendmap 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
@ -232,7 +232,9 @@ enum tcp_log_events {
TCP_LOG_USER_EVENT, /* User space event data 59 */
TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */
TCP_LOG_HTTP_T, /* logging of http request tracking 61 */
TCP_LOG_END /* End (keep at end) 62 */
TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */
TCP_LOG_FSB, /* FSB information 63 */
TCP_LOG_END /* End (keep at end) 64 */
};
enum tcp_log_states {

View File

@ -367,11 +367,22 @@ rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
OID_AUTO, "pacetime", CTLFLAG_RD,
&rs->rs_rlt[i].time_between, 0,
"Time hardware inserts between 1500 byte sends");
SYSCTL_ADD_U64(&rs->sysctl_ctx,
SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "rate", CTLFLAG_RD,
&rs->rs_rlt[i].rate, 0,
&rs->rs_rlt[i].rate,
"Rate in bytes per second");
SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "using", CTLFLAG_RD,
&rs->rs_rlt[i].using,
"Number of flows using");
SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "enobufs", CTLFLAG_RD,
&rs->rs_rlt[i].rs_num_enobufs,
"Number of enobufs logged on this rate");
}
}
#endif
@ -667,6 +678,8 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
*/
rs->rs_rlt[i].ptbl = rs;
rs->rs_rlt[i].tag = NULL;
rs->rs_rlt[i].using = 0;
rs->rs_rlt[i].rs_num_enobufs = 0;
/*
* Calculate the time between.
*/
@ -1063,16 +1076,28 @@ rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
struct tcp_hwrate_limit_table *decon_rte;
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
atomic_add_long(&decon_rte->using, 1);
}
static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
struct tcp_hwrate_limit_table *decon_rte;
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
atomic_subtract_long(&decon_rte->using, 1);
}
void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
struct tcp_hwrate_limit_table *decon_rte;
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}
/*

View File

@ -43,7 +43,9 @@ struct m_snd_tag;
struct tcp_hwrate_limit_table {
const struct tcp_rate_set *ptbl; /* Pointer to parent table */
struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
uint64_t rate; /* Rate we get in Bytes per second (Bps) */
long rate; /* Rate we get in Bytes per second (Bps) */
long using; /* How many flows are using this hdwr rate. */
long rs_num_enobufs;
uint32_t time_between; /* Time-Gap between packets at this rate */
uint32_t flags;
};

View File

@ -156,6 +156,17 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
&VNET_NAME(tcp_sack_globalholes), 0,
"Global number of TCP SACK holes currently allocated");
int
tcp_dsack_block_exists(struct tcpcb *tp)
{
/* Return true if a DSACK block exists */
if (tp->rcv_numsacks == 0)
return (0);
if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt))
return(1);
return (0);
}
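
A hedged example of the kind of caller this helper is meant for (illustrative only, not code from this commit): while a DSACK block is still queued, make sure an ACK actually goes out so the peer sees it.

	if (tcp_dsack_block_exists(tp))
		tp->t_flags |= TF_ACKNOW;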
/*
* This function will find overlaps with the currently stored sackblocks
* and add any overlap as a dsack block upfront

View File

@ -3930,6 +3930,9 @@ bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_s
struct tcp_bbr *bbr;
INP_WLOCK_ASSERT(tp->t_inpcb);
#ifdef STATS
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
switch (type) {
case CC_NDUPACK:
@ -4403,6 +4406,7 @@ bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap
nrsm->r_start = start;
nrsm->r_end = rsm->r_end;
nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
nrsm-> r_rtt_not_allowed = rsm->r_rtt_not_allowed;
nrsm->r_flags = rsm->r_flags;
/* We don't transfer forward the SYN flag */
nrsm->r_flags &= ~BBR_HAS_SYN;
@ -6429,65 +6433,6 @@ tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
bbr->r_ctl.bbr_smallest_srtt_this_state = rtt;
}
static void
bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm,
uint32_t t, uint32_t cts, int ack_type)
{
/*
* For this RSM, we acknowledged the data from a previous
* transmission, not the last one we made. This means we did a false
* retransmit.
*/
if (rsm->r_flags & BBR_HAS_FIN) {
/*
* The sending of the FIN often is multiple sent when we
* have everything outstanding ack'd. We ignore this case
* since its over now.
*/
return;
}
if (rsm->r_flags & BBR_TLP) {
/*
* We expect TLP's to have this occur often
*/
bbr->rc_tlp_rtx_out = 0;
return;
}
if (ack_type != BBR_CUM_ACKED) {
/*
* If it was not a cum-ack we
* don't really know for sure since
* the timestamp could be from some
* other transmission.
*/
return;
}
if (rsm->r_flags & BBR_WAS_SACKPASS) {
/*
* We retransmitted based on a sack and the earlier
* retransmission ack'd it - re-ordering is occuring.
*/
BBR_STAT_INC(bbr_reorder_seen);
bbr->r_ctl.rc_reorder_ts = cts;
}
/* Back down the loss count */
if (rsm->r_flags & BBR_MARKED_LOST) {
bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
rsm->r_flags &= ~BBR_MARKED_LOST;
if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
/* LT sampling also needs adjustment */
bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
}
/***** RRS HERE ************************/
/* Do we need to do this??? */
/* bbr_reset_lt_bw_sampling(bbr, cts); */
/***** RRS HERE ************************/
BBR_STAT_INC(bbr_badfr);
BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start));
}
static void
bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
{
@ -6869,6 +6814,10 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
/* Already done */
return (0);
}
if (rsm->r_rtt_not_allowed) {
/* Not allowed */
return (0);
}
if (rsm->r_rtr_cnt == 1) {
/*
* Only one transmit. Hopefully the normal case.
@ -6926,7 +6875,7 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
rsm->r_tim_lastsent[i], ack_type, to);
if ((i + 1) < rsm->r_rtr_cnt) {
/* Likely */
bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
return (0);
} else if (rsm->r_flags & BBR_TLP) {
bbr->rc_tlp_rtx_out = 0;
}
@ -6974,7 +6923,7 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
t = 1;
bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
rsm->r_tim_lastsent[i], ack_type, to);
bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
return (0);
} else {
/*
* Too many prior transmissions, just
@ -10207,7 +10156,7 @@ bbr_init(struct tcpcb *tp)
tp->t_fb_ptr = NULL;
return (ENOMEM);
}
rsm->r_flags = BBR_OVERMAX;
rsm->r_rtt_not_allowed = 1;
rsm->r_tim_lastsent[0] = cts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
@ -10320,6 +10269,10 @@ bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
counter_u64_add(bbr_flows_whdwr_pacing, -1);
else
counter_u64_add(bbr_flows_nohdwr_pacing, -1);
if (bbr->r_ctl.crte != NULL) {
tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
bbr->r_ctl.crte = NULL;
}
rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
while (rsm) {
TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
@ -13463,15 +13416,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
th->th_seq = htonl(tp->snd_max);
bbr_seq = tp->snd_max;
}
} else if (flags & TH_RST) {
/*
* For a Reset send the last cum ack in sequence
* (this like any other choice may still generate a
* challenge ack, if a ack-update packet is in
* flight).
*/
th->th_seq = htonl(tp->snd_una);
bbr_seq = tp->snd_una;
} else {
/*
* len == 0 and not persist we use snd_max, sending
@ -14536,9 +14480,9 @@ bbr_set_sockopt(struct socket *so, struct sockopt *sopt,
} else {
bbr->bbr_hdw_pace_ena = 0;
#ifdef RATELIMIT
if (bbr->bbr_hdrw_pacing) {
bbr->bbr_hdrw_pacing = 0;
in_pcbdetach_txrtlmt(bbr->rc_inp);
if (bbr->r_ctl.crte != NULL) {
tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
bbr->r_ctl.crte = NULL;
}
#endif
}

File diff suppressed because it is too large

View File

@ -96,6 +96,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#ifdef TCPDEBUG
@ -161,6 +162,130 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
}
#endif
static int
ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
#endif
#ifdef INET
struct ip *ip = NULL; /* Keep compiler happy. */
#endif
int32_t tlen;
uint16_t drop_hdrlen;
uint16_t etype;
uint8_t iptos;
/* Is it the easy way? */
if (m->m_flags & M_LRO_EHDRSTRP)
return (m->m_pkthdr.lro_etype);
/*
* Ok this is the old style call, the ethernet header is here.
* This also means no checksum or BPF were done. This
* can happen if the race to setup the inp fails and
* LRO sees no INP at packet input, but by the time
* we queue the packets an INP gets there. Its rare
* but it can occur so we will handle it. Note that
* this means duplicated work but with the rarity of it
* its not worth worrying about.
*/
/* Let the BPF see the packet */
if (bpf_peers_present(ifp->if_bpf))
ETHER_BPF_MTAP(ifp, m);
/* Now the csum */
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
m_adj(m, sizeof(*eh));
switch (etype) {
#ifdef INET6
case ETHERTYPE_IPV6:
{
if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
if (m == NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
return (-1);
}
}
ip6 = (struct ip6_hdr *)(eh + 1);
th = (struct tcphdr *)(ip6 + 1);
drop_hdrlen = sizeof(*ip6);
tlen = ntohs(ip6->ip6_plen);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in6_cksum_pseudo(ip6, tlen,
IPPROTO_TCP,
m->m_pkthdr.csum_data);
th->th_sum ^= 0xffff;
} else
th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
return (-1);
}
return (etype);
}
#endif
#ifdef INET
case ETHERTYPE_IP:
{
if (m->m_len < sizeof (struct tcpiphdr)) {
m = m_pullup(m, sizeof (struct tcpiphdr));
if (m == NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
return (-1);
}
}
ip = (struct ip *)(eh + 1);
th = (struct tcphdr *)(ip + 1);
drop_hdrlen = sizeof(*ip);
iptos = ip->ip_tos;
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr,
htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP));
th->th_sum ^= 0xffff;
} else {
int len;
struct ipovly *ipov = (struct ipovly *)ip;
/*
* Checksum extended TCP header and data.
*/
len = drop_hdrlen + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = htons(tlen);
th->th_sum = in_cksum(m, len);
/* Reset length for SDT probes. */
ip->ip_len = htons(len);
/* Reset TOS bits */
ip->ip_tos = iptos;
/* Re-initialization for later version check */
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
}
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
return (-1);
}
break;
}
#endif
};
return (etype);
}
/*
* The function ctf_process_inbound_raw() is used by
* transport developers to do the steps needed to
@ -170,6 +295,7 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
* - INP_SUPPORTS_MBUFQ
* - INP_MBUF_QUEUE_READY
* - INP_DONT_SACK_QUEUE
* - INP_MBUF_ACKCMP
*
* These flags help control how LRO will deliver
* packets to the transport. You first set in inp_flags2
@ -186,6 +312,18 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
* In some transport designs this is important since knowing
* the actual time we got the packet is useful information.
*
* A new special type of mbuf may also be supported by the transport
* if it has set the INP_MBUF_ACKCMP flag. If its set, LRO will
* possibly create a M_ACKCMP type mbuf. This is a mbuf with
* an array of "acks". One thing also to note is that when this
* occurs a subsequent LRO may find at the back of the untouched
* mbuf queue chain a M_ACKCMP and append on to it. This means
* that until the transport pulls in the mbuf chain queued
* for it more ack's may get on the mbufs that were already
* delivered. There currently is a limit of 6 acks condensed
* into 1 mbuf which means often when this is occuring, we
* don't get that effect but it does happen.
*
* Now there are some interesting Caveats that the transport
* designer needs to take into account when using this feature.
*
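
To make the M_ACKCMP description above concrete, a minimal sketch of how a transport that set INP_MBUF_ACKCMP might walk such an mbuf; the per-entry processing is left out and the loop shape is an assumption, not code from this diff:

	if (m->m_flags & M_ACKCMP) {
		struct tcp_ackent *ae = mtod(m, struct tcp_ackent *);
		int i, cnt = m->m_len / sizeof(struct tcp_ackent);

		for (i = 0; i < cnt; i++, ae++) {
			/* consume one compressed ack entry */
		}
	}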
@ -247,7 +385,6 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
* shipped in, the tcb has been destroyed (or about to be destroyed).
*/
struct mbuf *m_save;
struct ether_header *eh;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
@ -257,20 +394,18 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
#endif
struct ifnet *ifp;
struct timeval tv;
struct inpcb *inp;
int32_t retval, nxt_pkt, tlen, off;
uint16_t etype;
int etype = 0;
uint16_t drop_hdrlen;
uint8_t iptos, no_vn=0, bpf_req=0;
uint8_t iptos, no_vn=0;
NET_EPOCH_ASSERT();
if (m && m->m_pkthdr.rcvif)
ifp = m->m_pkthdr.rcvif;
if (m)
ifp = m_rcvif(m);
else
ifp = NULL;
if (ifp) {
bpf_req = bpf_peers_present(ifp->if_bpf);
} else {
if (ifp == NULL) {
/*
* We probably should not work around
* but kassert, since lro alwasy sets rcvif.
@ -280,147 +415,86 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
}
CURVNET_SET(ifp->if_vnet);
skip_vnet:
tcp_get_usecs(&tv);
while (m) {
m_save = m->m_nextpkt;
m->m_nextpkt = NULL;
/* Now lets get the ether header */
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
/* Let the BPF see the packet */
if (bpf_req && ifp)
ETHER_BPF_MTAP(ifp, m);
m_adj(m, sizeof(*eh));
/* Trim off the ethernet header */
switch (etype) {
if ((m->m_flags & M_ACKCMP) == 0) {
/* Now lets get the ether header */
etype = ctf_get_enet_type(ifp, m);
if (etype == -1) {
/* Skip this packet it was freed by checksum */
goto skipped_pkt;
}
KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)),
("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype));
/* Trim off the ethernet header */
switch (etype) {
#ifdef INET6
case ETHERTYPE_IPV6:
{
if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
if (m == NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
goto skipped_pkt;
}
}
ip6 = (struct ip6_hdr *)(eh + 1);
th = (struct tcphdr *)(ip6 + 1);
tlen = ntohs(ip6->ip6_plen);
drop_hdrlen = sizeof(*ip6);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in6_cksum_pseudo(ip6, tlen,
IPPROTO_TCP, m->m_pkthdr.csum_data);
th->th_sum ^= 0xffff;
} else
th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
goto skipped_pkt;
}
/*
* Be proactive about unspecified IPv6 address in source.
* As we use all-zero to indicate unbounded/unconnected pcb,
* unspecified IPv6 address can be used to confuse us.
*
* Note that packets with unspecified IPv6 destination is
* already dropped in ip6_input.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
/* XXX stat */
m_freem(m);
goto skipped_pkt;
}
iptos = IPV6_TRAFFIC_CLASS(ip6);
break;
}
case ETHERTYPE_IPV6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tlen = ntohs(ip6->ip6_plen);
drop_hdrlen = sizeof(*ip6);
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
break;
#endif
#ifdef INET
case ETHERTYPE_IP:
{
if (m->m_len < sizeof (struct tcpiphdr)) {
if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
== NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
goto skipped_pkt;
}
}
ip = (struct ip *)(eh + 1);
th = (struct tcphdr *)(ip + 1);
drop_hdrlen = sizeof(*ip);
iptos = ip->ip_tos;
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr,
htonl(m->m_pkthdr.csum_data + tlen +
IPPROTO_TCP));
th->th_sum ^= 0xffff;
} else {
int len;
struct ipovly *ipov = (struct ipovly *)ip;
/*
* Checksum extended TCP header and data.
*/
len = drop_hdrlen + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = htons(tlen);
th->th_sum = in_cksum(m, len);
/* Reset length for SDT probes. */
ip->ip_len = htons(len);
/* Reset TOS bits */
ip->ip_tos = iptos;
/* Re-initialization for later version check */
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
}
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
goto skipped_pkt;
}
break;
}
case ETHERTYPE_IP:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
drop_hdrlen = sizeof(*ip);
iptos = ip->ip_tos;
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
break;
#endif
}
/*
* Convert TCP protocol specific fields to host format.
*/
tcp_fields_to_host(th);
off = th->th_off << 2;
if (off < sizeof (struct tcphdr) || off > tlen) {
KMOD_TCPSTAT_INC(tcps_rcvbadoff);
} /* end switch */
/*
* Convert TCP protocol specific fields to host format.
*/
tcp_fields_to_host(th);
off = th->th_off << 2;
if (off < sizeof (struct tcphdr) || off > tlen) {
printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n",
off,
sizeof(struct tcphdr),
tlen);
KMOD_TCPSTAT_INC(tcps_rcvbadoff);
m_freem(m);
goto skipped_pkt;
}
tlen -= off;
drop_hdrlen += off;
/*
* Now lets setup the timeval to be when we should
* have been called (if we can).
*/
m->m_pkthdr.lro_nsegs = 1;
if (m->m_flags & M_TSTMP_LRO) {
tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
}
tlen -= off;
drop_hdrlen += off;
/*
* Now lets setup the timeval to be when we should
* have been called (if we can).
*/
m->m_pkthdr.lro_nsegs = 1;
/* Now what about next packet? */
} else {
/* Should not be should we kassert instead? */
tcp_get_usecs(&tv);
/*
* This mbuf is an array of acks that have
* been compressed. We assert the inp has
* the flag set to enable this!
*/
KASSERT((tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP),
("tp:%p inp:%p no INP_MBUF_ACKCMP flags?", tp, tp->t_inpcb));
tlen = 0;
drop_hdrlen = 0;
th = NULL;
iptos = 0;
}
/* Now what about next packet? */
tcp_get_usecs(&tv);
if (m_save || has_pkt)
nxt_pkt = 1;
else
nxt_pkt = 0;
KMOD_TCPSTAT_INC(tcps_rcvtotal);
if ((m->m_flags & M_ACKCMP) == 0)
KMOD_TCPSTAT_INC(tcps_rcvtotal);
else
KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent)));
inp = tp->t_inpcb;
INP_WLOCK_ASSERT(inp);
retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
iptos, nxt_pkt, &tv);
if (retval) {
@ -434,6 +508,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
}
if (no_vn == 0)
CURVNET_RESTORE();
INP_UNLOCK_ASSERT(inp);
return(retval);
}
skipped_pkt:
@ -482,11 +557,6 @@ ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
if (rc_sacked <= ctf_outstanding(tp))
return(ctf_outstanding(tp) - rc_sacked);
else {
/* TSNH */
#ifdef INVARIANTS
panic("tp:%p rc_sacked:%d > out:%d",
tp, rc_sacked, ctf_outstanding(tp));
#endif
return (0);
}
}
@ -502,6 +572,36 @@ ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
}
void
ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt)
{
if ((ts != NULL) && (cnt != NULL) &&
(tcp_ack_war_time_window > 0) &&
(tcp_ack_war_cnt > 0)) {
/* We are possibly doing ack war prevention */
uint32_t cts;
/*
* We use a msec tick here which gives us
* roughly 49 days. We don't need the
* precision of a microsecond timestamp which
* would only give us hours.
*/
cts = tcp_ts_getticks();
if (TSTMP_LT((*ts), cts)) {
/* Timestamp is in the past */
*cnt = 0;
*ts = (cts + tcp_ack_war_time_window);
}
if (*cnt < tcp_ack_war_cnt) {
*cnt = (*cnt + 1);
tp->t_flags |= TF_ACKNOW;
} else
tp->t_flags &= ~TF_ACKNOW;
} else
tp->t_flags |= TF_ACKNOW;
}
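
A sketch of the intended calling pattern: where a stack used to set TF_ACKNOW unconditionally, it now passes its per-connection window state (the challenge_ack_ts/challenge_ack_cnt fields that rack grows later in this commit) so repeated forced ACKs get rate limited; rack->r_ctl is assumed here:

	ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts,
	    &rack->r_ctl.challenge_ack_cnt);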
/*
* ctf_drop_checks returns 1 for you should not proceed. It places
* in ret_val what should be returned 1/0 by the caller. The 1 indicates
@ -509,7 +609,10 @@ ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
* TCB is still valid and locked.
*/
int
ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t *tlenp,
int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
uint32_t *ts, uint32_t *cnt)
{
int32_t todrop;
int32_t thflags;
@ -543,7 +646,7 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* Send an ACK to resynchronize and drop any data.
* But keep on processing for RST or ACK.
*/
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
todrop = tlen;
KMOD_TCPSTAT_INC(tcps_rcvduppack);
KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
@ -555,13 +658,14 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* DSACK - add SACK block for dropped range
*/
if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
tcp_update_sack_list(tp, th->th_seq,
th->th_seq + todrop);
/*
* ACK now, as the next in-sequence segment
* will clear the DSACK block again
*/
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
if (tp->t_flags & TF_ACKNOW)
tcp_update_sack_list(tp, th->th_seq,
th->th_seq + todrop);
}
*drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
@ -590,10 +694,10 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* ack.
*/
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
KMOD_TCPSTAT_INC(tcps_rcvwinprobe);
} else {
ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
__ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, ts, cnt);
return (1);
}
} else
@ -614,7 +718,7 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* and valid.
*/
void
ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val, uint32_t *ts, uint32_t *cnt)
{
/*
* Generate an ACK dropping incoming segment if it occupies sequence
@ -638,7 +742,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t
return;
} else
*ret_val = 0;
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
if (m)
m_freem(m);
}
@ -671,7 +775,7 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcp
*/
int dropped = 0;
if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
(tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
KASSERT(tp->t_state != TCPS_SYN_SENT,
@ -680,8 +784,7 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcp
if (V_tcp_insecure_rst ||
(tp->last_ack_sent == th->th_seq) ||
(tp->rcv_nxt == th->th_seq) ||
((tp->last_ack_sent - 1) == th->th_seq)) {
(tp->rcv_nxt == th->th_seq)) {
KMOD_TCPSTAT_INC(tcps_drops);
/* Drop the connection. */
switch (tp->t_state) {
@ -748,7 +851,7 @@ ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t *
}
/*
* bbr_ts_check returns 1 for you should not proceed, the state
* ctf_ts_check returns 1 for you should not proceed, the state
* machine should return. It places in ret_val what should
* be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
* that the TCB is unlocked and probably dropped. The 0 indicates the
@ -786,6 +889,32 @@ ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
return (0);
}
int
ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags)
{
if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
/*
* Invalidate ts_recent. If this segment updates ts_recent,
* the age will be reset later and ts_recent will get a
* valid value. If it does not, setting ts_recent to zero
* will at least satisfy the requirement that zero be placed
* in the timestamp echo reply when ts_recent isn't valid.
* The age isn't reset until we get a valid ts_recent
* because we don't want out-of-order segments to be dropped
* when ts_recent is old.
*/
tp->ts_recent = 0;
} else {
KMOD_TCPSTAT_INC(tcps_rcvduppack);
KMOD_TCPSTAT_INC(tcps_pawsdrop);
return (1);
}
return (0);
}
void
ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
{
@ -817,45 +946,7 @@ ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
uint32_t
ctf_fixed_maxseg(struct tcpcb *tp)
{
int optlen;
if (tp->t_flags & TF_NOOPT)
return (tp->t_maxseg);
/*
* Here we have a simplified code from tcp_addoptions(),
* without a proper loop, and having most of paddings hardcoded.
* We only consider fixed options that we would send every
* time I.e. SACK is not considered.
*
*/
#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
if (tp->t_flags & TF_RCVD_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = 0;
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
} else {
if (tp->t_flags & TF_REQ_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = PAD(TCPOLEN_MAXSEG);
if (tp->t_flags & TF_REQ_SCALE)
optlen += PAD(TCPOLEN_WINDOW);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
if (tp->t_flags & TF_SACK_PERMIT)
optlen += PAD(TCPOLEN_SACK_PERMITTED);
}
#undef PAD
optlen = min(optlen, TCP_MAXOLEN);
return (tp->t_maxseg - optlen);
return (tcp_fixed_maxseg(tp));
}
void

View File

@ -98,12 +98,20 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
uint32_t ctf_outstanding(struct tcpcb *tp);
uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
int
ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
int32_t * drop_hdrlen, int32_t * ret_val);
_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t *tlenp,
int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
uint32_t *ts, uint32_t *cnt);
void ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt);
#define ctf_drop_checks(a, b, c, d, e, f, g, h) _ctf_drop_checks(a, b, c, d, e, f, g, h, NULL, NULL)
void
ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t thflags, int32_t tlen,
int32_t *ret_val, uint32_t *ts, uint32_t *cnt);
#define ctf_do_dropafterack(a, b, c, d, e, f) __ctf_do_dropafterack(a, b, c, d, e, f, NULL, NULL)
void
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t rstreason, int32_t tlen);
@ -122,6 +130,9 @@ int
ctf_ts_check(struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
int
ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags);
void
ctf_calc_rwin(struct socket *so, struct tcpcb *tp);

View File

@ -71,7 +71,7 @@ struct bbr_sendmap {
uint32_t r_del_time; /* The time of the last delivery update */
uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time
* sent */
unused_bit:1,
r_rtt_not_allowed:1, /* No rtt measurement allowed */
r_is_drain:1, /* In a draining cycle */
r_app_limited:1,/* We went app limited */
r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */
@ -588,9 +588,9 @@ struct bbr_control {
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */
/*- ---
/*- ---
* used only initial and close
*/
*/
uint32_t rc_high_rwnd; /* Highest rwnd seen */
uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */

View File

@ -29,7 +29,7 @@
#define _NETINET_TCP_RACK_H_
#define RACK_ACKED 0x0001/* The remote endpoint acked this */
#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order - not used */
#define RACK_TO_REXT 0x0002/* A timeout occured on this sendmap entry */
#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */
#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */
#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
@ -39,37 +39,94 @@
#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */
#define RACK_APP_LIMITED 0x0200/* We went app limited after this send */
#define RACK_WAS_ACKED 0x0400/* a RTO undid the ack, but it already had a rtt calc done */
#define RACK_HAS_SIN 0x0800/* SIN is on this guy */
#define RACK_HAS_SYN 0x0800/* SYN is on this guy */
#define RACK_SENT_W_DSACK 0x1000/* Sent with a dsack */
#define RACK_SENT_SP 0x2000/* sent in slow path */
#define RACK_SENT_FP 0x4000/* sent in fast path */
#define RACK_HAD_PUSH 0x8000/* Push was sent on original send */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
#define RACK_REQ_AVG 4 /* Must be less than 256 */
#define RACK_REQ_AVG 3 /* Must be less than 256 */
struct rack_sendmap {
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time
* sent */
uint16_t r_flags; /* Flags as defined above */
uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint32_t usec_orig_send; /* time of orginal send in useconds */
struct mbuf *m;
uint32_t soff;
uint32_t orig_m_len;
uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */
uint32_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
uint8_t r_dupack; /* Dup ack count */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
uint8_t r_limit_type; /* is this entry counted against a limit? */
uint8_t r_just_ret : 1, /* After sending, the next pkt was just returned, i.e. limited */
r_one_out_nr : 1, /* Special case 1 outstanding and not in recovery */
r_avail : 6;
uint8_t r_resv[36];
r_no_rtt_allowed : 1, /* No rtt measurement allowed */
r_avail : 5;
uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
};
struct deferred_opt_list {
TAILQ_ENTRY(deferred_opt_list) next;
int optname;
uint64_t optval;
};
/*
* Timestamps in the rack sendmap are now moving to be
* uint64_t's. This means that if you want a uint32_t
* usec timestamp (the old usecond timestamp) you simply have
* to cast it to uint32_t. The reason we do this is not for
* wrap, but we need to get back, at times, to the millisecond
* timestamp that is used in the TSTMP option. To do this we
* can use the rack_ts_to_msec() inline below which can take
* the 64bit ts and make into the correct timestamp millisecond
* wise. Thats not possible with the 32bit usecond timestamp since
* the seconds wrap too quickly to cover all bases.
*
* There are quite a few places in rack where I simply cast
* back to uint32_t and then end up using the TSTMP_XX()
* macros. This is ok, but we could do simple compares if
* we ever decided to move all of those variables to 64 bits
* as well.
*/
inline uint64_t
rack_to_usec_ts(struct timeval *tv)
{
return ((tv->tv_sec * HPTS_USEC_IN_SEC) + tv->tv_usec);
}
inline uint32_t
rack_ts_to_msec(uint64_t ts)
{
return((uint32_t)(ts / HPTS_MSEC_IN_SEC));
}
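
A short usage sketch of the helpers above (rsm is a hypothetical struct rack_sendmap pointer): keep the full 64-bit value in the map, truncate when an old-style 32-bit usec comparison is needed, and convert when the millisecond TS-option granularity is wanted:

	uint64_t ts64 = rsm->r_tim_lastsent[0];
	uint32_t ts_usec = (uint32_t)ts64;		/* old-style usec timestamp */
	uint32_t ts_msec = rack_ts_to_msec(ts64);	/* TS-option granularity */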
RB_HEAD(rack_rb_tree_head, rack_sendmap);
TAILQ_HEAD(rack_head, rack_sendmap);
TAILQ_HEAD(def_opt_head, deferred_opt_list);
/* Map change logging */
#define MAP_MERGE 0x01
#define MAP_SPLIT 0x02
#define MAP_NEW 0x03
#define MAP_SACK_M1 0x04
#define MAP_SACK_M2 0x05
#define MAP_SACK_M3 0x06
#define MAP_SACK_M4 0x07
#define MAP_SACK_M5 0x08
#define MAP_FREE 0x09
#define MAP_TRIM_HEAD 0x0a
#define RACK_LIMIT_TYPE_SPLIT 1
@ -128,10 +185,7 @@ struct rack_log {
#define RACK_TO_FRM_DELACK 6
struct rack_opts_stats {
uint64_t tcp_rack_prop_rate;
uint64_t tcp_rack_prop;
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_early_recov;
uint64_t tcp_rack_pace_always;
uint64_t tcp_rack_pace_reduce;
uint64_t tcp_rack_max_seg;
@ -177,6 +231,20 @@ struct rack_opts_stats {
uint64_t tcp_npush;
uint64_t tcp_lscwnd;
uint64_t tcp_profile;
uint64_t tcp_hdwr_rate_cap;
uint64_t tcp_pacing_rate_cap;
uint64_t tcp_pacing_up_only;
uint64_t tcp_use_cmp_acks;
uint64_t tcp_rack_abc_val;
uint64_t tcp_rec_abc_val;
uint64_t tcp_rack_measure_cnt;
uint64_t tcp_rack_delayed_ack;
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
uint64_t tcp_rack_fastrsm_hack;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
};
/* RTT shrink reasons */
@ -247,6 +315,23 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
#define RACK_NUM_FSB_DEBUG 16
struct rack_fast_send_blk {
uint32_t left_to_send;
uint16_t tcp_ip_hdr_len;
uint8_t tcp_flags;
uint8_t hoplimit;
uint8_t *tcp_ip_hdr;
uint32_t recwin;
uint32_t off;
struct tcphdr *th;
struct udphdr *udp;
struct mbuf *m;
uint32_t o_m_len;
uint32_t rfo_apply_push : 1,
unused : 31;
};
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */
@ -255,6 +340,7 @@ struct rack_control {
* tlp_sending Lock(a) */
struct rack_sendmap *rc_resend; /* something we have been asked to
* resend */
struct rack_fast_send_blk fsb; /* The fast-send block */
uint32_t input_pkt;
uint32_t saved_input_pkt;
uint32_t rc_hpts_flags;
@ -268,6 +354,9 @@ struct rack_control {
/* Third Cache line 0x80 */
struct rack_head rc_free; /* Allocation array */
uint64_t last_hw_bw_req;
uint64_t crte_prev_rate;
uint64_t bw_rate_cap;
uint32_t rc_time_last_sent; /* Time we last sent some data and
* logged it Lock(a). */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
@ -342,8 +431,8 @@ struct rack_control {
uint32_t rc_agg_delayed;
uint32_t rc_tlp_rxt_last_time;
uint32_t rc_saved_cwnd;
uint32_t rc_gp_output_ts;
uint32_t rc_gp_cumack_ts;
uint64_t rc_gp_output_ts; /* chg*/
uint64_t rc_gp_cumack_ts; /* chg*/
struct timeval act_rcv_time;
struct timeval rc_last_time_decay; /* SAD time decay happened here */
uint64_t gp_bw;
@ -354,6 +443,7 @@ struct rack_control {
uint64_t last_gp_comp_bw;
uint64_t last_max_bw; /* Our calculated max b/w last */
struct time_filter_small rc_gp_min_rtt;
struct def_opt_head opt_list;
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
@ -370,21 +460,40 @@ struct rack_control {
uint32_t rc_time_of_last_probertt;
uint32_t rc_target_probertt_flight;
uint32_t rc_probertt_sndmax_atexit; /* Highest sent to in probe-rtt */
uint32_t rc_cwnd_at_erec;
uint32_t rc_ssthresh_at_erec;
uint32_t dsack_byte_cnt;
uint32_t retran_during_recovery;
uint32_t rc_gp_lowrtt; /* Lowest rtt seen during GPUT measurement */
uint32_t rc_gp_high_rwnd; /* Highest rwnd seen during GPUT measurement */
uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occured what was snd-max */
uint32_t rc_out_at_rto;
int32_t rc_scw_index;
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint32_t rc_last_timeout_snduna;
uint32_t challenge_ack_ts;
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
* set by the user, if pacing is not happening
* (i.e. its early and we have not turned on yet
* or it was turned off). The minute pacing
* is turned on we pull out the values currently
* being used by newreno and replace them with
* these values, then save off the old values here,
* we also set the flag (if ecn_beta is set) to make
* new_reno do less of a backoff for ecn (think abe).
*/
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_avg; /* average count before we go to normal decay */
uint8_t rc_prop_rate; /* Socket option value Lock(a) */
uint8_t rc_prop_reduce; /* Socket option value Lock(a) */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
uint8_t req_measurements; /* How many measurements are required? */
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_early_recovery; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_min_to; /* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
uint8_t rc_gp_hist_idx;
};
@ -402,21 +511,57 @@ struct tcp_rack {
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint32_t rc_free_cnt; /* Number of free entries on the rc_free list
uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
uint8_t client_bufferlvl; /* 0 - 5 normaly, less than or at 2 means its real low */
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
fast_rsm_hack: 1,
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
* and an ACK will send data. Default is off and its only used
* without pacing when we are doing 5G speed up for there
* ack filtering.
*/
rc_pacing_cc_set: 1, /*
* If we are pacing (pace_always=1) and we have reached the
* point where we start pacing (fixed or gp has reached its
* magic gp_ready state) this flag indicates we have set in
* values to effect CC's backoff's. If pacing is turned off
* then we must restore the values saved in rc_saved_beta,
* if its going to gp_ready we need to copy the values into
* the CC module and set our flags.
*
* Note this only happens if the cc name is newreno (CCALGONAME_NEWRENO).
*/
avail :2;
uint8_t avail_bytes;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
r_limit_scw : 1,
r_avail_bits : 10; /* Available */
r_must_retran : 1, /* For non-sack customers we hit an RTO and new data should be resends */
r_use_cmp_ack: 1, /* Do we use compressed acks */
r_ent_rec_ns: 1, /* We entered recovery and have not sent */
r_might_revert: 1, /* Flag to find out if we might need to revert */
r_fast_output: 1, /* Fast output is in progress we can skip the bulk of rack_output */
r_fsb_inited: 1,
r_rack_hw_rate_caps: 1,
r_up_only: 1,
r_via_fill_cw : 1,
r_fill_less_agg : 1;
uint16_t rc_user_set_max_segs; /* Socket option value Lock(a) */
uint8_t rc_user_set_max_segs; /* Socket option value Lock(a) */
uint8_t rc_labc; /* Appropriate Byte Counting Value */
uint16_t forced_ack : 1,
rc_gp_incr : 1,
rc_gp_bwred : 1,
rc_gp_timely_inc_cnt : 3,
rc_gp_timely_dec_cnt : 3,
rc_not_backing_off: 1,
r_use_labc_for_rec: 1,
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_dack_mode : 1, /* Mac O/S emulation of d-ack */
@ -435,7 +580,7 @@ struct tcp_rack {
rc_always_pace : 1, /* Socket option value Lock(a) */
rc_pace_to_cwnd : 1,
rc_pace_fill_if_rttin_range : 1,
xxx_avail_bits : 1;
rc_srtt_measure_made : 1;
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,

View File

@ -193,6 +193,16 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps,
&tcp_sad_low_pps, 100,
"What is the input pps that below which we do not decay?");
#endif
uint32_t tcp_ack_war_time_window = 1000;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
CTLFLAG_RW,
&tcp_ack_war_time_window, 1000,
"If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
uint32_t tcp_ack_war_cnt = 5;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
CTLFLAG_RW,
&tcp_ack_war_cnt, 5,
"If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
struct rwlock tcp_function_lock;
@ -268,6 +278,18 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_R
&VNET_NAME(tcp_ts_offset_per_conn), 0,
"Initialize TCP timestamps per connection instead of per host pair");
/* How many connections are pacing */
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;
static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
&tcp_pacing_limit, 1000,
"If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
&shadow_num_connections, 0, "Number of TCP connections being paced");
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
@ -3511,6 +3533,54 @@ tcp_maxseg(const struct tcpcb *tp)
return (tp->t_maxseg - optlen);
}
u_int
tcp_fixed_maxseg(const struct tcpcb *tp)
{
int optlen;
if (tp->t_flags & TF_NOOPT)
return (tp->t_maxseg);
/*
* Here we have a simplified code from tcp_addoptions(),
* without a proper loop, and having most of paddings hardcoded.
* We only consider fixed options that we would send every
* time I.e. SACK is not considered. This is important
* for cc modules to figure out what the modulo of the
* cwnd should be.
*/
#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
if (tp->t_flags & TF_RCVD_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = 0;
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
} else {
if (tp->t_flags & TF_REQ_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = PAD(TCPOLEN_MAXSEG);
if (tp->t_flags & TF_REQ_SCALE)
optlen += PAD(TCPOLEN_WINDOW);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
if (tp->t_flags & TF_SACK_PERMIT)
optlen += PAD(TCPOLEN_SACK_PERMITTED);
}
#undef PAD
optlen = min(optlen, TCP_MAXOLEN);
return (tp->t_maxseg - optlen);
}
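
Worked example (illustrative): an established connection that negotiated timestamps and nothing else has optlen = TCPOLEN_TSTAMP_APPA = 12 bytes, so with t_maxseg = 1460 a cc module doing its cwnd modulo arithmetic sees a fixed segment size of 1448 bytes.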
static int
sysctl_drop(SYSCTL_HANDLER_ARGS)
{
@ -3972,3 +4042,38 @@ tcp_log_end_status(struct tcpcb *tp, uint8_t status)
}
}
}
int
tcp_can_enable_pacing(void)
{
if ((tcp_pacing_limit == -1) ||
(tcp_pacing_limit > number_of_tcp_connections_pacing)) {
atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
shadow_num_connections = number_of_tcp_connections_pacing;
return (1);
} else {
return (0);
}
}
static uint8_t tcp_pacing_warning = 0;
void
tcp_decrement_paced_conn(void)
{
uint32_t ret;
ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
shadow_num_connections = number_of_tcp_connections_pacing;
KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?"));
if (ret == 0) {
if (tcp_pacing_limit != -1) {
printf("Warning all pacing is now disabled, count decrements invalidly!\n");
tcp_pacing_limit = 0;
} else if (tcp_pacing_warning == 0) {
printf("Warning pacing count is invalid, invalid decrement\n");
tcp_pacing_warning = 1;
}
}
}
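
A sketch of the expected pairing (struct example_stack and its flag are hypothetical): reserve a pacing slot before enabling pacing, and return it exactly once when the connection stops pacing or is destroyed:

static void
example_start_pacing(struct example_stack *st)
{
	/* 0 means we are over tcp_pacing_limit; keep running unpaced. */
	st->pacing_enabled = tcp_can_enable_pacing();
}

static void
example_stop_pacing(struct example_stack *st)
{
	if (st->pacing_enabled) {
		tcp_decrement_paced_conn();
		st->pacing_enabled = 0;
	}
}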

View File

@ -258,6 +258,10 @@ struct tcpcb {
tcp_seq gput_seq; /* Outbound measurement seq */
tcp_seq gput_ack; /* Inbound measurement ack */
int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */
uint32_t t_maxpeakrate; /* max peak rate set by user, in bytes/s */
uint32_t t_sndtlppack; /* tail loss probe packets sent */
uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
uint32_t t_end_info_status; /* Status flag of end info */
unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */
@ -974,6 +978,7 @@ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
void cc_conn_init(struct tcpcb *tp);
void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos);
void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos);
void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
#ifdef TCP_HHOOK
void hhook_run_tcp_est_in(struct tcpcb *tp,
@ -1022,10 +1027,13 @@ extern int32_t tcp_sad_low_pps;
extern int32_t tcp_map_minimum;
extern int32_t tcp_attack_on_turns_on_logging;
#endif
extern uint32_t tcp_ack_war_time_window;
extern uint32_t tcp_ack_war_cnt;
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
u_int tcp_maxseg(const struct tcpcb *);
u_int tcp_fixed_maxseg(const struct tcpcb *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
struct tcp_ifcap *);
void tcp_mss(struct tcpcb *, int);
@ -1075,6 +1083,7 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *);
tcp_seq tcp_new_isn(struct in_conninfo *);
int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
int tcp_dsack_block_exists(struct tcpcb *);
void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
void tcp_clean_dsack_blocks(struct tcpcb *tp);
@ -1090,6 +1099,9 @@ uint32_t tcp_compute_initwnd(uint32_t);
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
int tcp_can_enable_pacing(void);
void tcp_decrement_paced_conn(void);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);