This commit brings in a new, refactored TCP stack called Rack.

Rack includes the following features:
 - A different SACK processing scheme (the old sack structures are not used).
 - RACK (Recent ACKnowledgment) where counting dup-acks is no longer done;
        instead time is used to know when to retransmit. (see the I-D)
 - TLP (Tail Loss Probe) where we probe for tail losses so that we
        can avoid taking a retransmit time-out. (see the I-D)
 - Burst mitigation using TCPHPTS
 - PRR (Proportional Rate Reduction) see RFC 6937.

Once built into your kernel, you can select this stack either via a
socket option, using the stack name "rack", or by setting the global
sysctl so that the default stack is rack.
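As a minimal sketch of the per-socket selection (fd is assumed to be
an already-created TCP socket; struct tcp_function_set and
TCP_FUNCTION_BLK come from <netinet/tcp.h>, and error handling is
abbreviated):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <stdio.h>

static int
use_rack_stack(int fd)
{
	struct tcp_function_set tfs;

	memset(&tfs, 0, sizeof(tfs));
	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
	if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs,
	    sizeof(tfs)) == -1) {
		perror("setsockopt(TCP_FUNCTION_BLK)");
		return (-1);
	}
	return (0);
}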

Note that any connection that does not support SACK will be kicked
back to the base FreeBSD stack (currently known as "default").

To build this into your kernel you will need to add to your kernel
config:
   makeoptions WITH_EXTRA_TCP_STACKS=1
   options TCPHPTS

Sponsored by:	Netflix Inc.
Differential Revision:		https://reviews.freebsd.org/D15525
Randall Stewart 2018-06-07 18:18:13 +00:00
parent ce024bdc0c
commit 89e560f441
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=334804
19 changed files with 10766 additions and 25 deletions


@ -1283,6 +1283,55 @@ sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
return (ret);
}
struct mbuf *
sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff)
{
struct mbuf *m;
KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
*moff = off;
if (sb->sb_sndptr == NULL) {
sb->sb_sndptr = sb->sb_mb;
sb->sb_sndptroff = 0;
}
return (sb->sb_mb);
} else {
m = sb->sb_sndptr;
off -= sb->sb_sndptroff;
}
*moff = off;
return (m);
}
void
sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len)
{
/*
* A small copy was done; advance the sb_sndptr forward to cover
* it.
*/
struct mbuf *m;
if (mb != sb->sb_sndptr) {
/* Did not copyout at the same mbuf */
return;
}
m = mb;
while (m && (len > 0)) {
if (len >= m->m_len) {
len -= m->m_len;
if (m->m_next) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
m = m->m_next;
} else {
len = 0;
}
}
}
/*
* Return the first mbuf and the mbuf data offset for the provided
* send offset without changing the "sb_sndptroff" field.
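The no-advance/advance pair above is intended for stacks that may
probe the send buffer before committing to a send. A hedged usage
sketch (the caller-supplied names here are illustrative, not from the
commit):

static void
copy_then_advance(struct sockbuf *sb, uint32_t off, uint32_t copied)
{
	struct mbuf *m;
	uint32_t moff;

	/* Locate the data at 'off' without moving the cached sb_sndptr. */
	m = sbsndptr_noadv(sb, off, &moff);
	/* ... copy 'copied' bytes starting at (m, moff) ... */
	/* Advance the cached pointer only by what was actually consumed. */
	sbsndptr_adv(sb, m, copied);
}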


@ -7,10 +7,12 @@ SYSDIR?=${SRCTOP}/sys
SUBDIR= \
${_tcp_fastpath} \
${_tcp_rack} \
${_tcpmd5} \
.if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES)
_tcp_fastpath= fastpath
_tcp_rack= rack
.endif
.if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \


@ -0,0 +1,24 @@
#
# $FreeBSD$
#
.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
STACKNAME= rack
KMOD= tcp_${STACKNAME}
SRCS= rack.c sack_filter.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
SRCS+= opt_kern_tls.h
#
# Enable full debugging
#
#CFLAGS += -g
CFLAGS+= -DMODNAME=${KMOD}
CFLAGS+= -DSTACKNAME=${STACKNAME}
CFLAGS+= -DSTACKALIAS=rack_18q21
.include <bsd.kmod.mk>


@ -176,6 +176,7 @@ struct tcphdr {
device */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keepalives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
@ -184,6 +185,61 @@ struct tcphdr {
#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/* Options for Rack and BBR */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max segments in a pace */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP threshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
#define TCP_BBR_PACE_DEL_TAR 1087
#define TCP_BBR_PACE_SEG_MAX 1088
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshold */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
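As a hedged illustration of how the per-socket knobs above are used
(fd and the chosen value are assumptions, not from the commit),
setting the minimum time between rack timeouts looks like any other
TCP-level socket option:

int min_to = 2;	/* milliseconds; illustrative value */

if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_MIN_TO,
    &min_to, sizeof(min_to)) == -1)
	perror("setsockopt(TCP_RACK_MIN_TO)");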


@ -94,7 +94,7 @@ struct tcp_log_bbr {
uint16_t flex7;
uint8_t bbr_state;
uint8_t bbr_substate;
uint8_t inpacer;
uint8_t inhpts;
uint8_t ininput;
uint8_t use_lt_bw;
uint8_t flex8;


@ -143,18 +143,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_R
tcp_timer_active((tp), TT_PERSIST), \
("neither rexmt nor persist timer is set"))
#ifdef TCP_HHOOK
static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
static void inline cc_after_idle(struct tcpcb *tp);
#ifdef TCP_HHOOK
/*
* Wrapper for the TCP established output helper hook.
*/
static void inline
void
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
struct tcpopt *to, uint32_t len, int tso)
{
@ -1851,6 +1846,144 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
return (optlen);
}
/*
* This is a copy of m_copym(), taking the TSO segment size/limit
* constraints into account, and advancing the sndptr as it goes.
*/
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb)
{
struct mbuf *n, **np;
struct mbuf *top;
int32_t off = off0;
int32_t len = *plen;
int32_t fragsize;
int32_t len_cp = 0;
int32_t *pkthdrlen;
uint32_t mlen, frags;
bool copyhdr;
KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off));
KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len));
if (off == 0 && m->m_flags & M_PKTHDR)
copyhdr = true;
else
copyhdr = false;
while (off > 0) {
KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain"));
if (off < m->m_len)
break;
off -= m->m_len;
if ((sb) && (m == sb->sb_sndptr)) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
m = m->m_next;
}
np = &top;
top = NULL;
pkthdrlen = NULL;
while (len > 0) {
if (m == NULL) {
KASSERT(len == M_COPYALL,
("tcp_m_copym, length > size of mbuf chain"));
*plen = len_cp;
if (pkthdrlen != NULL)
*pkthdrlen = len_cp;
break;
}
mlen = min(len, m->m_len - off);
if (seglimit) {
/*
* For M_NOMAP mbufs, add 3 segments
* + 1 in case we are crossing page boundaries
* + 2 in case the TLS hdr/trailer are used
* It is cheaper to just add the segments
* than it is to take the cache miss to look
* at the mbuf ext_pgs state in detail.
*/
if (m->m_flags & M_NOMAP) {
fragsize = min(segsize, PAGE_SIZE);
frags = 3;
} else {
fragsize = segsize;
frags = 0;
}
/* Break if we really can't fit anymore. */
if ((frags + 1) >= seglimit) {
*plen = len_cp;
if (pkthdrlen != NULL)
*pkthdrlen = len_cp;
break;
}
/*
* If we can't copy the whole mbuf, reduce
* the size we take from it and also adjust
* len so the loop will end after this
* mbuf.
*/
if ((frags + howmany(mlen, fragsize)) >= seglimit) {
mlen = (seglimit - frags - 1) * fragsize;
len = mlen;
*plen = len_cp + len;
if (pkthdrlen != NULL)
*pkthdrlen = *plen;
}
frags += howmany(mlen, fragsize);
if (frags == 0)
frags++;
seglimit -= frags;
KASSERT(seglimit > 0,
("%s: seglimit went too low", __func__));
}
if (copyhdr)
n = m_gethdr(M_NOWAIT, m->m_type);
else
n = m_get(M_NOWAIT, m->m_type);
*np = n;
if (n == NULL)
goto nospace;
if (copyhdr) {
if (!m_dup_pkthdr(n, m, M_NOWAIT))
goto nospace;
if (len == M_COPYALL)
n->m_pkthdr.len -= off0;
else
n->m_pkthdr.len = len;
pkthdrlen = &n->m_pkthdr.len;
copyhdr = false;
}
n->m_len = mlen;
len_cp += n->m_len;
if (m->m_flags & M_EXT) {
n->m_data = m->m_data + off;
mb_dupcl(n, m);
} else
bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
(u_int)n->m_len);
if (sb && (sb->sb_sndptr == m) &&
((n->m_len + off) >= m->m_len) && m->m_next) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
off = 0;
if (len != M_COPYALL) {
len -= n->m_len;
}
m = m->m_next;
np = &n->m_next;
}
return (top);
nospace:
m_freem(top);
return (NULL);
}
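/*
 * Worked example of the accounting above (illustrative numbers; a
 * plain mapped mbuf, so frags starts at 0): copying mlen = 4096 bytes
 * with fragsize = segsize = 1448 needs howmany(4096, 1448) = 3
 * segments. With seglimit = 3 remaining, frags + 3 >= seglimit, so
 * mlen is clipped to (3 - 0 - 1) * 1448 = 2896 bytes (two segments)
 * and len is adjusted so the loop ends after this mbuf.
 */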
void
tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
{


@ -2392,7 +2392,7 @@ struct tcp_function_block __tcp_fastack = {
static int
tcp_addfastpaths(module_t mod, int type, void *data)
{
int err=0;
int err = 0;
switch (type) {
case MOD_LOAD:

File diff suppressed because it is too large.


@ -0,0 +1,70 @@
#ifndef __pacer_timer_h__
#define __pacer_timer_h__
/*-
* Copyright (c) 2017
* Netflix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* __FBSDID("$FreeBSD$");
*/
/* Common defines and such used by both RACK and BBR */
/* Special values for mss accounting array */
#define TCP_MSS_ACCT_JUSTRET 0
#define TCP_MSS_ACCT_SNDACK 1
#define TCP_MSS_ACCT_PERSIST 2
#define TCP_MSS_ACCT_ATIMER 60
#define TCP_MSS_ACCT_INPACE 61
#define TCP_MSS_ACCT_LATE 62
#define TCP_MSS_SMALL_SIZE_OFF 63 /* Point where small sizes enter */
#define TCP_MSS_ACCT_SIZE 70
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
/* Magic flags to tell what's cooking on the pacing wheel */
#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
#define PACE_TMR_RACK 0x02 /* RACK timer running */
#define PACE_TMR_TLP 0x04 /* TLP timer running */
#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
/* Magic flags for tracing progress events */
#define PROGRESS_DROP 1
#define PROGRESS_UPDATE 2
#define PROGRESS_CLEAR 3
#define PROGRESS_START 4
/* RTT sample methods */
#define USE_RTT_HIGH 0
#define USE_RTT_LOW 1
#define USE_RTT_AVG 2
#ifdef _KERNEL
/* We have only 7 bits in rack so assert it is true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
#endif
#endif


@ -0,0 +1,706 @@
/*-
* Copyright (c) 2017
* Netflix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <sys/sockopt.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#ifndef _KERNEL
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <limits.h>
#include <getopt.h>
#endif
#include "sack_filter.h"
/*
* Sack filter is used to filter out sacks
* that have already been processed. The idea
* is pretty simple really, consider two sacks
*
* SACK 1
* cum-ack A
* sack B - C
* SACK 2
* cum-ack A
* sack D - E
* sack B - C
*
* The previous sack information (B-C) is repeated
* in SACK 2. If the receiver gets SACK 1 and then
* SACK 2 then any work associated with B-C has already
* been completed. This only affects where we may have
* (as in bbr or rack) cases where we walk a linked list.
*
* Now the utility tries to keep everything in a single
* cache line. This means that it's not perfect and
* it could be that such big sacks come in that a
* "remembered" processed sack falls off the list and
* so gets re-processed. That's ok, it just means we
* did some extra work. We could of course take more
* cache line hits by expanding the size of this
* structure, but then that would cost more.
*/
#ifndef _KERNEL
int detailed_dump = 0;
uint64_t cnt_skipped_oldsack = 0;
uint64_t cnt_used_oldsack = 0;
int highest_used=0;
int over_written=0;
int empty_avail=0;
int no_collapse = 0;
FILE *out = NULL;
FILE *in = NULL;
#endif
#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
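/*
 * Note: sack_blk_set() and sack_blk_clr() return the updated bit mask
 * rather than modifying sf_bits in place; callers assign the result
 * back, as in sf->sf_bits = sack_blk_set(sf, i).
 */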
#ifndef _KERNEL
static
#endif
void
sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
{
sf->sf_ack = seq;
sf->sf_bits = 0;
sf->sf_cur = 0;
sf->sf_used = 0;
}
/*
* Given a previous sack filter block, filter out
* any entries where the cum-ack moves over them
* fully or partially.
*/
static void
sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
{
int32_t i;
/* start with the oldest */
for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i)) {
if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
/* This block is consumed */
sf->sf_bits = sack_blk_clr(sf, i);
sf->sf_used--;
} else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
/* Some of it is acked */
sf->sf_blks[i].start = th_ack;
/* We could in theory break here, but
* there are some broken implementations
* that send multiple blocks. We want
* to catch them all with similar seq's.
*/
}
}
}
sf->sf_ack = th_ack;
}
/*
* Return true if you find that
* the sackblock b is on the score
* board. Update it along the way
* if part of it is on the board.
*/
static int32_t
is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
{
int32_t i, cnt;
for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
if (sack_blk_used(sf, i)) {
if (SEQ_LT(b->start, sf->sf_ack)) {
/* Behind cum-ack update */
b->start = sf->sf_ack;
}
if (SEQ_LT(b->end, sf->sf_ack)) {
/* End back behind too */
b->end = sf->sf_ack;
}
if (b->start == b->end)
return(1);
/* Jonathan's Rule 1 */
if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
/**
* Our board has this entirely in
* whole or in part:
*
* board |-------------|
* sack |-------------|
* <or>
* board |-------------|
* sack |----|
*
*/
return(1);
}
/* Jonathan's Rule 2 */
if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
/**
* Not near each other:
*
* board |---|
* sack |---|
*/
goto nxt_blk;
}
/* Jonathan's Rule 3 */
if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
/**
* Not near each other:
*
* board |---|
* sack |---|
*/
goto nxt_blk;
}
if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
/**
* The board block overlaps the start of
* this one (we already have part of it):
*
* board |--------|
* sack |----------|
* <or>
* board |--------|
* sack |--------------|
*
* 1) Move the start of this block up to the
* board block's end, and
* 2) Extend the board block to this block's end.
*/
b->start = sf->sf_blks[i].end;
sf->sf_blks[i].end = b->end;
goto nxt_blk;
}
if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
/**
* The board block overlaps the end of
* this one:
*
* board |--------|
* sack |----------|
* <or>
* board |----|
* sack |----------|
* 1) Move the end of this block down to the
* board block's start, and
* 2) Extend the board block down to this block's start.
*/
b->end = sf->sf_blks[i].start;
sf->sf_blks[i].start = b->start;
goto nxt_blk;
}
}
nxt_blk:
i++;
i %= SACK_FILTER_BLOCKS;
}
/* Did we totally consume it in pieces? */
if (b->start != b->end)
return(0);
else
return(1);
}
static int32_t
sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
{
int32_t num, i;
struct sackblk blkboard[TCP_MAX_SACK];
/*
* An old sack has arrived. It may contain data
* we do not have. We might not have it since
* we could have had a lost ack <or> we might have the
* entire thing on our current board. We want to prune
* off anything we have. With this function though we
* won't add to the board.
*/
for( i = 0, num = 0; i<numblks; i++ ) {
if (is_sack_on_board(sf, &in[i])) {
#ifndef _KERNEL
cnt_skipped_oldsack++;
#endif
continue;
}
/* Did not find it (or found only
* a piece of it). Copy it to
* our outgoing board.
*/
memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
#ifndef _KERNEL
cnt_used_oldsack++;
#endif
num++;
}
if (num) {
memcpy(in, blkboard, (num * sizeof(struct sackblk)));
}
return (num);
}
/*
* Given idx its used but there is space available
* move the entry to the next free slot
*/
static void
sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
{
int32_t i, cnt;
i = (idx + 1) % SACK_FILTER_BLOCKS;
for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
if (sack_blk_used(sf, i) == 0) {
memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
sf->sf_bits = sack_blk_clr(sf, idx);
sf->sf_bits = sack_blk_set(sf, i);
return;
}
i++;
i %= SACK_FILTER_BLOCKS;
}
}
static int32_t
sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
{
struct sackblk blkboard[TCP_MAX_SACK];
int32_t num, i;
/*
* First lets trim the old and possibly
* throw any away we have.
*/
for(i=0, num=0; i<numblks; i++) {
if (is_sack_on_board(sf, &in[i]))
continue;
memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
num++;
}
if (num == 0)
return(num);
/* Now what we are left is either
* completely merged on to the board
* from the above steps, or are new
* and need to be added to the board
* with the last one updated to current.
*
* First copy it out we want to return that
* to our caller for processing.
*/
memcpy(in, blkboard, (num * sizeof(struct sackblk)));
numblks = num;
/* Now go through and add to our board as needed */
for(i=(num-1); i>=0; i--) {
if (is_sack_on_board(sf, &blkboard[i]))
continue;
/* Add this guy its not listed */
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
if ((sack_blk_used(sf, sf->sf_cur)) &&
(sf->sf_used < SACK_FILTER_BLOCKS)) {
sack_move_to_empty(sf, sf->sf_cur);
}
#ifndef _KERNEL
if (sack_blk_used(sf, sf->sf_cur)) {
over_written++;
if (sf->sf_used < SACK_FILTER_BLOCKS)
empty_avail++;
}
#endif
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
if (sack_blk_used(sf, sf->sf_cur) == 0) {
sf->sf_used++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
#endif
sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
}
}
return(numblks);
}
/*
* Given a sack block on the board (the skip index) see if
* any other used entries overlap or meet, if so return the index.
*/
static int32_t
sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
{
int32_t i;
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i) == 0)
continue;
if (i == skip)
continue;
if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
/**
* The two board blocks meet:
*
* board1 |--------|
* board2 |----------|
* <or>
* board1 |--------|
* board2 |--------------|
* <or>
* board1 |--------|
* board2 |--------|
*/
return(i);
}
if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
/**
* The two board blocks meet at the end:
*
* board1 |--------|
* board2 |----------|
* <or>
* board1 |----|
* board2 |----------|
*/
return(i);
}
}
return (-1);
}
/*
* Collapse entry src into entry into
* and free up the src entry afterwards.
*/
static void
sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
{
if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
/* src has a lower starting point */
sf->sf_blks[into].start = sf->sf_blks[src].start;
}
if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
/* src has a higher ending point */
sf->sf_blks[into].end = sf->sf_blks[src].end;
}
sf->sf_bits = sack_blk_clr(sf, src);
sf->sf_used--;
}
static void
sack_board_collapse(struct sack_filter *sf)
{
int32_t i, j, i_d, j_d;
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i) == 0)
continue;
/*
* Look at all other blocks but this guy
* to see if they overlap. If so we collapse
* the two blocks together.
*/
j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
if (j == -1) {
/* No overlap */
continue;
}
/*
* Ok j and i overlap with each other, collapse the
* one out furthest away from the current position.
*/
if (sf->sf_cur > i)
i_d = sf->sf_cur - i;
else
i_d = i - sf->sf_cur;
if (sf->sf_cur > j)
j_d = sf->sf_cur - j;
else
j_d = j - sf->sf_cur;
if (j_d > i_d) {
sack_collapse(sf, j, i);
} else
sack_collapse(sf, i, j);
}
}
#ifndef _KERNEL
static
#endif
int
sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
{
int32_t i, ret;
if (numblks > TCP_MAX_SACK) {
panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
sf, in,
numblks);
return(numblks);
}
if ((sf->sf_used == 0) && numblks) {
/*
* We are brand new add the blocks in
* reverse order. Note we can see more
* than one in new, since ack's could be lost.
*/
sf->sf_ack = th_ack;
for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
sf->sf_used++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
#endif
}
if (sf->sf_cur)
sf->sf_cur--;
return(numblks);
}
if (SEQ_GT(th_ack, sf->sf_ack)) {
sack_filter_prune(sf, th_ack);
}
if (numblks) {
if (SEQ_GEQ(th_ack, sf->sf_ack)) {
ret = sack_filter_new(sf, in, numblks, th_ack);
} else {
ret = sack_filter_old(sf, in, numblks);
}
} else
ret = 0;
#ifndef _KERNEL
if ((sf->sf_used > 1) && (no_collapse == 0))
sack_board_collapse(sf);
#else
if (sf->sf_used > 1)
sack_board_collapse(sf);
#endif
return (ret);
}
#ifndef _KERNEL
uint64_t saved=0;
uint64_t tot_sack_blks=0;
static void
sack_filter_dump(FILE *out, struct sack_filter *sf)
{
int i;
fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
sf->sf_ack, sf->sf_bits,
sf->sf_cur, sf->sf_used);
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i)) {
fprintf(out, "Entry:%d start:%u end:%u\n", i,
sf->sf_blks[i].start,
sf->sf_blks[i].end);
}
}
}
int
main(int argc, char **argv)
{
char buffer[512];
struct sackblk blks[TCP_MAX_SACK];
FILE *err;
tcp_seq th_ack, snd_una;
struct sack_filter sf;
int32_t numblks,i;
int snd_una_set=0;
double a, b, c;
int invalid_sack_print = 0;
uint32_t chg_remembered=0;
uint32_t sack_chg=0;
char line_buf[10][256];
int line_buf_at=0;
in = stdin;
out = stdout;
while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
switch (i) {
case 'n':
no_collapse = 1;
break;
case 'd':
detailed_dump = 1;
break;
case'I':
invalid_sack_print = 1;
break;
case 'i':
in = fopen(optarg, "r");
if (in == NULL) {
fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
exit(-1);
}
break;
case 'o':
out = fopen(optarg, "w");
if (out == NULL) {
fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
exit(-1);
}
break;
default:
case '?':
case 'h':
fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
return(0);
break;
};
}
sack_filter_clear(&sf, 0);
memset(buffer, 0, sizeof(buffer));
memset(blks, 0, sizeof(blks));
numblks = 0;
fprintf(out, "************************************\n");
while (fgets(buffer, sizeof(buffer), in) != NULL) {
sprintf(line_buf[line_buf_at], "%s", buffer);
line_buf_at++;
if (strncmp(buffer, "QUIT", 4) == 0) {
break;
} else if (strncmp(buffer, "DONE", 4) == 0) {
int nn, ii;
if (numblks) {
uint32_t szof, tot_chg;
for(ii=0; ii<line_buf_at; ii++) {
fprintf(out, "%s", line_buf[ii]);
}
fprintf(out, "------------------------------------\n");
nn = sack_filter_blks(&sf, blks, numblks, th_ack);
saved += numblks - nn;
tot_sack_blks += numblks;
fprintf(out, "ACK:%u\n", sf.sf_ack);
for(ii=0, tot_chg=0; ii<nn; ii++) {
szof = blks[ii].end - blks[ii].start;
tot_chg += szof;
fprintf(out, "SACK:%u:%u [%u]\n",
blks[ii].start,
blks[ii].end, szof);
}
fprintf(out,"************************************\n");
chg_remembered = tot_chg;
if (detailed_dump) {
sack_filter_dump(out, &sf);
fprintf(out,"************************************\n");
}
}
memset(blks, 0, sizeof(blks));
memset(line_buf, 0, sizeof(line_buf));
line_buf_at=0;
numblks = 0;
} else if (strncmp(buffer, "CHG:", 4) == 0) {
sack_chg = strtoul(&buffer[4], NULL, 0);
if ((sack_chg != chg_remembered) &&
(sack_chg > chg_remembered)){
fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
sack_chg, chg_remembered
);
}
sack_chg = chg_remembered = 0;
} else if (strncmp(buffer, "RXT", 3) == 0) {
sack_filter_clear(&sf, snd_una);
} else if (strncmp(buffer, "ACK:", 4) == 0) {
th_ack = strtoul(&buffer[4], NULL, 0);
if (snd_una_set == 0) {
snd_una = th_ack;
snd_una_set = 1;
} else if (SEQ_GT(th_ack, snd_una)) {
snd_una = th_ack;
}
} else if (strncmp(buffer, "EXIT", 4) == 0) {
sack_filter_clear(&sf, snd_una);
sack_chg = chg_remembered = 0;
} else if (strncmp(buffer, "SACK:", 5) == 0) {
char *end=NULL;
uint32_t start;
uint32_t endv;
start = strtoul(&buffer[5], &end, 0);
if (end) {
endv = strtoul(&end[1], NULL, 0);
} else {
fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
continue;
}
if (SEQ_LT(endv, start)) {
fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
continue;
}
if (numblks == TCP_MAX_SACK) {
fprintf(out, "--Exceeded max %d\n", numblks);
exit(0);
}
blks[numblks].start = start;
blks[numblks].end = endv;
numblks++;
}
memset(buffer, 0, sizeof(buffer));
}
if (in != stdin) {
fclose(in);
}
if (out != stdout) {
fclose(out);
}
a = saved * 100.0;
b = tot_sack_blks * 1.0;
if (b > 0.0)
c = a/b;
else
c = 0.0;
if (out != stdout)
err = stdout;
else
err = stderr;
fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);
return(0);
}
#endif
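When compiled without _KERNEL (the #ifndef paths above) this file
doubles as a stand-alone tester that reads commands on stdin: ACK:<seq>
and SACK:<start>:<end> accumulate one scoreboard update, DONE pushes it
through the filter and prints what survives, RXT and EXIT clear the
filter, and QUIT ends the run. A small example session (made-up
sequence numbers; the repeated block should be filtered out of the
second update):

ACK:1000
SACK:2000:3000
DONE
ACK:1000
SACK:2000:3000
SACK:3000:4000
DONE
QUIT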


@ -0,0 +1,58 @@
#ifndef __sack_filter_h__
#define __sack_filter_h__
/*-
* Copyright (c) 2017
* Netflix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* __FBSDID("$FreeBSD$");
*/
/*
* Seven entries are carefully chosen to
* fit in one cache line. We can easily
* change this to 15 (but it gets very
* little extra filtering). To change it
* to be larger than 15 would require either
* sf_bits becoming a uint32_t (and then you
* could go to 31) or changing it to a full
* bitstring. It is really doubtful you
* will get much benefit beyond 7; in testing
* there was a small amount, but very, very small.
*/
#define SACK_FILTER_BLOCKS 7
struct sack_filter {
tcp_seq sf_ack;
uint16_t sf_bits;
uint8_t sf_cur;
uint8_t sf_used;
struct sackblk sf_blks[SACK_FILTER_BLOCKS];
};
#ifdef _KERNEL
void sack_filter_clear(struct sack_filter *sf, tcp_seq seq);
int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack);
#endif
#endif
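A hedged sketch of the intended kernel usage (variable names here are
illustrative, not lifted from rack.c): clear the filter once at
connection setup, then push each arriving set of SACK blocks through
it and only walk the scoreboard for what remains:

struct sack_filter sf;
struct sackblk blks[TCP_MAX_SACK];
int numblks;

sack_filter_clear(&sf, tp->snd_una);	/* once, at setup or after an RTO */
/* ... for each ACK carrying SACK blocks, with th_ack its cum-ack ... */
numblks = sack_filter_blks(&sf, blks, numblks, th_ack);
/* Only the surviving numblks entries in blks[] need scoreboard work. */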


@ -0,0 +1,321 @@
/*-
* Copyright (c) 2016
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _NETINET_TCP_RACK_H_
#define _NETINET_TCP_RACK_H_
#define RACK_ACKED 0x0001/* The remote endpoint acked this */
#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order */
#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */
#define RACK_OVERMAX 0x0008/* We have more retransmissions than we can fit */
#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */
#define RACK_HAS_FIN 0x0040/* segment is sent with fin */
#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */
struct rack_sendmap {
TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time
* sent */
uint8_t r_flags; /* Flags as defined above */
uint8_t r_sndcnt; /* Retran count, not limited by
* RACK_NUM_OF_RETRANS */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
uint8_t r_resv[3];
};
TAILQ_HEAD(rack_head, rack_sendmap);
/*
* We use the rate sample structure to
* assist in single sack/ack rate and rtt
* calculation. In the future we will expand
* this in BBR to do forward rate sample
* b/w estimation.
*/
#define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */
#define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */
struct rack_rtt_sample {
uint32_t rs_flags;
uint32_t rs_rtt_lowest;
uint32_t rs_rtt_highest;
uint32_t rs_rtt_cnt;
uint64_t rs_rtt_tot;
};
#define RACK_LOG_TYPE_ACK 0x01
#define RACK_LOG_TYPE_OUT 0x02
#define RACK_LOG_TYPE_TO 0x03
#define RACK_LOG_TYPE_ALLOC 0x04
#define RACK_LOG_TYPE_FREE 0x05
struct rack_log {
union {
struct rack_sendmap *rsm; /* For alloc/free */
uint64_t sb_acc;/* For out/ack or t-o */
};
uint32_t th_seq;
uint32_t th_ack;
uint32_t snd_una;
uint32_t snd_nxt; /* th_win for TYPE_ACK */
uint32_t snd_max;
uint32_t blk_start[4];
uint32_t blk_end[4];
uint8_t type;
uint8_t n_sackblks;
uint16_t len; /* Timeout T3=1, TLP=2, RACK=3 */
};
/*
* Magic numbers for logging timeout events if the
* logging is enabled.
*/
#define RACK_TO_FRM_TMR 1
#define RACK_TO_FRM_TLP 2
#define RACK_TO_FRM_RACK 3
#define RACK_TO_FRM_KEEP 4
#define RACK_TO_FRM_PERSIST 5
#define RACK_TO_FRM_DELACK 6
struct rack_opts_stats {
uint64_t tcp_rack_prop_rate;
uint64_t tcp_rack_prop;
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_early_recov;
uint64_t tcp_rack_pace_always;
uint64_t tcp_rack_pace_reduce;
uint64_t tcp_rack_max_seg;
uint64_t tcp_rack_prr_sendalot;
uint64_t tcp_rack_min_to;
uint64_t tcp_rack_early_seg;
uint64_t tcp_rack_reord_thresh;
uint64_t tcp_rack_reord_fade;
uint64_t tcp_rack_tlp_thresh;
uint64_t tcp_rack_pkt_delay;
uint64_t tcp_rack_tlp_inc_var;
uint64_t tcp_tlp_use;
uint64_t tcp_rack_idle_reduce;
uint64_t tcp_rack_idle_reduce_high;
uint64_t rack_no_timer_in_hpts;
uint64_t tcp_rack_min_pace_seg;
uint64_t tcp_rack_min_pace;
};
#define TLP_USE_ID 1 /* Internet draft behavior */
#define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */
#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */
#ifdef _KERNEL
#define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t))
extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
#define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm))
#define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1)
#endif
/*
* As we get each SACK we wade through the
* rc_map and mark off what is acked.
* We also increment rc_sacked as well.
*
* We also pay attention to missing entries
* based on the time and possibly mark them
* for retransmit. If we do and we are not already
* in recovery we enter recovery. In doing
* so we clear prr_delivered/holes_rxt and prr_sent_dur_rec.
* We also setup rc_next/rc_snd_nxt/rc_send_end so
* we will know where to send from. When not in
* recovery rc_next will be NULL and rc_snd_nxt should
* equal snd_max.
*
* Whenever we retransmit from recovery we increment
* rc_holes_rxt as we retran a block and mark it as retransmitted
* with the time it was sent. During non-recovery sending we
* add to our map and note the time down of any send expanding
* the rc_map at the tail and moving rc_snd_nxt up with snd_max.
*
* In recovery during SACK/ACK processing if a chunk has
* been retransmitted and it is now acked, we decrement rc_holes_rxt.
* When we retransmit from the scoreboard we use
* rc_next and rc_snd_nxt/rc_send_end to help us
* find what needs to be retran.
*
* To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt
* This gets us the effect of RFC6675 pipe, counting twice for
* bytes retransmitted.
*/
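/*
 * Worked example with illustrative numbers: snd_una = 1000,
 * snd_max = 9000 and rc_holes_rxt = 1448 give
 * pipe = (9000 - 1000) + 1448 = 9448 bytes, i.e. one retransmitted
 * segment still in flight is counted twice.
 */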
#define TT_RACK_FR_TMR 0x2000
/*
* Locking for the rack control block.
* a) Locked by INP_WLOCK
* b) Locked by the hpts-mutex
*
*/
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct rack_head rc_map;/* List of all segments Lock(a) */
struct rack_head rc_tmap; /* List in transmit order Lock(a) */
struct rack_sendmap *rc_tlpsend; /* Remembered place for
* tlp_sending Lock(a) */
struct rack_sendmap *rc_resend; /* something we have been asked to
* resend */
uint32_t rc_hpts_flags;
uint32_t rc_timer_exp; /* If a timer ticks of expiry */
uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */
uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */
/* Third Cache line 0x80 */
struct rack_head rc_free; /* Allocation array */
uint32_t rc_time_last_sent; /* Time we last sent some data and
* logged it Lock(a). */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP
* Lock(a) */
uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */
uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */
uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */
uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */
uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */
uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */
uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done
* since peer spoke to us Lock(a) */
uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent
* rc_last_tlp_seq Lock(a) */
uint32_t rc_loss_count; /* During recovery how many segments were lost
* Lock(a) */
uint32_t rc_reorder_fade; /* Socket option value Lock(a) */
/* Forth cache line 0xc0 */
/* Times */
uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */
uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */
/* Variables to track bad retransmits and recover */
uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */
uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */
uint32_t rc_ssthresh_at;/* ssthresh at the retransmit Lock(a) */
uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we
* have allocated */
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_notused;
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
struct rack_sendmap *rc_sacklast; /* sack remembered place
* Lock(a) */
struct rack_sendmap *rc_next; /* remembered place where we next
* retransmit at Lock(a) */
struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for
* cache line alignment
* Lock(a) */
/* Cache line split 0x100 */
struct sack_filter rack_sf;
/* Cache line split 0x140 */
/* Flags for various things */
struct rack_rtt_sample rack_rs;
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
uint8_t rc_prop_rate; /* Socket option value Lock(a) */
uint8_t rc_prop_reduce; /* Socket option value Lock(a) */
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_early_recovery; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_min_to; /* Socket option value Lock(a) */
uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */
uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */
uint8_t rc_rate_sample_method;
};
#ifdef _KERNEL
struct tcp_rack {
/* First cache line 0x00 */
TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */
int32_t(*r_substate) (struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, struct tcpopt *,
int32_t, int32_t, int32_t *, uint32_t, int, int); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint32_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_wanted_output; /* Output routine wanted to be called */
uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */
uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */
uint16_t rc_pace_reduce;/* Socket option value Lock(a) */
uint8_t r_state; /* Current rack state Lock(a) */
uint8_t rc_tmr_stopped : 7,
t_timers_stopped : 1;
uint8_t rc_enobuf; /* count of enobufs on connection provides
* backoff Lock(a) */
uint8_t r_timer_override : 1, /* hpts override Lock(a) */
r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */
r_is_v6 : 1, /* V6 pcb Lock(a) */
rc_in_persist : 1,
rc_last_pto_set : 1, /* XXX not used */
rc_tlp_in_progress : 1,
rc_always_pace : 1, /* Socket option value Lock(a) */
rc_timer_up : 1; /* The rack timer is up flag Lock(a) */
uint8_t r_idle_reduce_largest : 1,
r_enforce_min_pace : 2,
r_min_pace_seg_thresh : 5;
uint8_t rack_tlp_threshold_use;
uint8_t rc_allow_data_af_clo: 1,
delayed_ack : 1,
rc_avail : 6;
uint8_t r_resv[2]; /* Fill to cache line boundary */
/* Cache line 2 0x40 */
struct rack_control r_ctl;
} __aligned(CACHE_LINE_SIZE);
#endif
#endif


@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
@ -139,7 +140,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
/* max idle probes */
int tcp_maxpersistidle;
static int tcp_rexmit_drop_options = 0;
int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
&tcp_rexmit_drop_options, 0,
"Drop TCP options from 3rd and later retransmitted SYN");
@ -174,18 +175,13 @@ static int per_cpu_timers = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
&per_cpu_timers , 0, "run tcp timers on all cpus");
#if 0
#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif
/*
* Map the given inp to a CPU id.
*
* This queries RSS if it's compiled in, else it defaults to the current
* CPU ID.
*/
static inline int
inline int
inp_to_cpuid(struct inpcb *inp)
{
u_int cpuid;
@ -243,7 +239,7 @@ int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
/*
* TCP timer processing.
@ -948,6 +944,111 @@ tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
return callout_active(t_callout);
}
/*
* Stop the timer from running, and apply a flag
* against the timer_flags that will force the
* timer never to run. The flag is needed to assure
* a race does not leave it running and cause
* the timer to possibly restart itself (keep and persist
* especially do this).
*/
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
struct callout *t_callout;
uint32_t t_flags;
switch (timer_type) {
case TT_DELACK:
t_flags = TT_DELACK_SUS;
t_callout = &tp->t_timers->tt_delack;
break;
case TT_REXMT:
t_flags = TT_REXMT_SUS;
t_callout = &tp->t_timers->tt_rexmt;
break;
case TT_PERSIST:
t_flags = TT_PERSIST_SUS;
t_callout = &tp->t_timers->tt_persist;
break;
case TT_KEEP:
t_flags = TT_KEEP_SUS;
t_callout = &tp->t_timers->tt_keep;
break;
case TT_2MSL:
t_flags = TT_2MSL_SUS;
t_callout = &tp->t_timers->tt_2msl;
break;
default:
panic("tp:%p bad timer_type 0x%x", tp, timer_type);
}
tp->t_timers->tt_flags |= t_flags;
return (callout_stop(t_callout));
}
void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
switch (timer_type) {
case TT_DELACK:
if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
if (tp->t_flags & TF_DELACK) {
/* A delayed ack is pending; activate the timer */
tp->t_flags &= ~TF_DELACK;
tcp_timer_activate(tp, TT_DELACK,
tcp_delacktime);
}
}
break;
case TT_REXMT:
if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
if (SEQ_GT(tp->snd_max, tp->snd_una) &&
(tcp_timer_active((tp), TT_PERSIST) == 0) &&
tp->snd_wnd) {
/* We have outstanding data; activate the rexmt timer */
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
}
}
break;
case TT_PERSIST:
if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
if (tp->snd_wnd == 0) {
/* Activate the persists timer */
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
}
break;
case TT_KEEP:
if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
tcp_timer_activate(tp, TT_KEEP,
TCPS_HAVEESTABLISHED(tp->t_state) ?
TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
}
break;
case TT_2MSL:
if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
if ((tp->t_state == TCPS_FIN_WAIT_2) &&
((tp->t_inpcb->inp_socket == NULL) ||
(tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) {
/* Start the 2MSL timer */
tcp_timer_activate(tp, TT_2MSL,
(tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : TP_MAXIDLE(tp));
}
}
break;
default:
panic("tp:%p bad timer_type 0x%x", tp, timer_type);
}
}
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
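A hedged sketch of the intended pairing (the call sites are
illustrative, not lifted from rack.c): a stack that drives its own
timers off hpts suspends the base-stack callouts it replaces, and
unsuspends them when handing the connection back:

static void
example_take_over_timers(struct tcpcb *tp)
{
	/* Quiesce the base-stack callouts this stack replaces. */
	(void)tcp_timer_suspend(tp, TT_REXMT);
	(void)tcp_timer_suspend(tp, TT_PERSIST);
	(void)tcp_timer_suspend(tp, TT_KEEP);
	(void)tcp_timer_suspend(tp, TT_DELACK);
}

static void
example_hand_back_timers(struct tcpcb *tp)
{
	/* Re-arm whatever the connection state says should run. */
	tcp_timers_unsuspend(tp, TT_REXMT);
	tcp_timers_unsuspend(tp, TT_PERSIST);
	tcp_timers_unsuspend(tp, TT_KEEP);
	tcp_timers_unsuspend(tp, TT_DELACK);
}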


@ -168,11 +168,15 @@ struct tcp_timer {
#define TT_2MSL 0x0010
#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
#define TT_DELACK_RST 0x0100
#define TT_REXMT_RST 0x0200
#define TT_PERSIST_RST 0x0400
#define TT_KEEP_RST 0x0800
#define TT_2MSL_RST 0x1000
/*
* Suspend flags - used when suspending a timer
* from ever running again.
*/
#define TT_DELACK_SUS 0x0100
#define TT_REXMT_SUS 0x0200
#define TT_PERSIST_SUS 0x0400
#define TT_KEEP_SUS 0x0800
#define TT_2MSL_SUS 0x1000
#define TT_STOPPED 0x00010000
@ -196,6 +200,8 @@ extern int tcp_msl;
extern int tcp_ttl; /* time to live for TCP segs */
extern int tcp_backoff[];
extern int tcp_syn_backoff[];
extern int tcp_totbackoff;
extern int tcp_rexmit_drop_options;
extern int tcp_always_keepalive;
extern int tcp_finwait2_timeout;


@ -93,8 +93,11 @@ struct tcpcb {
void *t_fb_ptr; /* Pointer to t_fb specific data */
uint32_t t_maxseg:24, /* maximum segment size */
t_logstate:8; /* State of "black box" logging */
uint32_t t_state:4, /* state of this connection */
bits_spare : 24;
uint32_t t_port:16, /* Tunneling (over udp) port */
t_state:4, /* state of this connection */
t_idle_reduce : 1,
t_delayed_ack: 7, /* Delayed ack variable */
bits_spare : 4;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@ -104,7 +107,7 @@ struct tcpcb {
tcp_seq snd_up; /* send urgent pointer */
uint32_t snd_wnd; /* send window */
uint32_t snd_cwnd; /* congestion-controlled window */
uint32_t cl1_spare; /* Spare to round out CL 1 */
uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */
/* Cache line 2 */
u_int32_t ts_offset; /* our timestamp offset */
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
@ -189,6 +192,7 @@ struct tcpcb {
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
int t_bytes_acked; /* # bytes acked during current RTT */
u_int t_maxunacktime;
u_int t_keepinit; /* time to establish connection */
u_int t_keepidle; /* time before keepalive probes begin */
u_int t_keepintvl; /* interval between keepalives */
@ -361,6 +365,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
#define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */
#define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */
/*
* Structure to hold TCP options that are only used during segment
@ -649,6 +654,11 @@ struct tcp_hhook_data {
int tso;
tcp_seq curack;
};
#ifdef TCP_HHOOK
void hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
#endif
/*
@ -801,6 +811,9 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes)
#define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail)
#define V_tcp_sendspace VNET(tcp_sendspace)
#define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead)
#define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port)
#ifdef TCP_HHOOK
VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
@ -893,9 +906,12 @@ struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
int tcp_timer_suspend(struct tcpcb *, uint32_t);
void tcp_timers_unsuspend(struct tcpcb *, uint32_t);
int tcp_timer_active(struct tcpcb *, uint32_t);
void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
int inp_to_cpuid(struct inpcb *inp);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
*/
@ -921,6 +937,10 @@ void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
int tcp_compute_pipe(struct tcpcb *);
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb);
static inline void
tcp_fields_to_host(struct tcphdr *th)


@ -304,7 +304,7 @@ struct mbuf {
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
#define M_UNUSED_8 0x00000100 /* --available-- */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically


@ -95,6 +95,7 @@
* _NEXT + + + +
* _PREV - + - +
* _LAST - - + +
* _LAST_FAST - - - +
* _FOREACH + + + +
* _FOREACH_FROM + + + +
* _FOREACH_SAFE + + + +
@ -817,6 +818,16 @@ struct { \
#define TAILQ_LAST(head, headname) \
(*(((struct headname *)((head)->tqh_last))->tqh_last))
/*
* The FAST function is fast in that it causes no data access other
* than the access to the head. The standard LAST function above
* will cause a data access of both the element you want and
* the previous element. FAST is very useful for instances when
* you may want to prefetch the last data element.
*/
#define TAILQ_LAST_FAST(head, type, field) \
(TAILQ_EMPTY(head) ? NULL : __containerof((head)->tqh_last, QUEUE_TYPEOF(type), field.tqe_next))
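/*
 * Illustrative use (the rack send-map names are assumptions borrowed
 * from tcp_rack.h above, not part of this header): fetch the last
 * entry while touching only the head:
 *
 *	struct rack_sendmap *rsm;
 *
 *	rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 */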
#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
#define TAILQ_PREV(elm, headname, field) \


@ -165,6 +165,10 @@ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
struct thread *td);
struct mbuf *
sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff);
struct mbuf *
sbsndptr_noadv(struct sockbuf *sb, u_int off, u_int *moff);
void
sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, u_int len);
struct mbuf *
sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff);
int sbwait(struct sockbuf *sb);


@ -289,6 +289,22 @@ tvtosbt(struct timeval _tv)
#endif /* __BSD_VISIBLE */
#ifdef _KERNEL
/*
* Simple macros to convert ticks to milliseconds
* or microseconds and vice-versa. The answer
* will always be at least 1. Note the return
* value is a uint32_t; however, we step the
* operations up to 64 bits to avoid any
* overflow/underflow problems.
*/
#define TICKS_2_MSEC(t) max(1, (uint32_t)(hz == 1000) ? \
(t) : (((uint64_t)(t) * (uint64_t)1000)/(uint64_t)hz))
#define TICKS_2_USEC(t) max(1, (uint32_t)(hz == 1000) ? \
((t) * 1000) : (((uint64_t)(t) * (uint64_t)1000000)/(uint64_t)hz))
#define MSEC_2_TICKS(m) max(1, (uint32_t)((hz == 1000) ? \
(m) : ((uint64_t)(m) * (uint64_t)hz)/(uint64_t)1000))
#define USEC_2_TICKS(u) max(1, (uint32_t)((hz == 1000) ? \
((u) / 1000) : ((uint64_t)(u) * (uint64_t)hz)/(uint64_t)1000000))
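/*
 * Worked example (illustrative): with hz = 250,
 *	MSEC_2_TICKS(100) = max(1, (100 * 250) / 1000) = 25 ticks
 *	TICKS_2_MSEC(25)  = max(1, (25 * 1000) / 250)  = 100 ms
 * and with hz = 1000 both macros reduce to the identity (floored at 1).
 */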
/* Operations on timespecs */
#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0)