This commit brings in a new refactored TCP stack called Rack.

Rack includes the following features:
 - A different SACK processing scheme (the old sack structures are not used).
 - RACK (Recent acknowledgment) where counting dup-acks is no longer done;
        instead time is used to know when to retransmit. (see the I-D)
 - TLP (Tail Loss Probe) where we will probe for tail-losses to attempt
        to try not to take a retransmit time-out. (see the I-D)
 - Burst mitigation using TCPHPTS
 - PRR (Proportional Rate Reduction), see the RFC.

Once built into your kernel, you can select this stack either with the
socket option, giving "rack" as the name of the stack, or by setting
the global sysctl so that the default stack is rack.

Note that any connection that does not support SACK will be kicked
back to the base FreeBSD stack (currently known as "default").

To build this into your kernel you will need to enable in your
kernel:
   makeoptions WITH_EXTRA_TCP_STACKS=1
   options TCPHPTS

Sponsored by:	Netflix Inc.
Differential Revision:		https://reviews.freebsd.org/D15525
This commit is contained in:
Randall Stewart 2018-06-07 18:18:13 +00:00
parent ce024bdc0c
commit 89e560f441
19 changed files with 10766 additions and 25 deletions

View File

@ -1283,6 +1283,55 @@ sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
return (ret);
}
/*
 * Look up the mbuf to start sending from for send offset "off" without
 * moving the cached send pointer forward.
 *
 * If no send pointer is cached yet, it is initialised to the head of the
 * buffer (offset 0).  When the cache is missing or lies beyond "off", the
 * head of the buffer is returned and *moff is the full offset; otherwise
 * the cached mbuf is returned and *moff is "off" relative to it.
 */
struct mbuf *
sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff)
{
	struct mbuf *cached;

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));

	if (sb->sb_sndptr == NULL) {
		/* Nothing cached: report from the head and prime the cache. */
		*moff = off;
		sb->sb_sndptr = sb->sb_mb;
		sb->sb_sndptroff = 0;
		return (sb->sb_mb);
	}
	if (sb->sb_sndptroff > off) {
		/* Cache is past the requested offset: walk from the head. */
		*moff = off;
		return (sb->sb_mb);
	}

	/* Cache is at or before the offset: make the offset relative to it. */
	cached = sb->sb_sndptr;
	*moff = off - sb->sb_sndptroff;
	return (cached);
}
/*
 * After a small copy of "len" bytes that started at mbuf "mb", move the
 * cached send pointer (sb_sndptr / sb_sndptroff) past every mbuf that was
 * fully covered.  Nothing happens unless "mb" is the currently cached
 * send pointer, and the pointer is never advanced off the end of the chain.
 */
void
sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len)
{
	struct mbuf *cur;

	/* Did not copy out starting at the cached mbuf: leave the cache alone. */
	if (mb != sb->sb_sndptr)
		return;

	for (cur = mb; cur != NULL && len > 0; cur = cur->m_next) {
		/* This mbuf was only partially consumed; stop here. */
		if (len < cur->m_len)
			break;
		len -= cur->m_len;
		if (cur->m_next != NULL) {
			sb->sb_sndptroff += cur->m_len;
			sb->sb_sndptr = cur->m_next;
		}
	}
}
/*
* Return the first mbuf and the mbuf data offset for the provided
* send offset without changing the "sb_sndptroff" field.

View File

@ -7,10 +7,12 @@ SYSDIR?=${SRCTOP}/sys
SUBDIR= \
${_tcp_fastpath} \
${_tcp_rack} \
${_tcpmd5} \
.if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES)
_tcp_fastpath= fastpath
_tcp_rack= rack
.endif
.if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \

View File

@ -0,0 +1,24 @@
#
# $FreeBSD$
#
# Kernel module Makefile for the "rack" TCP stack; the module is named
# tcp_${STACKNAME}.
#
# The stack sources are picked up from the shared netinet/tcp_stacks directory.
.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
# STACKNAME feeds both the module name and the -DSTACKNAME define below.
STACKNAME= rack
KMOD= tcp_${STACKNAME}
SRCS= rack.c sack_filter.c
# Option headers listed as sources for this module.
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
SRCS+= opt_kern_tls.h
#
# Enable full debugging
#
#CFLAGS += -g
# Pass the module name, stack name and stack alias to the C sources.
CFLAGS+= -DMODNAME=${KMOD}
CFLAGS+= -DSTACKNAME=${STACKNAME}
CFLAGS+= -DSTACKALIAS=rack_18q21
.include <bsd.kmod.mk>

View File

@ -176,6 +176,7 @@ struct tcphdr {
device */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
@ -184,6 +185,61 @@ struct tcphdr {
#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/*
 * Options for Rack and BBR.
 * Values run from 1051 to 1102; 1091 is not assigned in this block.
 */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max segments in a pace */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP threshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7661 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override hpts settings 0/1/3 */
#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
#define TCP_BBR_PACE_DEL_TAR 1087
#define TCP_BBR_PACE_SEG_MAX 1088
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshold */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR

View File

@ -94,7 +94,7 @@ struct tcp_log_bbr {
uint16_t flex7;
uint8_t bbr_state;
uint8_t bbr_substate;
uint8_t inpacer;
uint8_t inhpts;
uint8_t ininput;
uint8_t use_lt_bw;
uint8_t flex8;

View File

@ -143,18 +143,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_R
tcp_timer_active((tp), TT_PERSIST), \
("neither rexmt nor persist timer is set"))
#ifdef TCP_HHOOK
static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
static void inline cc_after_idle(struct tcpcb *tp);
#ifdef TCP_HHOOK
/*
* Wrapper for the TCP established output helper hook.
*/
static void inline
void
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
struct tcpopt *to, uint32_t len, int tso)
{
@ -1851,6 +1846,144 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
return (optlen);
}
/*
* This is a copy of m_copym(), taking the TSO segment size/limit
* constraints into account, and advancing the sndptr as it goes.
*
* m        - head of the mbuf chain to copy from.
* off0     - byte offset into the chain at which the copy starts.
* plen     - in: number of bytes wanted (may be M_COPYALL);
*            out: rewritten with the number of bytes actually copied
*            whenever the copy is cut short (end of chain reached with
*            M_COPYALL, or the segment limit is hit).
* seglimit - maximum number of segments to use; 0 disables the
*            segment accounting entirely.
* segsize  - segment size used when counting how many fragments an
*            mbuf's data will occupy.
* sb       - optional socket buffer (may be NULL); when given, its
*            sb_sndptr/sb_sndptroff are moved forward past mbufs that
*            this call walks over or fully copies.
*
* Returns the head of the new chain, or NULL if an mbuf or packet
* header could not be allocated (the partial copy is freed first).
*/
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb)
{
struct mbuf *n, **np;
struct mbuf *top;
int32_t off = off0;
int32_t len = *plen;
int32_t fragsize;
int32_t len_cp = 0;
int32_t *pkthdrlen;
uint32_t mlen, frags;
bool copyhdr;
KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off));
KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len));
/* Only duplicate the packet header when copying from the very start. */
if (off == 0 && m->m_flags & M_PKTHDR)
copyhdr = true;
else
copyhdr = false;
/*
* Skip whole mbufs that lie before the requested offset, dragging
* the socket buffer's send pointer along with us.
*/
while (off > 0) {
KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain"));
if (off < m->m_len)
break;
off -= m->m_len;
if ((sb) && (m == sb->sb_sndptr)) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
m = m->m_next;
}
/* np always points at the link where the next new mbuf is attached. */
np = &top;
top = NULL;
/* Set once the copied packet header exists, so its length can be fixed up. */
pkthdrlen = NULL;
while (len > 0) {
if (m == NULL) {
/* Ran off the end of the source; only legal for M_COPYALL. */
KASSERT(len == M_COPYALL,
("tcp_m_copym, length > size of mbuf chain"));
*plen = len_cp;
if (pkthdrlen != NULL)
*pkthdrlen = len_cp;
break;
}
/* Bytes to take from this mbuf: what is left in it, capped by len. */
mlen = min(len, m->m_len - off);
if (seglimit) {
/*
* For M_NOMAP mbufs, add 3 segments
* + 1 in case we are crossing page boundaries
* + 2 in case the TLS hdr/trailer are used
* It is cheaper to just add the segments
* than it is to take the cache miss to look
* at the mbuf ext_pgs state in detail.
*/
if (m->m_flags & M_NOMAP) {
fragsize = min(segsize, PAGE_SIZE);
frags = 3;
} else {
fragsize = segsize;
frags = 0;
}
/* Break if we really can't fit anymore. */
if ((frags + 1) >= seglimit) {
*plen = len_cp;
if (pkthdrlen != NULL)
*pkthdrlen = len_cp;
break;
}
/*
* Reduce size if you can't copy the whole
* mbuf. If we can't copy the whole mbuf, also
* adjust len so the loop will end after this
* mbuf.
*/
if ((frags + howmany(mlen, fragsize)) >= seglimit) {
mlen = (seglimit - frags - 1) * fragsize;
len = mlen;
*plen = len_cp + len;
if (pkthdrlen != NULL)
*pkthdrlen = *plen;
}
/* Charge this mbuf's fragments (at least one) against the limit. */
frags += howmany(mlen, fragsize);
if (frags == 0)
frags++;
seglimit -= frags;
KASSERT(seglimit > 0,
("%s: seglimit went too low", __func__));
}
if (copyhdr)
n = m_gethdr(M_NOWAIT, m->m_type);
else
n = m_get(M_NOWAIT, m->m_type);
/* Link before the NULL check so nospace frees whatever chain exists. */
*np = n;
if (n == NULL)
goto nospace;
if (copyhdr) {
if (!m_dup_pkthdr(n, m, M_NOWAIT))
goto nospace;
if (len == M_COPYALL)
n->m_pkthdr.len -= off0;
else
n->m_pkthdr.len = len;
pkthdrlen = &n->m_pkthdr.len;
copyhdr = false;
}
n->m_len = mlen;
len_cp += n->m_len;
/* External storage is referenced via mb_dupcl(); other data is byte-copied. */
if (m->m_flags & M_EXT) {
n->m_data = m->m_data + off;
mb_dupcl(n, m);
} else
bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
(u_int)n->m_len);
/*
* If this source mbuf was consumed to its end and a successor
* exists, move the send pointer on to that successor.
*/
if (sb && (sb->sb_sndptr == m) &&
((n->m_len + off) >= m->m_len) && m->m_next) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
/* The start offset only applies to the first mbuf copied from. */
off = 0;
if (len != M_COPYALL) {
len -= n->m_len;
}
m = m->m_next;
np = &n->m_next;
}
return (top);
nospace:
/* Allocation failed: release the partial copy. */
m_freem(top);
return (NULL);
}
void
tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
{

View File

@ -2392,7 +2392,7 @@ struct tcp_function_block __tcp_fastack = {
static int
tcp_addfastpaths(module_t mod, int type, void *data)
{
int err=0;
int err = 0;
switch (type) {
case MOD_LOAD:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,70 @@
#ifndef __pacer_timer_h__
#define __pacer_timer_h__
/*-
* Copyright (c) 2017
* Netflix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* __FBSDID("$FreeBSD$");
*/
/* Common defines and such used by both RACK and BBR */
/* Special values for mss accounting array */
#define TCP_MSS_ACCT_JUSTRET 0
#define TCP_MSS_ACCT_SNDACK 1
#define TCP_MSS_ACCT_PERSIST 2
#define TCP_MSS_ACCT_ATIMER 60
#define TCP_MSS_ACCT_INPACE 61
#define TCP_MSS_ACCT_LATE 62
#define TCP_MSS_SMALL_SIZE_OFF 63 /* Point where small sizes enter */
#define TCP_MSS_ACCT_SIZE 70
/* Number of array entries from the small-size offset to the end (70 - 63). */
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
/* Magic flags to tell whats cooking on the pacing wheel */
#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
#define PACE_TMR_RACK 0x02 /* RACK timer running */
#define PACE_TMR_TLP 0x04 /* TLP timer running */
#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
/* All six timer flags together; PACE_PKT_OUTPUT is deliberately not part of it. */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
/* Magic flags for tracing progress events */
#define PROGRESS_DROP 1
#define PROGRESS_UPDATE 2
#define PROGRESS_CLEAR 3
#define PROGRESS_START 4
/* RTT sample methods */
#define USE_RTT_HIGH 0
#define USE_RTT_LOW 1
#define USE_RTT_AVG 2
#ifdef _KERNEL
/*
* We have only 7 bits in rack, so assert at compile time that the
* timer mask never uses bit 0x80.
*/
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
#endif
#endif

View File

@ -0,0 +1,706 @@
/*-
* Copyright (c) 2017
* Netflix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <sys/sockopt.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#ifndef _KERNEL
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <limits.h>
#include <getopt.h>
#endif
#include "sack_filter.h"
/*
* Sack filter is used to filter out sacks
* that have already been processed. The idea
* is pretty simple really, consider two sacks
*
* SACK 1
* cum-ack A
* sack B - C
* SACK 2
* cum-ack A
* sack D - E
* sack B - C
*
* The previous sack information (B-C) is repeated
* in SACK 2. If the receiver gets SACK 1 and then
* SACK 2, then any work associated with B-C has already
* been completed. This only matters in cases (as in
* bbr or rack) where we walk a linked list.
*
* Now the utility tries to keep everything in a single
* cache line. This means that it's not perfect and
* it could be that so many sack blocks arrive that a
* "remembered" processed sack falls off the list and
* so gets re-processed. That's ok, it just means we
* did some extra work. We could of course take more
* cache line hits by expanding the size of this
* structure, but then that would cost more.
*/
#ifndef _KERNEL
/* State and statistics used only by the standalone (non-kernel) test build. */
/* Set by -d: dump the whole filter after each processed ack. */
int detailed_dump = 0;
/* Old-ack sack blocks that were already on the board and were dropped. */
uint64_t cnt_skipped_oldsack = 0;
/* Old-ack sack blocks that were not fully on the board and were passed on. */
uint64_t cnt_used_oldsack = 0;
/* Largest sf_used value seen so far. */
int highest_used=0;
/* Times a new block was stored into a slot that was still marked used. */
int over_written=0;
/* Subset of over_written where the board still had free slots. */
int empty_avail=0;
/* Set by -n: skip sack_board_collapse(). */
int no_collapse = 0;
/* Output and input streams; main() points them at stdout/stdin or -o/-i files. */
FILE *out = NULL;
FILE *in = NULL;
#endif
/*
 * Slot bitmap helpers: bit "i" of sf->sf_bits marks slot "i" as in use.
 * sack_blk_used() tests the bit; sack_blk_set()/sack_blk_clr() return the
 * new bitmap value (the caller assigns it back to sf->sf_bits).
 * Both arguments are parenthesized so that expressions such as "&filter"
 * or "idx & mask" expand correctly.
 */
#define sack_blk_used(sf, i) ((1 << (i)) & (sf)->sf_bits)
#define sack_blk_set(sf, i) ((1 << (i)) | (sf)->sf_bits)
#define sack_blk_clr(sf, i) (~(1 << (i)) & (sf)->sf_bits)
/*
 * Reset the filter to an empty board whose cumulative ack is "seq":
 * no slots in use, bitmap cleared, current index back at slot 0.
 */
#ifndef _KERNEL
static
#endif
void
sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
{
	/* Forget every remembered block ... */
	sf->sf_used = 0;
	sf->sf_bits = 0;
	sf->sf_cur = 0;
	/* ... and restart from the given cum-ack. */
	sf->sf_ack = seq;
}
/*
 * Given a previous sack filter block, filter out
 * any entries where the cum-ack moves over them
 * fully or partially, then record th_ack as the
 * filter's new cum-ack.
 *
 * A block's end is the sequence number just past its last byte, so a
 * block is completely acked as soon as th_ack reaches (not only passes)
 * its end.  Such a block is released; otherwise it would linger as a
 * zero-length entry that still occupies a slot.
 */
static void
sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
{
	int32_t i;

	for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
		if (sack_blk_used(sf, i) == 0)
			continue;
		if (SEQ_GEQ(th_ack, sf->sf_blks[i].end)) {
			/* This block is consumed */
			sf->sf_bits = sack_blk_clr(sf, i);
			sf->sf_used--;
		} else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
			/* Some of it is acked */
			sf->sf_blks[i].start = th_ack;
			/*
			 * We could in theory break here, but
			 * there are some broken implementations
			 * that send multiple blocks. We want
			 * to catch them all with similar seq's.
			 */
		}
	}
	sf->sf_ack = th_ack;
}
/*
* Return true if you find that
* the sackblock b is on the score
* board. Update it along the way
* if part of it is on the board.
*
* The scan visits all SACK_FILTER_BLOCKS slots, starting at sf_cur and
* wrapping around. Both the caller's block "b" and the board entries
* may be modified: "b" is trimmed to the part not yet on the board, and
* a board entry that partially overlaps "b" is grown to cover it.
* Returns 1 when nothing of "b" is left (b->start == b->end), else 0.
*/
static int32_t
is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
{
int32_t i, cnt;
for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
if (sack_blk_used(sf, i)) {
if (SEQ_LT(b->start, sf->sf_ack)) {
/* Behind cum-ack update */
b->start = sf->sf_ack;
}
if (SEQ_LT(b->end, sf->sf_ack)) {
/* End back behind too */
b->end = sf->sf_ack;
}
/* Nothing left of the block once it is clamped to the cum-ack. */
if (b->start == b->end)
return(1);
/* Jonathans Rule 1 */
if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
/**
* Our board has this entirely in
* whole or in part:
*
* board |-------------|
* sack  |-------------|
* <or>
* board |-------------|
* sack      |----|
*
*/
return(1);
}
/* Jonathans Rule 2 */
if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
/**
* Not near each other (board entirely before sack):
*
* board |---|
* sack          |---|
*/
goto nxt_blk;
}
/* Jonathans Rule 3 */
if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
/**
* Not near each other (board entirely after sack):
*
* board         |---|
* sack  |---|
*/
goto nxt_blk;
}
if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
/**
* The board block partial meets:
*
* board |--------|
* sack       |----------|
* <or>
* board |--------|
* sack           |--------------|
*
* up with this one (we have part of it).
* 1) Update the board block to the new end
* and
* 2) Update the start of this block to my end.
*/
b->start = sf->sf_blks[i].end;
sf->sf_blks[i].end = b->end;
goto nxt_blk;
}
if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
/**
* The board block partial meets:
*
* board      |--------|
* sack  |----------|
* <or>
* board            |----|
* sack  |----------|
* 1) Trim the end of this block back to the
*    board block's (old) start
* and
* 2) Update the board block to the new start.
*/
b->end = sf->sf_blks[i].start;
sf->sf_blks[i].start = b->start;
goto nxt_blk;
}
/*
* Falling through here means the board block lies strictly
* inside the sack block; neither is changed.
*/
}
nxt_blk:
i++;
i %= SACK_FILTER_BLOCKS;
}
/* Did we totally consume it in pieces? */
if (b->start != b->end)
return(0);
else
return(1);
}
/*
 * Filter the blocks of an old sack (one whose cum-ack is behind the
 * filter's).  It may carry data we never saw (a lost ack) or only data
 * already on the board.  Blocks fully covered by the board are removed
 * from "in"; the survivors (possibly trimmed) are packed to the front of
 * "in".  Nothing is added to the board here.
 *
 * Returns the number of blocks left in "in".
 */
static int32_t
sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
{
	struct sackblk blkboard[TCP_MAX_SACK];
	int32_t kept = 0;
	int32_t idx;

	for (idx = 0; idx < numblks; idx++) {
		if (is_sack_on_board(sf, &in[idx]) == 0) {
			/*
			 * Not found (or only a piece was found): stage it
			 * for return to the caller.
			 */
			memcpy(&blkboard[kept], &in[idx], sizeof(struct sackblk));
#ifndef _KERNEL
			cnt_used_oldsack++;
#endif
			kept++;
		} else {
#ifndef _KERNEL
			cnt_skipped_oldsack++;
#endif
		}
	}
	if (kept != 0)
		memcpy(in, blkboard, (kept * sizeof(struct sackblk)));
	return (kept);
}
/*
 * Slot "idx" is in use but the board still has room: relocate its
 * contents to the first free slot found when searching forward (with
 * wrap-around) from idx + 1, and mark "idx" free.  If no free slot
 * turns up the board is left untouched.
 */
static void
sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
{
	uint32_t step;
	int32_t slot;

	/* Visit the other SACK_FILTER_BLOCKS - 1 slots exactly once. */
	for (step = 1; step < SACK_FILTER_BLOCKS; step++) {
		slot = (idx + step) % SACK_FILTER_BLOCKS;
		if (sack_blk_used(sf, slot))
			continue;
		memcpy(&sf->sf_blks[slot], &sf->sf_blks[idx], sizeof(struct sackblk));
		sf->sf_bits = sack_blk_clr(sf, idx);
		sf->sf_bits = sack_blk_set(sf, slot);
		return;
	}
}
/*
* Filter the blocks of a sack whose cum-ack is at or ahead of the
* filter's, and remember what is new.
*
* Pass 1 drops every block of "in" that is already fully on the board
* (is_sack_on_board() may trim the others in place) and packs the
* survivors to the front of "in". Pass 2 walks the survivors from last
* to first and stores each one that is still not on the board into the
* slot after sf_cur.
*
* Returns the number of blocks left in "in" for the caller to process.
* The th_ack argument is not referenced by this function.
*/
static int32_t
sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
{
struct sackblk blkboard[TCP_MAX_SACK];
int32_t num, i;
/*
* First lets trim the old and possibly
* throw any away we have.
*/
for(i=0, num=0; i<numblks; i++) {
if (is_sack_on_board(sf, &in[i]))
continue;
memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
num++;
}
if (num == 0)
return(num);
/* Now what we are left is either
* completely merged on to the board
* from the above steps, or are new
* and need to be added to the board
* with the last one updated to current.
*
* First copy it out we want to return that
* to our caller for processing.
*/
memcpy(in, blkboard, (num * sizeof(struct sackblk)));
numblks = num;
/* Now go through and add to our board as needed */
for(i=(num-1); i>=0; i--) {
/*
* The check runs on the scratch copy (which it may trim);
* the block stored below is the caller-visible in[i].
*/
if (is_sack_on_board(sf, &blkboard[i]))
continue;
/* Add this guy its not listed */
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
/* Target slot busy but board not full: move its old content aside. */
if ((sack_blk_used(sf, sf->sf_cur)) &&
(sf->sf_used < SACK_FILTER_BLOCKS)) {
sack_move_to_empty(sf, sf->sf_cur);
}
#ifndef _KERNEL
/* Statistics: the slot is still in use, so it is about to be overwritten. */
if (sack_blk_used(sf, sf->sf_cur)) {
over_written++;
if (sf->sf_used < SACK_FILTER_BLOCKS)
empty_avail++;
}
#endif
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
/* Only a previously free slot changes the use count and bitmap. */
if (sack_blk_used(sf, sf->sf_cur) == 0) {
sf->sf_used++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
#endif
sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
}
}
return(numblks);
}
/*
 * Given a sack block "sb" that lives on the board at index "skip", look
 * for another in-use board entry that overlaps or touches it.
 *
 * Returns the index of the first such entry, or -1 if there is none.
 */
static int32_t
sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
{
	struct sackblk *other;
	int32_t idx;

	for (idx = 0; idx < SACK_FILTER_BLOCKS; idx++) {
		if (idx == skip || sack_blk_used(sf, idx) == 0)
			continue;
		other = &sf->sf_blks[idx];
		/*
		 * "other" starts at or before sb and ends inside sb
		 * (or exactly at one of its edges):
		 *
		 * other |--------|
		 * sb         |----------|
		 * <or>
		 * other |--------|
		 * sb             |--------------|
		 * <or>
		 * other |--------|
		 * sb    |--------|
		 */
		if (SEQ_LEQ(other->start, sb->start) &&
		    SEQ_GEQ(other->end, sb->start) &&
		    SEQ_LEQ(other->end, sb->end))
			return (idx);
		/*
		 * "other" starts inside sb (or exactly at one of its
		 * edges) and ends at or after sb's end:
		 *
		 * other      |--------|
		 * sb    |----------|
		 * <or>
		 * other            |----|
		 * sb    |----------|
		 */
		if (SEQ_GEQ(other->start, sb->start) &&
		    SEQ_LEQ(other->start, sb->end) &&
		    SEQ_GEQ(other->end, sb->end))
			return (idx);
	}
	return (-1);
}
/*
* Collapse entry src into entry into
* and free up the src entry afterwards.
*/
static void
sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
{
if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
/* src has a lower starting point */
sf->sf_blks[into].start = sf->sf_blks[src].start;
}
if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
/* src has a higher ending point */
sf->sf_blks[into].end = sf->sf_blks[src].end;
}
sf->sf_bits = sack_blk_clr(sf, src);
sf->sf_used--;
}
/*
 * Walk the board and, for each in-use entry that overlaps or touches
 * another one, merge the pair.  Of the two, the entry whose index is
 * further from sf_cur is the one folded away; on a tie the entry being
 * visited is folded into its partner.
 */
static void
sack_board_collapse(struct sack_filter *sf)
{
	int32_t blk, partner, blk_dist, partner_dist;

	for (blk = 0; blk < SACK_FILTER_BLOCKS; blk++) {
		if (sack_blk_used(sf, blk) == 0)
			continue;
		/* Look at every other entry for one that overlaps this one. */
		partner = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[blk], blk);
		if (partner == -1)
			continue;

		/* Index distance of each entry from the current position. */
		blk_dist = (sf->sf_cur > blk) ?
		    (sf->sf_cur - blk) : (blk - sf->sf_cur);
		partner_dist = (sf->sf_cur > partner) ?
		    (sf->sf_cur - partner) : (partner - sf->sf_cur);

		if (partner_dist > blk_dist)
			sack_collapse(sf, partner, blk);
		else
			sack_collapse(sf, blk, partner);
	}
}
/*
* Entry point of the filter: run the sack blocks of one incoming ack
* through the board.
*
* sf      - the filter state.
* in      - array of numblks sack blocks; rewritten in place so that on
*           return its first <return value> entries are the blocks the
*           caller still has to process.
* numblks - number of blocks in "in"; more than TCP_MAX_SACK panics.
* th_ack  - cumulative ack carried by the same segment.
*
* Returns the number of blocks left in "in".
*/
#ifndef _KERNEL
static
#endif
int
sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
{
int32_t i, ret;
if (numblks > TCP_MAX_SACK) {
panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
sf, in,
numblks);
return(numblks);
}
if ((sf->sf_used == 0) && numblks) {
/*
* We are brand new add the blocks in
* reverse order. Note we can see more
* than one in new, since ack's could be lost.
*/
sf->sf_ack = th_ack;
for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
sf->sf_used++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
#endif
}
/* Step back so sf_cur names the slot written last, not the next one. */
if (sf->sf_cur)
sf->sf_cur--;
/* Nothing was filtered: every block is new to us. */
return(numblks);
}
/* The cum-ack advanced: drop or trim board entries it now covers. */
if (SEQ_GT(th_ack, sf->sf_ack)) {
sack_filter_prune(sf, th_ack);
}
if (numblks) {
/* Current (or newer) acks may add to the board; older ones only get trimmed. */
if (SEQ_GEQ(th_ack, sf->sf_ack)) {
ret = sack_filter_new(sf, in, numblks, th_ack);
} else {
ret = sack_filter_old(sf, in, numblks);
}
} else
ret = 0;
/* Merge board entries that now overlap (the test build can disable this with -n). */
#ifndef _KERNEL
if ((sf->sf_used > 1) && (no_collapse == 0))
sack_board_collapse(sf);
#else
if (sf->sf_used > 1)
sack_board_collapse(sf);
#endif
return (ret);
}
#ifndef _KERNEL
/* Test-build totals: sack blocks removed by the filter, and sack blocks fed to it. */
uint64_t saved=0;
uint64_t tot_sack_blks=0;
/*
 * Print the filter header fields followed by one line per in-use board
 * entry to the given stream (test build only).
 */
static void
sack_filter_dump(FILE *out, struct sack_filter *sf)
{
	int slot;

	fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
	    sf->sf_ack, sf->sf_bits,
	    sf->sf_cur, sf->sf_used);

	for (slot = 0; slot < SACK_FILTER_BLOCKS; slot++) {
		if (sack_blk_used(sf, slot) == 0)
			continue;
		fprintf(out, "Entry:%d start:%u end:%u\n", slot,
		    sf->sf_blks[slot].start,
		    sf->sf_blks[slot].end);
	}
}
/*
 * Standalone test driver (non-kernel build only).
 *
 * Reads a line-oriented trace from stdin (or -i file) and writes the
 * filtered result to stdout (or -o file).  Recognised line prefixes:
 *   ACK:<n>          cumulative ack for the sacks that follow
 *   SACK:<s>:<e>     one sack block (at most TCP_MAX_SACK per group)
 *   DONE             run the collected blocks through the filter and print
 *   CHG:<n>          compare <n> with the bytes reported by the last DONE
 *   RXT / EXIT       reset the filter to the highest ack seen
 *   QUIT             stop reading
 * Options: -n no collapse, -d detailed dump, -I accepted but unused,
 * -i infile, -o outfile, -h/-? usage.
 *
 * Returns 0; exits with -1 if an input or output file cannot be opened.
 */
int
main(int argc, char **argv)
{
	char buffer[512];
	struct sackblk blks[TCP_MAX_SACK];
	FILE *err;
	/* Start from 0 so RXT/EXIT/DONE before the first ACK: line are well defined. */
	tcp_seq th_ack = 0, snd_una = 0;
	struct sack_filter sf;
	int32_t numblks,i;
	int snd_una_set=0;
	double a, b, c;
	uint32_t chg_remembered=0;
	uint32_t sack_chg=0;
	char line_buf[10][256];
	int line_buf_at=0;

	in = stdin;
	out = stdout;
	while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
		switch (i) {
		case 'n':
			no_collapse = 1;
			break;
		case 'd':
			detailed_dump = 1;
			break;
		case 'I':
			/* Accepted for compatibility; nothing reads this option. */
			break;
		case 'i':
			in = fopen(optarg, "r");
			if (in == NULL) {
				fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
				exit(-1);
			}
			break;
		case 'o':
			out = fopen(optarg, "w");
			if (out == NULL) {
				fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
				exit(-1);
			}
			break;
		default:
		case '?':
		case 'h':
			fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
			return(0);
			break;
		};
	}
	sack_filter_clear(&sf, 0);
	memset(buffer, 0, sizeof(buffer));
	memset(blks, 0, sizeof(blks));
	memset(line_buf, 0, sizeof(line_buf));
	numblks = 0;
	fprintf(out, "************************************\n");
	while (fgets(buffer, sizeof(buffer), in) != NULL) {
		/*
		 * Remember the raw line for echoing at DONE.  The echo buffer
		 * is bounded in both dimensions: lines beyond its capacity
		 * are not echoed and over-long lines are truncated, instead
		 * of writing past the array.
		 */
		if (line_buf_at < (int)(sizeof(line_buf) / sizeof(line_buf[0]))) {
			snprintf(line_buf[line_buf_at], sizeof(line_buf[0]),
			    "%s", buffer);
			line_buf_at++;
		}
		if (strncmp(buffer, "QUIT", 4) == 0) {
			break;
		} else if (strncmp(buffer, "DONE", 4) == 0) {
			int nn, ii;
			if (numblks) {
				uint32_t szof, tot_chg;
				for(ii=0; ii<line_buf_at; ii++) {
					fprintf(out, "%s", line_buf[ii]);
				}
				fprintf(out, "------------------------------------\n");
				nn = sack_filter_blks(&sf, blks, numblks, th_ack);
				saved += numblks - nn;
				tot_sack_blks += numblks;
				fprintf(out, "ACK:%u\n", sf.sf_ack);
				for(ii=0, tot_chg=0; ii<nn; ii++) {
					szof = blks[ii].end - blks[ii].start;
					tot_chg += szof;
					fprintf(out, "SACK:%u:%u [%u]\n",
					    blks[ii].start,
					    blks[ii].end, szof);
				}
				fprintf(out,"************************************\n");
				chg_remembered = tot_chg;
				if (detailed_dump) {
					sack_filter_dump(out, &sf);
					fprintf(out,"************************************\n");
				}
			}
			memset(blks, 0, sizeof(blks));
			memset(line_buf, 0, sizeof(line_buf));
			line_buf_at=0;
			numblks = 0;
		} else if (strncmp(buffer, "CHG:", 4) == 0) {
			sack_chg = strtoul(&buffer[4], NULL, 0);
			if ((sack_chg != chg_remembered) &&
			    (sack_chg > chg_remembered)){
				fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
				    sack_chg, chg_remembered
				    );
			}
			sack_chg = chg_remembered = 0;
		} else if (strncmp(buffer, "RXT", 3) == 0) {
			sack_filter_clear(&sf, snd_una);
		} else if (strncmp(buffer, "ACK:", 4) == 0) {
			th_ack = strtoul(&buffer[4], NULL, 0);
			if (snd_una_set == 0) {
				snd_una = th_ack;
				snd_una_set = 1;
			} else if (SEQ_GT(th_ack, snd_una)) {
				snd_una = th_ack;
			}
		} else if (strncmp(buffer, "EXIT", 4) == 0) {
			sack_filter_clear(&sf, snd_una);
			sack_chg = chg_remembered = 0;
		} else if (strncmp(buffer, "SACK:", 5) == 0) {
			char *end=NULL;
			uint32_t start;
			uint32_t endv;

			start = strtoul(&buffer[5], &end, 0);
			/*
			 * strtoul() always sets "end"; only step over the
			 * separator when there is one, so we never read past
			 * the string terminator.
			 */
			if (end != NULL && *end != '\0') {
				endv = strtoul(&end[1], NULL, 0);
			} else {
				fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
				continue;
			}
			if (SEQ_LT(endv, start)) {
				fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
				continue;
			}
			if (numblks == TCP_MAX_SACK) {
				fprintf(out, "--Exceeded max %d\n", numblks);
				exit(0);
			}
			blks[numblks].start = start;
			blks[numblks].end = endv;
			numblks++;
		}
		memset(buffer, 0, sizeof(buffer));
	}
	/*
	 * Pick the summary stream before closing anything: the summary goes
	 * to stdout when results went to a file, otherwise to stderr.
	 */
	if (out != stdout)
		err = stdout;
	else
		err = stderr;
	if (in != stdin) {
		fclose(in);
	}
	if (out != stdout) {
		fclose(out);
	}
	a = saved * 100.0;
	b = tot_sack_blks * 1.0;
	if (b > 0.0)
		c = a/b;
	else
		c = 0.0;
	/* Counters are uint64_t; cast so the conversion matches on every ABI. */
	fprintf(err, "Saved %llu sack blocks out of %llu (%2.3f%%) old_skip:%llu old_usd:%llu high_cnt:%d ow:%d ea:%d\n",
	    (unsigned long long)saved, (unsigned long long)tot_sack_blks, c,
	    (unsigned long long)cnt_skipped_oldsack,
	    (unsigned long long)cnt_used_oldsack,
	    highest_used, over_written, empty_avail);
	return(0);
}
#endif

View File

@ -0,0 +1,58 @@
#ifndef __sack_filter_h__
#define __sack_filter_h__
/*-
* Copyright (c) 2017
* Netflix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* __FBSDID("$FreeBSD$");
*/
/*
* Seven entries is carefully chosen to
* fit in one cache line. We can easily
* change this to 15 (but it gets very
* little extra filtering). To change it
* to be larger than 15 would require either
* sf_bits becoming a uint32_t and then you
* could go to 31.. or change it to a full
* bitstring.. It is really doubtful you
* will get much benefit beyond 7, in testing
* there was a small amount but very very small.
*/
#define SACK_FILTER_BLOCKS 7
struct sack_filter {
tcp_seq sf_ack;
uint16_t sf_bits;
uint8_t sf_cur;
uint8_t sf_used;
struct sackblk sf_blks[SACK_FILTER_BLOCKS];
};
#ifdef _KERNEL
void sack_filter_clear(struct sack_filter *sf, tcp_seq seq);
int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack);
#endif
#endif

View File

@ -0,0 +1,321 @@
/*-
* Copyright (c) 2016
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _NETINET_TCP_RACK_H_
#define _NETINET_TCP_RACK_H_
/* Flag bits kept in the r_flags member of struct rack_sendmap. */
#define RACK_ACKED 0x0001/* The remote endpoint acked this */
#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order */
#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */
#define RACK_OVERMAX 0x0008/* We have more retransmissions than we can fit */
#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */
#define RACK_HAS_FIN 0x0040/* segment is sent with fin */
#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */
/* Number of slots in the r_tim_lastsent[] array of struct rack_sendmap */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000 /* 1 second in milliseconds */
/*
 * One entry of the RACK send map: a range of sequence space that was
 * sent, together with the time(s) it was transmitted. Each entry carries
 * two tail-queue linkages, one ordered by sequence number (r_next) and
 * one ordered by transmit time (r_tnext).
 */
struct rack_sendmap {
TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; /* Send times, see r_rtr_cnt */
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond the last byte */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint16_t r_rtr_cnt; /* Retran count, index r_tim_lastsent[] with
* this -1 to get time sent */
uint8_t r_flags; /* RACK_* flags as defined above */
uint8_t r_sndcnt; /* Retran count, not limited by
* RACK_NUM_OF_RETRANS */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
uint8_t r_resv[3]; /* Reserved */
};
/* Head type for a tail queue of rack_sendmap entries. */
TAILQ_HEAD(rack_head, rack_sendmap);
/*
 * We use the rate sample structure to
 * assist in single sack/ack rate and rtt
 * calculation. In the future we will expand
 * this in BBR to do forward rate sample
 * b/w estimation.
 */
#define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */
#define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */
/*
 * NOTE(review): the field notes below are inferred from the names only;
 * the units of the RTT values are not visible here -- confirm in the users.
 */
struct rack_rtt_sample {
uint32_t rs_flags; /* presumably RACK_RTT_EMPTY / RACK_RTT_VALID */
uint32_t rs_rtt_lowest; /* presumably the smallest RTT in this sample */
uint32_t rs_rtt_highest; /* presumably the largest RTT in this sample */
uint32_t rs_rtt_cnt; /* presumably the number of RTTs accumulated */
uint64_t rs_rtt_tot; /* presumably the sum of the accumulated RTTs */
};
/* Log record types; presumably the values stored in rack_log.type. */
#define RACK_LOG_TYPE_ACK 0x01
#define RACK_LOG_TYPE_OUT 0x02
#define RACK_LOG_TYPE_TO 0x03
#define RACK_LOG_TYPE_ALLOC 0x04
#define RACK_LOG_TYPE_FREE 0x05
/*
 * One RACK log record. The leading anonymous union is interpreted
 * according to the record type, and some members are reused per type
 * as noted on the individual fields.
 */
struct rack_log {
union {
struct rack_sendmap *rsm; /* For alloc/free */
uint64_t sb_acc;/* For out/ack or t-o */
};
uint32_t th_seq;
uint32_t th_ack;
uint32_t snd_una;
uint32_t snd_nxt; /* th_win for TYPE_ACK */
uint32_t snd_max;
uint32_t blk_start[4]; /* SACK block starts; presumably n_sackblks valid */
uint32_t blk_end[4]; /* SACK block ends; presumably n_sackblks valid */
uint8_t type; /* presumably one of RACK_LOG_TYPE_* */
uint8_t n_sackblks;
uint16_t len; /* For timeouts: T3=1, TLP=2, RACK=3 */
};
/*
* Magic numbers for logging timeout events if the
* logging is enabled.
*/
#define RACK_TO_FRM_TMR 1
#define RACK_TO_FRM_TLP 2
#define RACK_TO_FRM_RACK 3
#define RACK_TO_FRM_KEEP 4
#define RACK_TO_FRM_PERSIST 5
#define RACK_TO_FRM_DELACK 6
/*
 * Statistics block made up solely of uint64_t members.
 *
 * Every member must stay a uint64_t: RACK_OPTS_SIZE and RACK_OPTS_ADD()
 * treat this structure as a flat array of uint64_t slots, turning a
 * member name into an index with offsetof(member) / sizeof(uint64_t).
 *
 * NOTE(review): the member names suggest one counter per RACK socket
 * option -- confirm against the code that calls RACK_OPTS_INC().
 */
struct rack_opts_stats {
uint64_t tcp_rack_prop_rate;
uint64_t tcp_rack_prop;
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_early_recov;
uint64_t tcp_rack_pace_always;
uint64_t tcp_rack_pace_reduce;
uint64_t tcp_rack_max_seg;
uint64_t tcp_rack_prr_sendalot;
uint64_t tcp_rack_min_to;
uint64_t tcp_rack_early_seg;
uint64_t tcp_rack_reord_thresh;
uint64_t tcp_rack_reord_fade;
uint64_t tcp_rack_tlp_thresh;
uint64_t tcp_rack_pkt_delay;
uint64_t tcp_rack_tlp_inc_var;
uint64_t tcp_tlp_use;
uint64_t tcp_rack_idle_reduce;
uint64_t tcp_rack_idle_reduce_high;
uint64_t rack_no_timer_in_hpts;
uint64_t tcp_rack_min_pace_seg;
uint64_t tcp_rack_min_pace;
};
#define TLP_USE_ID 1 /* Internet draft behavior */
#define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */
#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */
#ifdef _KERNEL
#define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t))
extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
#define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm))
#define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1)
#endif
/*
* As we get each SACK we wade through the
* rc_map and mark off what is acked.
* We also increment rc_sacked as well.
*
* We also pay attention to missing entries
* based on the time and possibly mark them
* for retransmit. If we do and we are not already
* in recovery we enter recovery. In doing
* so we clear prr_delivered/holes_rxt and prr_sent_dur_rec.
* We also setup rc_next/rc_snd_nxt/rc_send_end so
* we will know where to send from. When not in
* recovery rc_next will be NULL and rc_snd_nxt should
* equal snd_max.
*
* Whenever we retransmit from recovery we increment
* rc_holes_rxt as we retran a block and mark it as retransmitted
* with the time it was sent. During non-recovery sending we
* add to our map and note the time down of any send expanding
* the rc_map at the tail and moving rc_snd_nxt up with snd_max.
*
* In recovery during SACK/ACK processing if a chunk has
* been retransmitted and it is now acked, we decrement rc_holes_rxt.
* When we retransmit from the scoreboard we use
* rc_next and rc_snd_nxt/rc_send_end to help us
* find what needs to be retran.
*
* To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt
* This gets us the effect of RFC6675 pipe, counting twice for
* bytes retransmitted.
*/
#define TT_RACK_FR_TMR 0x2000
/*
 * Locking for the rack control block.
 * a) Locked by INP_WLOCK
 * b) Locked by the hpts-mutex
 *
 * The "Lock(a)" / "Lock(b)" tags on the members below refer to this key.
 * The "cache line" markers record the intended layout; do not reorder
 * or resize members without re-checking them.
 */
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct rack_head rc_map;/* List of all segments Lock(a) */
struct rack_head rc_tmap; /* List in transmit order Lock(a) */
struct rack_sendmap *rc_tlpsend; /* Remembered place for
* tlp_sending Lock(a) */
struct rack_sendmap *rc_resend; /* something we have been asked to
* resend */
uint32_t rc_hpts_flags;
uint32_t rc_timer_exp; /* If a timer ticks of expiry */
uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */
uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */
/* Third Cache line 0x80 */
struct rack_head rc_free; /* Allocation array */
uint32_t rc_time_last_sent; /* Time we last sent some data and
* logged it Lock(a). */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP
* Lock(a) */
uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */
uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */
uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */
uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */
uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */
uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */
uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done
* since peer spoke to us Lock(a) */
uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent
* rc_last_tlp_seq Lock(a) */
uint32_t rc_loss_count; /* During recovery how many segments were lost
* Lock(a) */
uint32_t rc_reorder_fade; /* Socket option value Lock(a) */
/* Fourth cache line 0xc0 */
/* Times */
uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */
uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */
/* Variables to track bad retransmits and recover */
uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */
uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */
uint32_t rc_ssthresh_at;/* ssthresh at the retransmit Lock(a) */
uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we
* have allocated */
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_notused; /* Not used, per its name */
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
struct rack_sendmap *rc_sacklast; /* sack remembered place
* Lock(a) */
struct rack_sendmap *rc_next; /* remembered place where we next
* retransmit at Lock(a) */
struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for
* cache line alignment
* Lock(a) */
/* Cache line split 0x100 */
struct sack_filter rack_sf;
/* Cache line split 0x140 */
/* Flags for various things */
struct rack_rtt_sample rack_rs;
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
uint8_t rc_prop_rate; /* Socket option value Lock(a) */
uint8_t rc_prop_reduce; /* Socket option value Lock(a) */
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_early_recovery; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_min_to; /* Socket option value Lock(a) */
uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */
uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */
uint8_t rc_rate_sample_method;
};
#ifdef _KERNEL
/*
 * Top-level RACK state. The structure is aligned to CACHE_LINE_SIZE and
 * the members are grouped so that the first cache line holds the fields
 * listed under "First cache line" with struct rack_control following at
 * 0x40; keep that in mind before adding or resizing members.
 * "Lock(a)" / "Lock(b)" refer to the locking key documented above
 * struct rack_control.
 */
struct tcp_rack {
/* First cache line 0x00 */
TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */
/* Handler for the current state; presumably selected via r_state */
int32_t(*r_substate) (struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, struct tcpopt *,
int32_t, int32_t, int32_t *, uint32_t, int, int); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint32_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_wanted_output; /* Output routine wanted to be called */
uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */
uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */
uint16_t rc_pace_reduce;/* Socket option value Lock(a) */
uint8_t r_state; /* Current rack state Lock(a) */
uint8_t rc_tmr_stopped : 7,
t_timers_stopped : 1;
uint8_t rc_enobuf; /* count of enobufs on connection provides
* backoff Lock(a) */
uint8_t r_timer_override : 1, /* hpts override Lock(a) */
r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */
r_is_v6 : 1, /* V6 pcb Lock(a) */
rc_in_persist : 1,
rc_last_pto_set : 1, /* XXX not used */
rc_tlp_in_progress : 1,
rc_always_pace : 1, /* Socket option value Lock(a) */
rc_timer_up : 1; /* The rack timer is up flag Lock(a) */
uint8_t r_idle_reduce_largest : 1,
r_enforce_min_pace : 2,
r_min_pace_seg_thresh : 5;
uint8_t rack_tlp_threshold_use; /* presumably one of TLP_USE_* -- confirm */
uint8_t rc_allow_data_af_clo: 1,
delayed_ack : 1,
rc_avail : 6;
uint8_t r_resv[2]; /* Fill to cache line boundary */
/* Cache line 2 0x40 */
struct rack_control r_ctl;
} __aligned(CACHE_LINE_SIZE);
#endif
#endif

View File

@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
@ -139,7 +140,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
/* max idle probes */
int tcp_maxpersistidle;
static int tcp_rexmit_drop_options = 0;
int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
&tcp_rexmit_drop_options, 0,
"Drop TCP options from 3rd and later retransmitted SYN");
@ -174,18 +175,13 @@ static int per_cpu_timers = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
&per_cpu_timers , 0, "run tcp timers on all cpus");
#if 0
#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif
/*
* Map the given inp to a CPU id.
*
* This queries RSS if it's compiled in, else it defaults to the current
* CPU ID.
*/
static inline int
inline int
inp_to_cpuid(struct inpcb *inp)
{
u_int cpuid;
@ -243,7 +239,7 @@ int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
/*
* TCP timer processing.
@ -948,6 +944,111 @@ tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
return callout_active(t_callout);
}
/*
 * Suspend a single TCP timer.
 *
 * The matching TT_*_SUS bit is set in the timer flags and the callout is
 * then stopped. The flag is what keeps the timer from ever running: it
 * guards against a race that would otherwise leave the timer running and
 * let it restart itself (the keep and persist timers especially do this).
 *
 * timer_type must be exactly one of TT_DELACK, TT_REXMT, TT_PERSIST,
 * TT_KEEP or TT_2MSL; any other value panics.
 *
 * Returns whatever callout_stop() returns for the selected callout.
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *co;
	uint32_t sus_bit;

	if (timer_type == TT_DELACK) {
		sus_bit = TT_DELACK_SUS;
		co = &tp->t_timers->tt_delack;
	} else if (timer_type == TT_REXMT) {
		sus_bit = TT_REXMT_SUS;
		co = &tp->t_timers->tt_rexmt;
	} else if (timer_type == TT_PERSIST) {
		sus_bit = TT_PERSIST_SUS;
		co = &tp->t_timers->tt_persist;
	} else if (timer_type == TT_KEEP) {
		sus_bit = TT_KEEP_SUS;
		co = &tp->t_timers->tt_keep;
	} else if (timer_type == TT_2MSL) {
		sus_bit = TT_2MSL_SUS;
		co = &tp->t_timers->tt_2msl;
	} else {
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}

	/* Mark the timer suspended first, then stop the callout. */
	tp->t_timers->tt_flags |= sus_bit;
	return (callout_stop(co));
}
/*
 * Undo tcp_timer_suspend() for one timer.
 *
 * If the timer's TT_*_SUS flag is set it is cleared and, when the
 * connection state calls for it, the timer is started again:
 *
 *   TT_DELACK  - restarted when TF_DELACK is set (the flag is cleared).
 *   TT_REXMT   - restarted when data is outstanding, the persist timer
 *                is not active and the send window is non-zero.
 *   TT_PERSIST - restarted (with t_rxtshift reset) when the send window
 *                is zero.
 *   TT_KEEP    - always restarted, using the idle or init interval
 *                depending on whether the connection is established.
 *   TT_2MSL    - restarted when in FIN_WAIT_2 and the socket is gone or
 *                can no longer receive.
 *
 * Nothing happens if the timer was not suspended. timer_type must be
 * exactly one of the TT_* timer values above; anything else panics.
 */
void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* Delayed ack timer should be up, activate a timer */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data, activate a timer */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persists timer */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		/*
		 * This must be a plain test ('&'). Using '&=' here would
		 * overwrite tt_flags and wipe every other flag bit (other
		 * suspend flags, TT_STOPPED, ...) as a side effect.
		 */
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{

View File

@ -168,11 +168,15 @@ struct tcp_timer {
#define TT_2MSL 0x0010
#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
#define TT_DELACK_RST 0x0100
#define TT_REXMT_RST 0x0200
#define TT_PERSIST_RST 0x0400
#define TT_KEEP_RST 0x0800
#define TT_2MSL_RST 0x1000
/*
* Suspend flags - used when suspending a timer
* from ever running again.
*/
#define TT_DELACK_SUS 0x0100
#define TT_REXMT_SUS 0x0200
#define TT_PERSIST_SUS 0x0400
#define TT_KEEP_SUS 0x0800
#define TT_2MSL_SUS 0x1000
#define TT_STOPPED 0x00010000
@ -196,6 +200,8 @@ extern int tcp_msl;
extern int tcp_ttl; /* time to live for TCP segs */
extern int tcp_backoff[];
extern int tcp_syn_backoff[];
extern int tcp_totbackoff;
extern int tcp_rexmit_drop_options;
extern int tcp_always_keepalive;
extern int tcp_finwait2_timeout;

View File

@ -93,8 +93,11 @@ struct tcpcb {
void *t_fb_ptr; /* Pointer to t_fb specific data */
uint32_t t_maxseg:24, /* maximum segment size */
t_logstate:8; /* State of "black box" logging */
uint32_t t_state:4, /* state of this connection */
bits_spare : 24;
uint32_t t_port:16, /* Tunneling (over udp) port */
t_state:4, /* state of this connection */
t_idle_reduce : 1,
t_delayed_ack: 7, /* Delayed ack variable */
bits_spare : 4;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@ -104,7 +107,7 @@ struct tcpcb {
tcp_seq snd_up; /* send urgent pointer */
uint32_t snd_wnd; /* send window */
uint32_t snd_cwnd; /* congestion-controlled window */
uint32_t cl1_spare; /* Spare to round out CL 1 */
uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */
/* Cache line 2 */
u_int32_t ts_offset; /* our timestamp offset */
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
@ -189,6 +192,7 @@ struct tcpcb {
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
int t_bytes_acked; /* # bytes acked during current RTT */
u_int t_maxunacktime;
u_int t_keepinit; /* time to establish connection */
u_int t_keepidle; /* time before keepalive probes begin */
u_int t_keepintvl; /* interval between keepalives */
@ -361,6 +365,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
#define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */
#define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */
/*
* Structure to hold TCP options that are only used during segment
@ -649,6 +654,11 @@ struct tcp_hhook_data {
int tso;
tcp_seq curack;
};
#ifdef TCP_HHOOK
void hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
#endif
/*
@ -801,6 +811,9 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes)
#define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail)
#define V_tcp_sendspace VNET(tcp_sendspace)
#define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead)
#define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port)
#ifdef TCP_HHOOK
VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
@ -893,9 +906,12 @@ struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
int tcp_timer_suspend(struct tcpcb *, uint32_t);
void tcp_timers_unsuspend(struct tcpcb *, uint32_t);
int tcp_timer_active(struct tcpcb *, uint32_t);
void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
int inp_to_cpuid(struct inpcb *inp);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
*/
@ -921,6 +937,10 @@ void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
int tcp_compute_pipe(struct tcpcb *);
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb);
static inline void
tcp_fields_to_host(struct tcphdr *th)

View File

@ -304,7 +304,7 @@ struct mbuf {
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
#define M_UNUSED_8 0x00000100 /* --available-- */
#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically

View File

@ -95,6 +95,7 @@
* _NEXT + + + +
* _PREV - + - +
* _LAST - - + +
* _LAST_FAST - - - +
* _FOREACH + + + +
* _FOREACH_FROM + + + +
* _FOREACH_SAFE + + + +
@ -817,6 +818,16 @@ struct { \
#define TAILQ_LAST(head, headname) \
(*(((struct headname *)((head)->tqh_last))->tqh_last))
/*
 * The FAST function is fast in that it causes no data access other
 * than the access to the head. The standard LAST function above
 * will cause a data access of both the element you want and
 * the previous element. FAST is very useful for instances when
 * you may want to prefetch the last data element.
 *
 * Evaluates to NULL when the queue is empty; otherwise the last element
 * is computed from head->tqh_last with __containerof(), i.e. by pointer
 * arithmetic on the address of that element's field.tqe_next member.
 */
#define TAILQ_LAST_FAST(head, type, field) \
(TAILQ_EMPTY(head) ? NULL : __containerof((head)->tqh_last, QUEUE_TYPEOF(type), field.tqe_next))
#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
#define TAILQ_PREV(elm, headname, field) \

View File

@ -165,6 +165,10 @@ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
struct thread *td);
struct mbuf *
sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff);
struct mbuf *
sbsndptr_noadv(struct sockbuf *sb, u_int off, u_int *moff);
void
sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, u_int len);
struct mbuf *
sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff);
int sbwait(struct sockbuf *sb);

View File

@ -289,6 +289,22 @@ tvtosbt(struct timeval _tv)
#endif /* __BSD_VISIBLE */
#ifdef _KERNEL
/*
 * Simple macros to convert ticks to milliseconds
 * or microseconds and vice-versa. The answer
 * will always be at least 1. Note the return
 * value is a uint32_t however we step up the
 * operations to 64 bit to avoid any overflow/underflow
 * problems.
 *
 * When hz is exactly 1000 a tick is a millisecond, so the division is
 * skipped. The (uint32_t) cast deliberately wraps the whole conditional
 * expression (not just the hz test) so that the value handed to max()
 * is the truncated 32-bit result in all four macros.
 *
 * The argument may be evaluated more than once; do not pass expressions
 * with side effects.
 */
#define	TICKS_2_MSEC(t)	max(1, (uint32_t)((hz == 1000) ? \
	(t) : (((uint64_t)(t) * (uint64_t)1000) / (uint64_t)hz)))
#define	TICKS_2_USEC(t)	max(1, (uint32_t)((hz == 1000) ? \
	((uint64_t)(t) * (uint64_t)1000) : \
	(((uint64_t)(t) * (uint64_t)1000000) / (uint64_t)hz)))
#define	MSEC_2_TICKS(m)	max(1, (uint32_t)((hz == 1000) ? \
	(m) : (((uint64_t)(m) * (uint64_t)hz) / (uint64_t)1000)))
#define	USEC_2_TICKS(u)	max(1, (uint32_t)((hz == 1000) ? \
	((u) / 1000) : (((uint64_t)(u) * (uint64_t)hz) / (uint64_t)1000000)))
/* Operations on timespecs */
#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0)