Add support for TCP Selective Acknowledgements. The work for this
originated on RELENG_4 and was ported to -CURRENT.

The scoreboarding code was obtained from OpenBSD, and many
of the remaining changes were inspired by OpenBSD, but not
taken directly from there.

You can enable/disable sack using net.inet.tcp.do_sack. You can
also limit the number of sack holes that all senders can have in
the scoreboard with net.inet.tcp.sackhole_limit.

Reviewed by:	gnn
Obtained from:	Yahoo! (Mohan Srinivasan, Jayanth Vijayaraghavan)
This commit is contained in:
Paul Saab 2004-06-23 21:04:37 +00:00
parent 89ec2c3c42
commit 6d90faf3d8
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=130989
13 changed files with 975 additions and 37 deletions

View File

@ -1465,6 +1465,7 @@ netinet/tcp_debug.c optional tcpdebug
netinet/tcp_hostcache.c optional inet
netinet/tcp_input.c optional inet
netinet/tcp_output.c optional inet
netinet/tcp_sack.c optional inet
netinet/tcp_subr.c optional inet
netinet/tcp_syncache.c optional inet
netinet/tcp_timer.c optional inet

View File

@ -354,6 +354,7 @@ RANDOM_IP_ID
SLIP_IFF_OPTS opt_slip.h
TCPDEBUG
TCP_SIGNATURE opt_inet.h
TCP_SACK_DEBUG opt_tcp_sack.h
TCP_DROP_SYNFIN opt_tcp_input.h
XBONEHACK

View File

@ -85,12 +85,15 @@ struct tcphdr {
#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
#define TCPOLEN_SACK_PERMITTED 2
#define TCPOPT_SACK 5 /* Experimental */
#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */
#define TCPOPT_TIMESTAMP 8
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
#define TCPOPT_TSTAMP_HDR \
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
#define TCPOPT_CC 11 /* CC options: RFC-1644 */
#define TCPOPT_CCNEW 12
#define TCPOPT_CCECHO 13
@ -101,6 +104,15 @@ struct tcphdr {
#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */
#define TCPOLEN_SIGNATURE 18
/* Option definitions */
/*
 * SACK-permitted option pre-packed into one 32-bit word, most significant
 * byte first: NOP, NOP, option kind, option length.
 */
#define TCPOPT_SACK_PERMIT_HDR \
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
/*
 * Leading 32-bit word of a SACK option: NOP, NOP, option kind.  The low
 * byte (the option length) is left zero here; tcp_output() ORs in
 * TCPOLEN_SACK*count+2 once it knows how many blocks it is sending.
 */
#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
/* Miscellaneous constants */
#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */
#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment; caps the count in tcp_output() */
/*
* Default maximum segment size for TCP.
* With an IP MTU of 576, this is 536,

View File

@ -37,6 +37,7 @@
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_input.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@ -159,7 +160,9 @@ struct inpcbhead tcb;
struct inpcbinfo tcbinfo;
struct mtx *tcbinfo_mtx;
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
int, int, struct tcphdr *);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@ -724,7 +727,7 @@ tcp_input(m, off0)
* present in a SYN segment. See tcp_timewait().
*/
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, 1);
tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
&to, th, m, tlen))
goto findpcb;
@ -938,7 +941,7 @@ tcp_input(m, off0)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
tcp_dooptions(&to, optp, optlen, 1);
tcp_dooptions(tp, &to, optp, optlen, 1, th);
if (!syncache_add(&inc, &to, th, &so, m))
goto drop;
if (so == NULL) {
@ -1054,7 +1057,7 @@ tcp_input(m, off0)
* for incoming connections is handled in tcp_syncache.
* XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
if (thflags & TH_SYN) {
if (to.to_flags & TOF_SCALE) {
tp->t_flags |= TF_RCVD_SCALE;
@ -1069,6 +1072,20 @@ tcp_input(m, off0)
tp->t_flags |= TF_RCVD_CC;
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
if (tp->sack_enable) {
if (!(to.to_flags & TOF_SACK))
tp->sack_enable = 0;
else
tp->t_flags |= TF_SACK_PERMIT;
}
}
if (tp->sack_enable) {
/* Delete stale (cumulatively acked) SACK holes */
tcp_del_sackholes(tp, th);
tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
tp->rcv_lastend = th->th_seq + tlen;
}
/*
@ -1120,9 +1137,10 @@ tcp_input(m, off0)
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
((!tcp_do_newreno &&
((!tcp_do_newreno && !tp->sack_enable &&
tp->t_dupacks < tcprexmtthresh) ||
(tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
((tcp_do_newreno || tp->sack_enable) &&
!IN_FASTRECOVERY(tp)))) {
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
/*
@ -1218,6 +1236,9 @@ tcp_input(m, off0)
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
/* Clean receiver SACK report if present */
if (tp->sack_enable && tp->rcv_numsacks)
tcp_clean_sackreport(tp);
++tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
/*
@ -1898,7 +1919,7 @@ tcp_input(m, off0)
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
(tcp_do_newreno &&
((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp))) {
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
@ -1906,7 +1927,8 @@ tcp_input(m, off0)
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_int win;
if (tcp_do_newreno &&
if ((tcp_do_newreno ||
tp->sack_enable) &&
SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
@ -1921,6 +1943,17 @@ tcp_input(m, off0)
tp->snd_recover = tp->snd_max;
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
if (tp->sack_enable) {
tcpstat.tcps_sack_recovery_episode++;
tp->snd_cwnd =
tp->t_maxseg *
tp->t_dupacks;
(void) tcp_output(tp);
tp->snd_cwnd =
tp->snd_ssthresh;
goto drop;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
@ -1971,12 +2004,16 @@ tcp_input(m, off0)
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
if (tcp_do_newreno) {
if (tcp_do_newreno || tp->sack_enable) {
if (IN_FASTRECOVERY(tp)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
tcp_newreno_partial_ack(tp, th);
if (tp->sack_enable)
tcp_sack_partialack(tp, th);
else
tcp_newreno_partial_ack(tp, th);
} else {
/*
* Out of fast recovery.
* Window inflation should have left us
* with approximately snd_ssthresh
* outstanding data.
@ -2098,7 +2135,8 @@ tcp_input(m, off0)
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
*/
if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
if ((!tcp_do_newreno && !tp->sack_enable) ||
!IN_FASTRECOVERY(tp)) {
register u_int cw = tp->snd_cwnd;
register u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
@ -2116,14 +2154,20 @@ tcp_input(m, off0)
}
sowwakeup(so);
/* detect una wraparound */
if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
if ((tcp_do_newreno || tp->sack_enable) &&
!IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
if ((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp) &&
SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
if (tp->sack_enable) {
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
}
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@ -2327,7 +2371,8 @@ tcp_input(m, off0)
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
if (tp->sack_enable)
tcp_update_sack_list(tp);
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@ -2530,11 +2575,13 @@ tcp_input(m, off0)
* Parse TCP options and place in tcpopt.
*/
static void
tcp_dooptions(to, cp, cnt, is_syn)
tcp_dooptions(tp, to, cp, cnt, is_syn, th)
struct tcpcb *tp;
struct tcpopt *to;
u_char *cp;
u_char *cp;
int cnt;
int is_syn;
struct tcphdr *th;
{
int opt, optlen;
@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn)
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
break;
#endif
case TCPOPT_SACK_PERMITTED:
if (!tcp_do_sack ||
optlen != TCPOLEN_SACK_PERMITTED)
continue;
if (is_syn) {
/* MUST only be set on SYN */
to->to_flags |= TOF_SACK;
}
break;
case TCPOPT_SACK:
if (!tp || tcp_sack_option(tp, th, cp, optlen))
continue;
break;
default:
continue;
}

View File

@ -35,6 +35,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@ -122,6 +123,8 @@ tcp_output(struct tcpcb *tp)
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
int idle, sendalot;
int i, sack_rxmit;
struct sackhole *p;
#if 0
int maxburst = TCP_MAXBURST;
#endif
@ -171,12 +174,49 @@ tcp_output(struct tcpcb *tp)
}
}
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
sendwin = min(sendwin, tp->snd_bwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in sack recovery , reset rxmit flag to zero.
*/
sack_rxmit = 0;
len = 0;
p = NULL;
if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
(p = tcp_sack_output(tp))) {
sack_rxmit = 1;
sendalot = 1;
off = p->rxmit - tp->snd_una;
KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd));
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover))
len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit);
else
len = min(tp->snd_cwnd, p->end - p->rxmit);
if (len > 0) {
tcpstat.tcps_sack_rexmits++;
tcpstat.tcps_sack_rexmit_bytes +=
min(len, tp->t_maxseg);
}
}
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
@ -230,9 +270,12 @@ tcp_output(struct tcpcb *tp)
* In the normal retransmit-FIN-only case, however, snd_nxt will
* be set to snd_una, the offset will be 0, and the length may
* wind up 0.
*
* If sack_rxmit is true we are retransmitting from the scoreboard
* in which case len is already set.
*/
len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off;
if (!sack_rxmit)
len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
/*
* Lop off SYN bit if it has already been sent. However, if this
@ -331,6 +374,8 @@ tcp_output(struct tcpcb *tp)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
goto send;
if (sack_rxmit)
goto send;
}
/*
@ -374,7 +419,18 @@ tcp_output(struct tcpcb *tp)
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
!callout_active(tp->tt_rexmt) &&
!callout_active(tp->tt_persist)) {
callout_reset(tp->tt_rexmt, tp->t_rxtcur,
tcp_timer_rexmt, tp);
return (0);
}
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to insure receipt of window
@ -435,6 +491,19 @@ tcp_output(struct tcpcb *tp)
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
/*
* If this is the first SYN of connection (not a SYN
* ACK), include SACK_PERMIT_HDR option. If this is a
* SYN ACK, include SACK_PERMIT_HDR option if peer has
* already done so. This is only for active connect,
* since the syncache takes care of the passive connect.
*/
if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_SACK_PERMIT))) {
*((u_int32_t *) (opt + optlen)) =
htonl(TCPOPT_SACK_PERMIT_HDR);
optlen += 4;
}
if ((tp->t_flags & TF_REQ_SCALE) &&
((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE))) {
@ -466,6 +535,32 @@ tcp_output(struct tcpcb *tp)
optlen += TCPOLEN_TSTAMP_APPA;
}
/*
* Send SACKs if necessary. This should be the last option processed.
* Only as many SACKs are sent as are permitted by the maximum options
* size. No more than three SACKs are sent.
*/
if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
(tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
tp->rcv_numsacks) {
u_int32_t *lp = (u_int32_t *)(opt + optlen);
u_int32_t *olp = lp++;
int count = 0; /* actual number of SACKs inserted */
int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
tcpstat.tcps_sack_send_blocks++;
maxsack = min(maxsack, TCP_MAX_SACK);
for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
struct sackblk sack = tp->sackblks[i];
if (sack.start == 0 && sack.end == 0)
continue;
*lp++ = htonl(sack.start);
*lp++ = htonl(sack.end);
count++;
}
*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
}
/*
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
* options are allowed (!TF_NOOPT) and it's not a RST.
@ -734,6 +829,10 @@ tcp_output(struct tcpcb *tp)
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
if (sack_rxmit) {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
@ -831,6 +930,8 @@ tcp_output(struct tcpcb *tp)
tp->t_flags |= TF_SENTFIN;
}
}
if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt))
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
@ -853,6 +954,17 @@ tcp_output(struct tcpcb *tp)
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (tp->sack_enable && sack_rxmit &&
!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_max) {
callout_reset(tp->tt_rexmt, tp->t_rxtcur,
tcp_timer_rexmt, tp);
if (callout_active(tp->tt_persist)) {
callout_stop(tp->tt_persist);
tp->t_rxtshift = 0;
}
}
if (!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_una) {
if (callout_active(tp->tt_persist)) {

View File

@ -37,6 +37,7 @@
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_input.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@ -159,7 +160,9 @@ struct inpcbhead tcb;
struct inpcbinfo tcbinfo;
struct mtx *tcbinfo_mtx;
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
int, int, struct tcphdr *);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@ -724,7 +727,7 @@ tcp_input(m, off0)
* present in a SYN segment. See tcp_timewait().
*/
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, 1);
tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
&to, th, m, tlen))
goto findpcb;
@ -938,7 +941,7 @@ tcp_input(m, off0)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
tcp_dooptions(&to, optp, optlen, 1);
tcp_dooptions(tp, &to, optp, optlen, 1, th);
if (!syncache_add(&inc, &to, th, &so, m))
goto drop;
if (so == NULL) {
@ -1054,7 +1057,7 @@ tcp_input(m, off0)
* for incoming connections is handled in tcp_syncache.
* XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
if (thflags & TH_SYN) {
if (to.to_flags & TOF_SCALE) {
tp->t_flags |= TF_RCVD_SCALE;
@ -1069,6 +1072,20 @@ tcp_input(m, off0)
tp->t_flags |= TF_RCVD_CC;
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
if (tp->sack_enable) {
if (!(to.to_flags & TOF_SACK))
tp->sack_enable = 0;
else
tp->t_flags |= TF_SACK_PERMIT;
}
}
if (tp->sack_enable) {
/* Delete stale (cumulatively acked) SACK holes */
tcp_del_sackholes(tp, th);
tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
tp->rcv_lastend = th->th_seq + tlen;
}
/*
@ -1120,9 +1137,10 @@ tcp_input(m, off0)
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
((!tcp_do_newreno &&
((!tcp_do_newreno && !tp->sack_enable &&
tp->t_dupacks < tcprexmtthresh) ||
(tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
((tcp_do_newreno || tp->sack_enable) &&
!IN_FASTRECOVERY(tp)))) {
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
/*
@ -1218,6 +1236,9 @@ tcp_input(m, off0)
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
/* Clean receiver SACK report if present */
if (tp->sack_enable && tp->rcv_numsacks)
tcp_clean_sackreport(tp);
++tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
/*
@ -1898,7 +1919,7 @@ tcp_input(m, off0)
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
(tcp_do_newreno &&
((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp))) {
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
@ -1906,7 +1927,8 @@ tcp_input(m, off0)
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_int win;
if (tcp_do_newreno &&
if ((tcp_do_newreno ||
tp->sack_enable) &&
SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
@ -1921,6 +1943,17 @@ tcp_input(m, off0)
tp->snd_recover = tp->snd_max;
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
if (tp->sack_enable) {
tcpstat.tcps_sack_recovery_episode++;
tp->snd_cwnd =
tp->t_maxseg *
tp->t_dupacks;
(void) tcp_output(tp);
tp->snd_cwnd =
tp->snd_ssthresh;
goto drop;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
@ -1971,12 +2004,16 @@ tcp_input(m, off0)
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
if (tcp_do_newreno) {
if (tcp_do_newreno || tp->sack_enable) {
if (IN_FASTRECOVERY(tp)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
tcp_newreno_partial_ack(tp, th);
if (tp->sack_enable)
tcp_sack_partialack(tp, th);
else
tcp_newreno_partial_ack(tp, th);
} else {
/*
* Out of fast recovery.
* Window inflation should have left us
* with approximately snd_ssthresh
* outstanding data.
@ -2098,7 +2135,8 @@ tcp_input(m, off0)
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
*/
if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
if ((!tcp_do_newreno && !tp->sack_enable) ||
!IN_FASTRECOVERY(tp)) {
register u_int cw = tp->snd_cwnd;
register u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
@ -2116,14 +2154,20 @@ tcp_input(m, off0)
}
sowwakeup(so);
/* detect una wraparound */
if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
if ((tcp_do_newreno || tp->sack_enable) &&
!IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
if ((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp) &&
SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
if (tp->sack_enable) {
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
}
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@ -2327,7 +2371,8 @@ tcp_input(m, off0)
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
if (tp->sack_enable)
tcp_update_sack_list(tp);
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@ -2530,11 +2575,13 @@ tcp_input(m, off0)
* Parse TCP options and place in tcpopt.
*/
static void
tcp_dooptions(to, cp, cnt, is_syn)
tcp_dooptions(tp, to, cp, cnt, is_syn, th)
struct tcpcb *tp;
struct tcpopt *to;
u_char *cp;
u_char *cp;
int cnt;
int is_syn;
struct tcphdr *th;
{
int opt, optlen;
@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn)
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
break;
#endif
case TCPOPT_SACK_PERMITTED:
if (!tcp_do_sack ||
optlen != TCPOLEN_SACK_PERMITTED)
continue;
if (is_syn) {
/* MUST only be set on SYN */
to->to_flags |= TOF_SACK;
}
break;
case TCPOPT_SACK:
if (!tp || tcp_sack_option(tp, th, cp, optlen))
continue;
break;
default:
continue;
}

592
sys/netinet/tcp_sack.c Normal file
View File

@ -0,0 +1,592 @@
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
* $FreeBSD$
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include "opt_ipfw.h" /* for ipfw_fwd */
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_input.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
#include <netinet/in_var.h>
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tcp_savetcp;
#endif /* TCPDEBUG */
#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif
#ifdef IPSEC
#include <netinet6/ipsec.h>
#include <netinet6/ipsec6.h>
#include <netkey/key.h>
#endif /*IPSEC*/
#include <machine/in_cksum.h>
extern struct uma_zone *sack_hole_zone;
/*
* This function is called upon receipt of new valid data (while not in header
* prediction mode), and it updates the ordered list of sacks.
*/
void
tcp_update_sack_list(tp)
struct tcpcb *tp;
{
/*
* First reported block MUST be the most recent one. Subsequent
* blocks SHOULD be in the order in which they arrived at the
* receiver. These two conditions make the implementation fully
* compliant with RFC 2018.
*/
int i, j = 0, count = 0, lastpos = -1;
struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
INP_LOCK_ASSERT(tp->t_inpcb);
/* First clean up current list of sacks */
for (i = 0; i < tp->rcv_numsacks; i++) {
sack = tp->sackblks[i];
if (sack.start == 0 && sack.end == 0) {
count++; /* count = number of blocks to be discarded */
continue;
}
if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
tp->sackblks[i].start = tp->sackblks[i].end = 0;
count++;
} else {
temp[j].start = tp->sackblks[i].start;
temp[j++].end = tp->sackblks[i].end;
}
}
tp->rcv_numsacks -= count;
if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
tcp_clean_sackreport(tp);
if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
/* ==> need first sack block */
tp->sackblks[0].start = tp->rcv_laststart;
tp->sackblks[0].end = tp->rcv_lastend;
tp->rcv_numsacks = 1;
}
return;
}
/* Otherwise, sack blocks are already present. */
for (i = 0; i < tp->rcv_numsacks; i++)
tp->sackblks[i] = temp[i]; /* first copy back sack list */
if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
return; /* sack list remains unchanged */
/*
* From here, segment just received should be (part of) the 1st sack.
* Go through list, possibly coalescing sack block entries.
*/
firstsack.start = tp->rcv_laststart;
firstsack.end = tp->rcv_lastend;
for (i = 0; i < tp->rcv_numsacks; i++) {
sack = tp->sackblks[i];
if (SEQ_LT(sack.end, firstsack.start) ||
SEQ_GT(sack.start, firstsack.end))
continue; /* no overlap */
if (sack.start == firstsack.start && sack.end == firstsack.end){
/*
* identical block; delete it here since we will
* move it to the front of the list.
*/
tp->sackblks[i].start = tp->sackblks[i].end = 0;
lastpos = i; /* last posn with a zero entry */
continue;
}
if (SEQ_LEQ(sack.start, firstsack.start))
firstsack.start = sack.start; /* merge blocks */
if (SEQ_GEQ(sack.end, firstsack.end))
firstsack.end = sack.end; /* merge blocks */
tp->sackblks[i].start = tp->sackblks[i].end = 0;
lastpos = i; /* last posn with a zero entry */
}
if (lastpos != -1) { /* at least one merge */
for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
sack = tp->sackblks[i];
if (sack.start == 0 && sack.end == 0)
continue;
temp[j++] = sack;
}
tp->rcv_numsacks = j; /* including first blk (added later) */
for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
tp->sackblks[i] = temp[i];
} else { /* no merges -- shift sacks by 1 */
if (tp->rcv_numsacks < MAX_SACK_BLKS)
tp->rcv_numsacks++;
for (i = tp->rcv_numsacks-1; i > 0; i--)
tp->sackblks[i] = tp->sackblks[i-1];
}
tp->sackblks[0] = firstsack;
return;
}
/*
 * Discard the receiver-side SACK report: every slot of tp->sackblks is
 * set to the empty (0, 0) block and the block count is reset to zero.
 */
void
tcp_clean_sackreport(tp)
	struct tcpcb *tp;
{
	int slot;

	INP_LOCK_ASSERT(tp->t_inpcb);

	for (slot = 0; slot < MAX_SACK_BLKS; slot++) {
		tp->sackblks[slot].end = 0;
		tp->sackblks[slot].start = 0;
	}
	tp->rcv_numsacks = 0;
}
/*
* Process the TCP SACK option. Returns 1 if the option was rejected, in which
* case tcp_dooptions() should skip it, and 0 if the option was fine.
* tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
* the sequence space).
*/
int
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
int tmp_olen;
u_char *tmp_cp;
struct sackhole *cur, *p, *temp;
INP_LOCK_ASSERT(tp->t_inpcb);
if (!tp->sack_enable)
return (1);
/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
return (1);
tmp_cp = cp + 2;
tmp_olen = optlen - 2;
tcpstat.tcps_sack_rcv_blocks++;
if (tp->snd_numholes < 0)
tp->snd_numholes = 0;
if (tp->t_maxseg == 0)
panic("tcp_sack_option"); /* Should never happen */
while (tmp_olen > 0) {
struct sackblk sack;
bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
sack.start = ntohl(sack.start);
bcopy(tmp_cp + sizeof(tcp_seq),
(char *) &(sack.end), sizeof(tcp_seq));
sack.end = ntohl(sack.end);
tmp_olen -= TCPOLEN_SACK;
tmp_cp += TCPOLEN_SACK;
if (SEQ_LEQ(sack.end, sack.start))
continue; /* bad SACK fields */
if (SEQ_LEQ(sack.end, tp->snd_una))
continue; /* old block */
if (SEQ_GT(th->th_ack, tp->snd_una)) {
if (SEQ_LT(sack.start, th->th_ack))
continue;
}
if (SEQ_GT(sack.end, tp->snd_max))
continue;
if (tp->snd_holes == NULL) { /* first hole */
tp->snd_holes = (struct sackhole *)
uma_zalloc(sack_hole_zone,M_NOWAIT);
if (tp->snd_holes == NULL) {
/* ENOBUFS, so ignore SACKed block for now*/
continue;
}
cur = tp->snd_holes;
cur->start = th->th_ack;
cur->end = sack.start;
cur->rxmit = cur->start;
cur->next = NULL;
tp->snd_numholes = 1;
tp->rcv_lastsack = sack.end;
continue; /* with next sack block */
}
/* Go thru list of holes: p = previous, cur = current */
p = cur = tp->snd_holes;
while (cur) {
if (SEQ_LEQ(sack.end, cur->start))
/* SACKs data before the current hole */
break; /* no use going through more holes */
if (SEQ_GEQ(sack.start, cur->end)) {
/* SACKs data beyond the current hole */
p = cur;
cur = cur->next;
continue;
}
if (SEQ_LEQ(sack.start, cur->start)) {
/* Data acks at least the beginning of hole */
if (SEQ_GEQ(sack.end, cur->end)) {
/* Acks entire hole, so delete hole */
if (p != cur) {
p->next = cur->next;
uma_zfree(sack_hole_zone, cur);
cur = p->next;
} else {
cur = cur->next;
uma_zfree(sack_hole_zone, p);
p = cur;
tp->snd_holes = p;
}
tp->snd_numholes--;
continue;
}
/* otherwise, move start of hole forward */
cur->start = sack.end;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
p = cur;
cur = cur->next;
continue;
}
/* move end of hole backward */
if (SEQ_GEQ(sack.end, cur->end)) {
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
p = cur;
cur = cur->next;
continue;
}
if (SEQ_LT(cur->start, sack.start) &&
SEQ_GT(cur->end, sack.end)) {
/*
* ACKs some data in middle of a hole; need to
* split current hole
*/
temp = (struct sackhole *)
uma_zalloc(sack_hole_zone,M_NOWAIT);
if (temp == NULL)
continue; /* ENOBUFS */
temp->next = cur->next;
temp->start = sack.end;
temp->end = cur->end;
temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur->next = temp;
p = temp;
cur = p->next;
tp->snd_numholes++;
}
}
/* At this point, p points to the last hole on the list */
if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
/*
* Need to append new hole at end.
* Last hole is p (and it's not NULL).
*/
temp = (struct sackhole *)
uma_zalloc(sack_hole_zone,M_NOWAIT);
if (temp == NULL)
continue; /* ENOBUFS */
temp->start = tp->rcv_lastsack;
temp->end = sack.start;
temp->rxmit = temp->start;
temp->next = 0;
p->next = temp;
tp->rcv_lastsack = sack.end;
tp->snd_numholes++;
}
}
return (0);
}
/*
* Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if
* it is completely acked; otherwise, tcp_sack_option(), called from
* tcp_dooptions(), will fix up the hole.
*/
void
tcp_del_sackholes(tp, th)
struct tcpcb *tp;
struct tcphdr *th;
{
INP_LOCK_ASSERT(tp->t_inpcb);
if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
/* max because this could be an older ack just arrived */
tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
th->th_ack : tp->snd_una;
struct sackhole *cur = tp->snd_holes;
struct sackhole *prev;
while (cur)
if (SEQ_LEQ(cur->end, lastack)) {
prev = cur;
cur = cur->next;
uma_zfree(sack_hole_zone, prev);
tp->snd_numholes--;
} else if (SEQ_LT(cur->start, lastack)) {
cur->start = lastack;
if (SEQ_LT(cur->rxmit, cur->start))
cur->rxmit = cur->start;
break;
} else
break;
tp->snd_holes = cur;
}
}
/*
 * Release every hole on the sender-side SACK scoreboard and leave the
 * scoreboard empty (no list, zero hole count).
 */
void
tcp_free_sackholes(struct tcpcb *tp)
{
	struct sackhole *hole, *next;

	INP_LOCK_ASSERT(tp->t_inpcb);
	for (hole = tp->snd_holes; hole != NULL; hole = next) {
		next = hole->next;	/* read the link before freeing */
		uma_zfree(sack_hole_zone, hole);
	}
	tp->snd_holes = NULL;
	/* Keep the counter in step with the now-empty list. */
	tp->snd_numholes = 0;
}
/*
 * Handle a partial ack: turn off the retransmission timer, call
 * tcp_output() with the congestion window temporarily set to one segment
 * beyond the acknowledged data, then restore the window and deflate it.
 * tp->t_dupacks is not touched here.  Nothing is returned.
 *
 * Must be called before tp->snd_una has been advanced to th->th_ack.
 */
void
tcp_sack_partialack(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	u_long ocwnd, acked;

	INP_LOCK_ASSERT(tp->t_inpcb);
	ocwnd = tp->snd_cwnd;
	callout_stop(tp->tt_rexmt);
	tp->t_rtttime = 0;
	/*
	 * Set snd_cwnd to one segment beyond acknowledged offset
	 * (tp->snd_una has not yet been updated when this function is called.)
	 */
	/*
	 * Should really be
	 * min(tp->snd_cwnd, tp->t_maxseg + (th->th_ack - tp->snd_una))
	 */
	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	tp->snd_cwnd = ocwnd;
	/*
	 * Partial window deflation.  Relies on fact that tp->snd_una
	 * not updated yet.
	 *
	 * Remove the newly acked bytes (never going below zero) and then
	 * add one segment back.  Doing this as the single unsigned
	 * expression (th_ack - snd_una - t_maxseg) wraps whenever fewer
	 * than t_maxseg bytes were acked, which corrupts snd_cwnd where
	 * u_long is wider than tcp_seq.
	 */
	acked = th->th_ack - tp->snd_una;
	if (tp->snd_cwnd > acked)
		tp->snd_cwnd -= acked;
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += tp->t_maxseg;
}
#ifdef TCP_SACK_DEBUG
/*
 * Debug aid: print start, end and rxmit (in hex) for every hole on
 * tp->snd_holes.  Prints nothing at all when the list is empty.
 */
void
tcp_print_holes(struct tcpcb *tp)
{
	struct sackhole *hole;

	if (tp->snd_holes == 0)
		return;

	printf("Hole report: start--end dups rxmit\n");
	for (hole = tp->snd_holes; hole != 0; hole = hole->next)
		printf("%x--%x r %x\n", hole->start, hole->end, hole->rxmit);
	printf("\n");
}
#endif /* TCP_SACK_DEBUG */
/*
 * Find the first hole that still has data to retransmit.
 *
 * A hole qualifies when its rxmit pointer is below its end and not below
 * tp->snd_una.  Returns that hole, or NULL when SACK is disabled for the
 * connection or no hole qualifies.
 */
struct sackhole *
tcp_sack_output(struct tcpcb *tp)
{
	struct sackhole *hole;

	INP_LOCK_ASSERT(tp->t_inpcb);
	if (!tp->sack_enable)
		return (NULL);

	for (hole = tp->snd_holes; hole != NULL; hole = hole->next) {
		if (SEQ_GEQ(hole->rxmit, hole->end))
			continue;	/* nothing left to resend here */
		if (SEQ_LT(hole->rxmit, tp->snd_una))
			continue;	/* old SACK hole */
#ifdef TCP_SACK_DEBUG
		tcp_print_holes(tp);
#endif
		return (hole);
	}
	return (NULL);
}
/*
 * After a timeout, the SACK list may be rebuilt.  Use it to avoid
 * retransmitting SACKed data: if snd_nxt sits in already-SACKed space,
 * move it forward to the start of the next hole, or to rcv_lastsack when
 * it is past the last hole.  snd_nxt is left alone when there are no
 * holes, when it is already at or beyond rcv_lastsack, or when it falls
 * before the end of a hole.
 */
void
tcp_sack_adjust(struct tcpcb *tp)
{
	struct sackhole *hole, *succ;

	INP_LOCK_ASSERT(tp->t_inpcb);
	hole = tp->snd_holes;
	if (hole == NULL)
		return;		/* No holes */
	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
		return;		/* We're already beyond any SACKed blocks */

	/*
	 * Two cases for which we want to advance snd_nxt:
	 * i) snd_nxt lies between end of one hole and beginning of another
	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
	 */
	for (;;) {
		if (SEQ_LT(tp->snd_nxt, hole->end))
			return;		/* still unsent/unSACKed data here */
		succ = hole->next;
		if (succ == NULL) {
			/* case ii: past the last hole */
			tp->snd_nxt = tp->rcv_lastsack;
			return;
		}
		if (SEQ_LT(tp->snd_nxt, succ->start)) {
			/* case i: in the SACKed gap before the next hole */
			tp->snd_nxt = succ->start;
			return;
		}
		hole = succ;
	}
}

View File

@ -42,6 +42,9 @@
#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
/*
 * Minimum/maximum of two sequence numbers, chosen with the SEQ_LT/SEQ_GT
 * comparisons.  Each argument is expanded more than once, so do not pass
 * expressions with side effects.
 */
#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)

View File

@ -36,6 +36,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
int tcp_do_sack = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
&tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
@ -292,6 +304,8 @@ tcp_init()
tcp_isn_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
void
@ -606,6 +620,7 @@ tcp_newtcpcb(inp)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (tcp_do_rfc1644)
tp->t_flags |= TF_REQ_CC;
tp->sack_enable = tcp_do_sack;
tp->t_inpcb = inp; /* XXX */
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
@ -739,6 +754,7 @@ tcp_discardcb(tp)
tp->t_segqlen--;
tcp_reass_qsize--;
}
tcp_free_sackholes(tp);
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);

View File

@ -39,6 +39,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@ -704,7 +705,10 @@ syncache_socket(sc, lso, m)
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
if (sc->sc_flags & SCF_SACK) {
tp->sack_enable = 1;
tp->t_flags |= TF_SACK_PERMIT;
}
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
@ -991,6 +995,9 @@ syncache_add(inc, to, th, sop, m)
sc->sc_flags = SCF_SIGNATURE;
#endif
if (to->to_flags & TOF_SACK)
sc->sc_flags |= SCF_SACK;
/*
* XXX
* We have the option here of not doing TAO (even if the segment
@ -1107,6 +1114,7 @@ syncache_respond(sc, m)
optlen += (sc->sc_flags & SCF_SIGNATURE) ?
TCPOLEN_SIGNATURE + 2 : 0;
#endif
optlen += ((sc->sc_flags & SCF_SACK) ? 4 : 0);
}
tlen = hlen + sizeof(struct tcphdr) + optlen;
@ -1244,6 +1252,11 @@ syncache_respond(sc, m)
optp += TCPOLEN_SIGNATURE + 2;
}
#endif /* TCP_SIGNATURE */
if (sc->sc_flags & SCF_SACK) {
*(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR);
optp += 4;
}
}
#ifdef INET6

View File

@ -32,6 +32,7 @@
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@ -217,6 +218,7 @@ tcp_timer_2msl(xtp)
return;
}
INP_LOCK(inp);
tcp_free_sackholes(tp);
if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) {
INP_UNLOCK(tp->t_inpcb);
INP_INFO_WUNLOCK(&tcbinfo);
@ -497,6 +499,7 @@ tcp_timer_rexmt(xtp)
return;
}
callout_deactivate(tp->tt_rexmt);
tcp_free_sackholes(tp);
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off

View File

@ -36,6 +36,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
int tcp_do_sack = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
&tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
@ -292,6 +304,8 @@ tcp_init()
tcp_isn_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
void
@ -606,6 +620,7 @@ tcp_newtcpcb(inp)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (tcp_do_rfc1644)
tp->t_flags |= TF_REQ_CC;
tp->sack_enable = tcp_do_sack;
tp->t_inpcb = inp; /* XXX */
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
@ -739,6 +754,7 @@ tcp_discardcb(tp)
tp->t_segqlen--;
tcp_reass_qsize--;
}
tcp_free_sackholes(tp);
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);

View File

@ -52,6 +52,17 @@ LIST_HEAD(tsegqe_head, tseg_qent);
extern int tcp_reass_qsize;
extern struct uma_zone *tcp_reass_zone;
/*
 * One SACK block: a range of sequence space reported as received.
 * tcp_sack_option() fills one of these from each block of a received
 * option, and struct tcpcb keeps an array of them (sackblks).
 */
struct sackblk {
tcp_seq start; /* start seq no. of sack block */
tcp_seq end; /* end seq no.; copied into rcv_lastsack ("last seq number(+1)") */
};
/*
 * One hole in the sender-side SACK scoreboard.  Holes are kept on a
 * singly linked list headed by tp->snd_holes.
 */
struct sackhole {
tcp_seq start; /* start seq no. of hole */
tcp_seq end; /* end seq no.; new holes take this from the start of a SACK block */
tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
struct sackhole *next; /* next in list; NULL on the last hole */
};
struct tcptemp {
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tt_t;
@ -179,6 +190,16 @@ struct tcpcb {
u_long rcv_second; /* start of interval second */
u_long rcv_pps; /* received packets per second */
u_long rcv_byps; /* received bytes per second */
/* SACK related state */
int sack_enable; /* enable SACK for this connection */
int snd_numholes; /* number of holes seen by sender */
struct sackhole *snd_holes; /* linked list of holes (sorted) */
tcp_seq rcv_laststart; /* start of last segment recd. */
tcp_seq rcv_lastend; /* end of ... */
tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
int rcv_numsacks; /* # distinct sack blks present */
struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
@ -216,6 +237,7 @@ struct tcpopt {
#define TOF_SCALE 0x0020
#define TOF_SIGNATURE 0x0040 /* signature option present */
#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */
#define TOF_SACK 0x0100 /* Peer sent SACK option */
u_int32_t to_tsval;
u_int32_t to_tsecr;
tcp_cc to_cc; /* holds CC or CCnew */
@ -249,6 +271,7 @@ struct syncache {
#define SCF_CC 0x08 /* negotiated CC */
#define SCF_UNREACH 0x10 /* icmp unreachable received */
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
#define SCF_SACK 0x80 /* send SACK option */
TAILQ_ENTRY(syncache) sc_hash;
TAILQ_ENTRY(syncache) sc_timerq;
};
@ -434,6 +457,13 @@ struct tcpstat {
u_long tcps_hc_added; /* entry added to hostcache */
u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
/* SACK related stats */
u_long tcps_sack_recovery_episode; /* SACK recovery episodes */
u_long tcps_sack_rexmits; /* SACK rexmit segments */
u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
};
/*
@ -467,7 +497,8 @@ struct xtcpcb {
#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
#define TCPCTL_MAXID 14
#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */
#define TCPCTL_MAXID 15
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@ -505,6 +536,8 @@ extern int path_mtu_discovery;
extern int ss_fltsz;
extern int ss_fltsz_local;
extern int tcp_do_sack; /* SACK enabled/disabled */
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
tcp_close(struct tcpcb *);
@ -578,6 +611,20 @@ extern u_long tcp_sendspace;
extern u_long tcp_recvspace;
tcp_seq tcp_new_isn(struct tcpcb *);
int tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int);
void tcp_update_sack_list(struct tcpcb *tp);
void tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
void tcp_clean_sackreport(struct tcpcb *tp);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp);
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
u_long tcp_seq_subtract(u_long, u_long );
#ifdef TCP_SACK_DEBUG
void tcp_print_holes(struct tcpcb *tp);
#endif /* TCP_SACK_DEBUG */
#endif /* _KERNEL */
#endif /* _NETINET_TCP_VAR_H_ */