Add support for TCP Selective Acknowledgements. The work for this
originated on RELENG_4 and was ported to -CURRENT. The scoreboarding code was obtained from OpenBSD, and many of the remaining changes were inspired by OpenBSD, but not taken directly from there. You can enable/disable sack using net.inet.tcp.do_sack. You can also limit the number of sack holes that all senders can have in the scoreboard with net.inet.tcp.sackhole_limit. Reviewed by: gnn Obtained from: Yahoo! (Mohan Srinivasan, Jayanth Vijayaraghavan)
This commit is contained in:
parent
89ec2c3c42
commit
6d90faf3d8
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=130989
@ -1465,6 +1465,7 @@ netinet/tcp_debug.c optional tcpdebug
|
||||
netinet/tcp_hostcache.c optional inet
|
||||
netinet/tcp_input.c optional inet
|
||||
netinet/tcp_output.c optional inet
|
||||
netinet/tcp_sack.c optional inet
|
||||
netinet/tcp_subr.c optional inet
|
||||
netinet/tcp_syncache.c optional inet
|
||||
netinet/tcp_timer.c optional inet
|
||||
|
@ -354,6 +354,7 @@ RANDOM_IP_ID
|
||||
SLIP_IFF_OPTS opt_slip.h
|
||||
TCPDEBUG
|
||||
TCP_SIGNATURE opt_inet.h
|
||||
TCP_SACK_DEBUG opt_tcp_sack.h
|
||||
TCP_DROP_SYNFIN opt_tcp_input.h
|
||||
XBONEHACK
|
||||
|
||||
|
@ -85,12 +85,15 @@ struct tcphdr {
|
||||
#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
|
||||
#define TCPOLEN_SACK_PERMITTED 2
|
||||
#define TCPOPT_SACK 5 /* Experimental */
|
||||
#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */
|
||||
#define TCPOPT_TIMESTAMP 8
|
||||
#define TCPOLEN_TIMESTAMP 10
|
||||
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
|
||||
#define TCPOPT_TSTAMP_HDR \
|
||||
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
|
||||
|
||||
#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
|
||||
|
||||
#define TCPOPT_CC 11 /* CC options: RFC-1644 */
|
||||
#define TCPOPT_CCNEW 12
|
||||
#define TCPOPT_CCECHO 13
|
||||
@ -101,6 +104,15 @@ struct tcphdr {
|
||||
#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */
|
||||
#define TCPOLEN_SIGNATURE 18
|
||||
|
||||
/* Option definitions */
|
||||
#define TCPOPT_SACK_PERMIT_HDR \
|
||||
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
|
||||
#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
|
||||
/* Miscellaneous constants */
|
||||
#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */
|
||||
#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */
|
||||
|
||||
|
||||
/*
|
||||
* Default maximum segment size for TCP.
|
||||
* With an IP MTU of 576, this is 536,
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include "opt_mac.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_input.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/kernel.h>
|
||||
@ -159,7 +160,9 @@ struct inpcbhead tcb;
|
||||
struct inpcbinfo tcbinfo;
|
||||
struct mtx *tcbinfo_mtx;
|
||||
|
||||
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
|
||||
static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
|
||||
int, int, struct tcphdr *);
|
||||
|
||||
static void tcp_pulloutofband(struct socket *,
|
||||
struct tcphdr *, struct mbuf *, int);
|
||||
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
|
||||
@ -724,7 +727,7 @@ tcp_input(m, off0)
|
||||
* present in a SYN segment. See tcp_timewait().
|
||||
*/
|
||||
if (thflags & TH_SYN)
|
||||
tcp_dooptions(&to, optp, optlen, 1);
|
||||
tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
|
||||
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
|
||||
&to, th, m, tlen))
|
||||
goto findpcb;
|
||||
@ -938,7 +941,7 @@ tcp_input(m, off0)
|
||||
tcp_trace(TA_INPUT, ostate, tp,
|
||||
(void *)tcp_saveipgen, &tcp_savetcp, 0);
|
||||
#endif
|
||||
tcp_dooptions(&to, optp, optlen, 1);
|
||||
tcp_dooptions(tp, &to, optp, optlen, 1, th);
|
||||
if (!syncache_add(&inc, &to, th, &so, m))
|
||||
goto drop;
|
||||
if (so == NULL) {
|
||||
@ -1054,7 +1057,7 @@ tcp_input(m, off0)
|
||||
* for incoming connections is handled in tcp_syncache.
|
||||
* XXX this is traditional behavior, may need to be cleaned up.
|
||||
*/
|
||||
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
|
||||
tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
|
||||
if (thflags & TH_SYN) {
|
||||
if (to.to_flags & TOF_SCALE) {
|
||||
tp->t_flags |= TF_RCVD_SCALE;
|
||||
@ -1069,6 +1072,20 @@ tcp_input(m, off0)
|
||||
tp->t_flags |= TF_RCVD_CC;
|
||||
if (to.to_flags & TOF_MSS)
|
||||
tcp_mss(tp, to.to_mss);
|
||||
if (tp->sack_enable) {
|
||||
if (!(to.to_flags & TOF_SACK))
|
||||
tp->sack_enable = 0;
|
||||
else
|
||||
tp->t_flags |= TF_SACK_PERMIT;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (tp->sack_enable) {
|
||||
/* Delete stale (cumulatively acked) SACK holes */
|
||||
tcp_del_sackholes(tp, th);
|
||||
tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
|
||||
tp->rcv_lastend = th->th_seq + tlen;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1120,9 +1137,10 @@ tcp_input(m, off0)
|
||||
if (SEQ_GT(th->th_ack, tp->snd_una) &&
|
||||
SEQ_LEQ(th->th_ack, tp->snd_max) &&
|
||||
tp->snd_cwnd >= tp->snd_wnd &&
|
||||
((!tcp_do_newreno &&
|
||||
((!tcp_do_newreno && !tp->sack_enable &&
|
||||
tp->t_dupacks < tcprexmtthresh) ||
|
||||
(tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
|
||||
((tcp_do_newreno || tp->sack_enable) &&
|
||||
!IN_FASTRECOVERY(tp)))) {
|
||||
KASSERT(headlocked, ("headlocked"));
|
||||
INP_INFO_WUNLOCK(&tcbinfo);
|
||||
/*
|
||||
@ -1218,6 +1236,9 @@ tcp_input(m, off0)
|
||||
* with nothing on the reassembly queue and
|
||||
* we have enough buffer space to take it.
|
||||
*/
|
||||
/* Clean receiver SACK report if present */
|
||||
if (tp->sack_enable && tp->rcv_numsacks)
|
||||
tcp_clean_sackreport(tp);
|
||||
++tcpstat.tcps_preddat;
|
||||
tp->rcv_nxt += tlen;
|
||||
/*
|
||||
@ -1898,7 +1919,7 @@ tcp_input(m, off0)
|
||||
th->th_ack != tp->snd_una)
|
||||
tp->t_dupacks = 0;
|
||||
else if (++tp->t_dupacks > tcprexmtthresh ||
|
||||
(tcp_do_newreno &&
|
||||
((tcp_do_newreno || tp->sack_enable) &&
|
||||
IN_FASTRECOVERY(tp))) {
|
||||
tp->snd_cwnd += tp->t_maxseg;
|
||||
(void) tcp_output(tp);
|
||||
@ -1906,7 +1927,8 @@ tcp_input(m, off0)
|
||||
} else if (tp->t_dupacks == tcprexmtthresh) {
|
||||
tcp_seq onxt = tp->snd_nxt;
|
||||
u_int win;
|
||||
if (tcp_do_newreno &&
|
||||
if ((tcp_do_newreno ||
|
||||
tp->sack_enable) &&
|
||||
SEQ_LEQ(th->th_ack,
|
||||
tp->snd_recover)) {
|
||||
tp->t_dupacks = 0;
|
||||
@ -1921,6 +1943,17 @@ tcp_input(m, off0)
|
||||
tp->snd_recover = tp->snd_max;
|
||||
callout_stop(tp->tt_rexmt);
|
||||
tp->t_rtttime = 0;
|
||||
if (tp->sack_enable) {
|
||||
tcpstat.tcps_sack_recovery_episode++;
|
||||
tp->snd_cwnd =
|
||||
tp->t_maxseg *
|
||||
tp->t_dupacks;
|
||||
(void) tcp_output(tp);
|
||||
tp->snd_cwnd =
|
||||
tp->snd_ssthresh;
|
||||
goto drop;
|
||||
}
|
||||
|
||||
tp->snd_nxt = th->th_ack;
|
||||
tp->snd_cwnd = tp->t_maxseg;
|
||||
(void) tcp_output(tp);
|
||||
@ -1971,12 +2004,16 @@ tcp_input(m, off0)
|
||||
* If the congestion window was inflated to account
|
||||
* for the other side's cached packets, retract it.
|
||||
*/
|
||||
if (tcp_do_newreno) {
|
||||
if (tcp_do_newreno || tp->sack_enable) {
|
||||
if (IN_FASTRECOVERY(tp)) {
|
||||
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
|
||||
tcp_newreno_partial_ack(tp, th);
|
||||
if (tp->sack_enable)
|
||||
tcp_sack_partialack(tp, th);
|
||||
else
|
||||
tcp_newreno_partial_ack(tp, th);
|
||||
} else {
|
||||
/*
|
||||
* Out of fast recovery.
|
||||
* Window inflation should have left us
|
||||
* with approximately snd_ssthresh
|
||||
* outstanding data.
|
||||
@ -2098,7 +2135,8 @@ tcp_input(m, off0)
|
||||
* Otherwise open linearly: maxseg per window
|
||||
* (maxseg^2 / cwnd per packet).
|
||||
*/
|
||||
if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
|
||||
if ((!tcp_do_newreno && !tp->sack_enable) ||
|
||||
!IN_FASTRECOVERY(tp)) {
|
||||
register u_int cw = tp->snd_cwnd;
|
||||
register u_int incr = tp->t_maxseg;
|
||||
if (cw > tp->snd_ssthresh)
|
||||
@ -2116,14 +2154,20 @@ tcp_input(m, off0)
|
||||
}
|
||||
sowwakeup(so);
|
||||
/* detect una wraparound */
|
||||
if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
|
||||
if ((tcp_do_newreno || tp->sack_enable) &&
|
||||
!IN_FASTRECOVERY(tp) &&
|
||||
SEQ_GT(tp->snd_una, tp->snd_recover) &&
|
||||
SEQ_LEQ(th->th_ack, tp->snd_recover))
|
||||
tp->snd_recover = th->th_ack - 1;
|
||||
if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
|
||||
if ((tcp_do_newreno || tp->sack_enable) &&
|
||||
IN_FASTRECOVERY(tp) &&
|
||||
SEQ_GEQ(th->th_ack, tp->snd_recover))
|
||||
EXIT_FASTRECOVERY(tp);
|
||||
tp->snd_una = th->th_ack;
|
||||
if (tp->sack_enable) {
|
||||
if (SEQ_GT(tp->snd_una, tp->snd_recover))
|
||||
tp->snd_recover = tp->snd_una;
|
||||
}
|
||||
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
|
||||
tp->snd_nxt = tp->snd_una;
|
||||
|
||||
@ -2327,7 +2371,8 @@ tcp_input(m, off0)
|
||||
thflags = tcp_reass(tp, th, &tlen, m);
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
}
|
||||
|
||||
if (tp->sack_enable)
|
||||
tcp_update_sack_list(tp);
|
||||
/*
|
||||
* Note the amount of data that peer has sent into
|
||||
* our window, in order to estimate the sender's
|
||||
@ -2530,11 +2575,13 @@ tcp_input(m, off0)
|
||||
* Parse TCP options and place in tcpopt.
|
||||
*/
|
||||
static void
|
||||
tcp_dooptions(to, cp, cnt, is_syn)
|
||||
tcp_dooptions(tp, to, cp, cnt, is_syn, th)
|
||||
struct tcpcb *tp;
|
||||
struct tcpopt *to;
|
||||
u_char *cp;
|
||||
u_char *cp;
|
||||
int cnt;
|
||||
int is_syn;
|
||||
struct tcphdr *th;
|
||||
{
|
||||
int opt, optlen;
|
||||
|
||||
@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn)
|
||||
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
|
||||
break;
|
||||
#endif
|
||||
case TCPOPT_SACK_PERMITTED:
|
||||
if (!tcp_do_sack ||
|
||||
optlen != TCPOLEN_SACK_PERMITTED)
|
||||
continue;
|
||||
if (is_syn) {
|
||||
/* MUST only be set on SYN */
|
||||
to->to_flags |= TOF_SACK;
|
||||
}
|
||||
break;
|
||||
|
||||
case TCPOPT_SACK:
|
||||
if (!tp || tcp_sack_option(tp, th, cp, optlen))
|
||||
continue;
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_mac.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
@ -122,6 +123,8 @@ tcp_output(struct tcpcb *tp)
|
||||
u_char opt[TCP_MAXOLEN];
|
||||
unsigned ipoptlen, optlen, hdrlen;
|
||||
int idle, sendalot;
|
||||
int i, sack_rxmit;
|
||||
struct sackhole *p;
|
||||
#if 0
|
||||
int maxburst = TCP_MAXBURST;
|
||||
#endif
|
||||
@ -171,12 +174,49 @@ tcp_output(struct tcpcb *tp)
|
||||
}
|
||||
}
|
||||
again:
|
||||
/*
|
||||
* If we've recently taken a timeout, snd_max will be greater than
|
||||
* snd_nxt. There may be SACK information that allows us to avoid
|
||||
* resending already delivered data. Adjust snd_nxt accordingly.
|
||||
*/
|
||||
if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
|
||||
tcp_sack_adjust(tp);
|
||||
sendalot = 0;
|
||||
off = tp->snd_nxt - tp->snd_una;
|
||||
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
|
||||
sendwin = min(sendwin, tp->snd_bwnd);
|
||||
|
||||
flags = tcp_outflags[tp->t_state];
|
||||
/*
|
||||
* Send any SACK-generated retransmissions. If we're explicitly trying
|
||||
* to send out new data (when sendalot is 1), bypass this function.
|
||||
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
|
||||
* we're replacing a (future) new transmission with a retransmission
|
||||
* now, and we previously incremented snd_cwnd in tcp_input().
|
||||
*/
|
||||
/*
|
||||
* Still in sack recovery , reset rxmit flag to zero.
|
||||
*/
|
||||
sack_rxmit = 0;
|
||||
len = 0;
|
||||
p = NULL;
|
||||
if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
|
||||
(p = tcp_sack_output(tp))) {
|
||||
sack_rxmit = 1;
|
||||
sendalot = 1;
|
||||
off = p->rxmit - tp->snd_una;
|
||||
KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd));
|
||||
/* Do not retransmit SACK segments beyond snd_recover */
|
||||
if (SEQ_GT(p->end, tp->snd_recover))
|
||||
len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit);
|
||||
else
|
||||
len = min(tp->snd_cwnd, p->end - p->rxmit);
|
||||
if (len > 0) {
|
||||
tcpstat.tcps_sack_rexmits++;
|
||||
tcpstat.tcps_sack_rexmit_bytes +=
|
||||
min(len, tp->t_maxseg);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Get standard flags, and add SYN or FIN if requested by 'hidden'
|
||||
* state flags.
|
||||
@ -230,9 +270,12 @@ tcp_output(struct tcpcb *tp)
|
||||
* In the normal retransmit-FIN-only case, however, snd_nxt will
|
||||
* be set to snd_una, the offset will be 0, and the length may
|
||||
* wind up 0.
|
||||
*
|
||||
* If sack_rxmit is true we are retransmitting from the scoreboard
|
||||
* in which case len is already set.
|
||||
*/
|
||||
len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off;
|
||||
|
||||
if (!sack_rxmit)
|
||||
len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
|
||||
|
||||
/*
|
||||
* Lop off SYN bit if it has already been sent. However, if this
|
||||
@ -331,6 +374,8 @@ tcp_output(struct tcpcb *tp)
|
||||
goto send;
|
||||
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
|
||||
goto send;
|
||||
if (sack_rxmit)
|
||||
goto send;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -374,7 +419,18 @@ tcp_output(struct tcpcb *tp)
|
||||
if (flags & TH_FIN &&
|
||||
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
|
||||
goto send;
|
||||
|
||||
/*
|
||||
* In SACK, it is possible for tcp_output to fail to send a segment
|
||||
* after the retransmission timer has been turned off. Make sure
|
||||
* that the retransmission timer is set.
|
||||
*/
|
||||
if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
|
||||
!callout_active(tp->tt_rexmt) &&
|
||||
!callout_active(tp->tt_persist)) {
|
||||
callout_reset(tp->tt_rexmt, tp->t_rxtcur,
|
||||
tcp_timer_rexmt, tp);
|
||||
return (0);
|
||||
}
|
||||
/*
|
||||
* TCP window updates are not reliable, rather a polling protocol
|
||||
* using ``persist'' packets is used to insure receipt of window
|
||||
@ -435,6 +491,19 @@ tcp_output(struct tcpcb *tp)
|
||||
(void)memcpy(opt + 2, &mss, sizeof(mss));
|
||||
optlen = TCPOLEN_MAXSEG;
|
||||
|
||||
/*
|
||||
* If this is the first SYN of connection (not a SYN
|
||||
* ACK), include SACK_PERMIT_HDR option. If this is a
|
||||
* SYN ACK, include SACK_PERMIT_HDR option if peer has
|
||||
* already done so. This is only for active connect,
|
||||
* since the syncache takes care of the passive connect.
|
||||
*/
|
||||
if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
|
||||
(tp->t_flags & TF_SACK_PERMIT))) {
|
||||
*((u_int32_t *) (opt + optlen)) =
|
||||
htonl(TCPOPT_SACK_PERMIT_HDR);
|
||||
optlen += 4;
|
||||
}
|
||||
if ((tp->t_flags & TF_REQ_SCALE) &&
|
||||
((flags & TH_ACK) == 0 ||
|
||||
(tp->t_flags & TF_RCVD_SCALE))) {
|
||||
@ -466,6 +535,32 @@ tcp_output(struct tcpcb *tp)
|
||||
optlen += TCPOLEN_TSTAMP_APPA;
|
||||
}
|
||||
|
||||
/*
|
||||
* Send SACKs if necessary. This should be the last option processed.
|
||||
* Only as many SACKs are sent as are permitted by the maximum options
|
||||
* size. No more than three SACKs are sent.
|
||||
*/
|
||||
if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
|
||||
(tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
|
||||
tp->rcv_numsacks) {
|
||||
u_int32_t *lp = (u_int32_t *)(opt + optlen);
|
||||
u_int32_t *olp = lp++;
|
||||
int count = 0; /* actual number of SACKs inserted */
|
||||
int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
|
||||
|
||||
tcpstat.tcps_sack_send_blocks++;
|
||||
maxsack = min(maxsack, TCP_MAX_SACK);
|
||||
for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
|
||||
struct sackblk sack = tp->sackblks[i];
|
||||
if (sack.start == 0 && sack.end == 0)
|
||||
continue;
|
||||
*lp++ = htonl(sack.start);
|
||||
*lp++ = htonl(sack.end);
|
||||
count++;
|
||||
}
|
||||
*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
|
||||
optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
|
||||
}
|
||||
/*
|
||||
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
|
||||
* options are allowed (!TF_NOOPT) and it's not a RST.
|
||||
@ -734,6 +829,10 @@ tcp_output(struct tcpcb *tp)
|
||||
th->th_seq = htonl(tp->snd_nxt);
|
||||
else
|
||||
th->th_seq = htonl(tp->snd_max);
|
||||
if (sack_rxmit) {
|
||||
th->th_seq = htonl(p->rxmit);
|
||||
p->rxmit += len;
|
||||
}
|
||||
th->th_ack = htonl(tp->rcv_nxt);
|
||||
if (optlen) {
|
||||
bcopy(opt, th + 1, optlen);
|
||||
@ -831,6 +930,8 @@ tcp_output(struct tcpcb *tp)
|
||||
tp->t_flags |= TF_SENTFIN;
|
||||
}
|
||||
}
|
||||
if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt))
|
||||
goto timer;
|
||||
tp->snd_nxt += len;
|
||||
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
|
||||
tp->snd_max = tp->snd_nxt;
|
||||
@ -853,6 +954,17 @@ tcp_output(struct tcpcb *tp)
|
||||
* Initialize shift counter which is used for backoff
|
||||
* of retransmit time.
|
||||
*/
|
||||
timer:
|
||||
if (tp->sack_enable && sack_rxmit &&
|
||||
!callout_active(tp->tt_rexmt) &&
|
||||
tp->snd_nxt != tp->snd_max) {
|
||||
callout_reset(tp->tt_rexmt, tp->t_rxtcur,
|
||||
tcp_timer_rexmt, tp);
|
||||
if (callout_active(tp->tt_persist)) {
|
||||
callout_stop(tp->tt_persist);
|
||||
tp->t_rxtshift = 0;
|
||||
}
|
||||
}
|
||||
if (!callout_active(tp->tt_rexmt) &&
|
||||
tp->snd_nxt != tp->snd_una) {
|
||||
if (callout_active(tp->tt_persist)) {
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include "opt_mac.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_input.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/kernel.h>
|
||||
@ -159,7 +160,9 @@ struct inpcbhead tcb;
|
||||
struct inpcbinfo tcbinfo;
|
||||
struct mtx *tcbinfo_mtx;
|
||||
|
||||
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
|
||||
static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
|
||||
int, int, struct tcphdr *);
|
||||
|
||||
static void tcp_pulloutofband(struct socket *,
|
||||
struct tcphdr *, struct mbuf *, int);
|
||||
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
|
||||
@ -724,7 +727,7 @@ tcp_input(m, off0)
|
||||
* present in a SYN segment. See tcp_timewait().
|
||||
*/
|
||||
if (thflags & TH_SYN)
|
||||
tcp_dooptions(&to, optp, optlen, 1);
|
||||
tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
|
||||
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
|
||||
&to, th, m, tlen))
|
||||
goto findpcb;
|
||||
@ -938,7 +941,7 @@ tcp_input(m, off0)
|
||||
tcp_trace(TA_INPUT, ostate, tp,
|
||||
(void *)tcp_saveipgen, &tcp_savetcp, 0);
|
||||
#endif
|
||||
tcp_dooptions(&to, optp, optlen, 1);
|
||||
tcp_dooptions(tp, &to, optp, optlen, 1, th);
|
||||
if (!syncache_add(&inc, &to, th, &so, m))
|
||||
goto drop;
|
||||
if (so == NULL) {
|
||||
@ -1054,7 +1057,7 @@ tcp_input(m, off0)
|
||||
* for incoming connections is handled in tcp_syncache.
|
||||
* XXX this is traditional behavior, may need to be cleaned up.
|
||||
*/
|
||||
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
|
||||
tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
|
||||
if (thflags & TH_SYN) {
|
||||
if (to.to_flags & TOF_SCALE) {
|
||||
tp->t_flags |= TF_RCVD_SCALE;
|
||||
@ -1069,6 +1072,20 @@ tcp_input(m, off0)
|
||||
tp->t_flags |= TF_RCVD_CC;
|
||||
if (to.to_flags & TOF_MSS)
|
||||
tcp_mss(tp, to.to_mss);
|
||||
if (tp->sack_enable) {
|
||||
if (!(to.to_flags & TOF_SACK))
|
||||
tp->sack_enable = 0;
|
||||
else
|
||||
tp->t_flags |= TF_SACK_PERMIT;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (tp->sack_enable) {
|
||||
/* Delete stale (cumulatively acked) SACK holes */
|
||||
tcp_del_sackholes(tp, th);
|
||||
tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
|
||||
tp->rcv_lastend = th->th_seq + tlen;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1120,9 +1137,10 @@ tcp_input(m, off0)
|
||||
if (SEQ_GT(th->th_ack, tp->snd_una) &&
|
||||
SEQ_LEQ(th->th_ack, tp->snd_max) &&
|
||||
tp->snd_cwnd >= tp->snd_wnd &&
|
||||
((!tcp_do_newreno &&
|
||||
((!tcp_do_newreno && !tp->sack_enable &&
|
||||
tp->t_dupacks < tcprexmtthresh) ||
|
||||
(tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
|
||||
((tcp_do_newreno || tp->sack_enable) &&
|
||||
!IN_FASTRECOVERY(tp)))) {
|
||||
KASSERT(headlocked, ("headlocked"));
|
||||
INP_INFO_WUNLOCK(&tcbinfo);
|
||||
/*
|
||||
@ -1218,6 +1236,9 @@ tcp_input(m, off0)
|
||||
* with nothing on the reassembly queue and
|
||||
* we have enough buffer space to take it.
|
||||
*/
|
||||
/* Clean receiver SACK report if present */
|
||||
if (tp->sack_enable && tp->rcv_numsacks)
|
||||
tcp_clean_sackreport(tp);
|
||||
++tcpstat.tcps_preddat;
|
||||
tp->rcv_nxt += tlen;
|
||||
/*
|
||||
@ -1898,7 +1919,7 @@ tcp_input(m, off0)
|
||||
th->th_ack != tp->snd_una)
|
||||
tp->t_dupacks = 0;
|
||||
else if (++tp->t_dupacks > tcprexmtthresh ||
|
||||
(tcp_do_newreno &&
|
||||
((tcp_do_newreno || tp->sack_enable) &&
|
||||
IN_FASTRECOVERY(tp))) {
|
||||
tp->snd_cwnd += tp->t_maxseg;
|
||||
(void) tcp_output(tp);
|
||||
@ -1906,7 +1927,8 @@ tcp_input(m, off0)
|
||||
} else if (tp->t_dupacks == tcprexmtthresh) {
|
||||
tcp_seq onxt = tp->snd_nxt;
|
||||
u_int win;
|
||||
if (tcp_do_newreno &&
|
||||
if ((tcp_do_newreno ||
|
||||
tp->sack_enable) &&
|
||||
SEQ_LEQ(th->th_ack,
|
||||
tp->snd_recover)) {
|
||||
tp->t_dupacks = 0;
|
||||
@ -1921,6 +1943,17 @@ tcp_input(m, off0)
|
||||
tp->snd_recover = tp->snd_max;
|
||||
callout_stop(tp->tt_rexmt);
|
||||
tp->t_rtttime = 0;
|
||||
if (tp->sack_enable) {
|
||||
tcpstat.tcps_sack_recovery_episode++;
|
||||
tp->snd_cwnd =
|
||||
tp->t_maxseg *
|
||||
tp->t_dupacks;
|
||||
(void) tcp_output(tp);
|
||||
tp->snd_cwnd =
|
||||
tp->snd_ssthresh;
|
||||
goto drop;
|
||||
}
|
||||
|
||||
tp->snd_nxt = th->th_ack;
|
||||
tp->snd_cwnd = tp->t_maxseg;
|
||||
(void) tcp_output(tp);
|
||||
@ -1971,12 +2004,16 @@ tcp_input(m, off0)
|
||||
* If the congestion window was inflated to account
|
||||
* for the other side's cached packets, retract it.
|
||||
*/
|
||||
if (tcp_do_newreno) {
|
||||
if (tcp_do_newreno || tp->sack_enable) {
|
||||
if (IN_FASTRECOVERY(tp)) {
|
||||
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
|
||||
tcp_newreno_partial_ack(tp, th);
|
||||
if (tp->sack_enable)
|
||||
tcp_sack_partialack(tp, th);
|
||||
else
|
||||
tcp_newreno_partial_ack(tp, th);
|
||||
} else {
|
||||
/*
|
||||
* Out of fast recovery.
|
||||
* Window inflation should have left us
|
||||
* with approximately snd_ssthresh
|
||||
* outstanding data.
|
||||
@ -2098,7 +2135,8 @@ tcp_input(m, off0)
|
||||
* Otherwise open linearly: maxseg per window
|
||||
* (maxseg^2 / cwnd per packet).
|
||||
*/
|
||||
if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
|
||||
if ((!tcp_do_newreno && !tp->sack_enable) ||
|
||||
!IN_FASTRECOVERY(tp)) {
|
||||
register u_int cw = tp->snd_cwnd;
|
||||
register u_int incr = tp->t_maxseg;
|
||||
if (cw > tp->snd_ssthresh)
|
||||
@ -2116,14 +2154,20 @@ tcp_input(m, off0)
|
||||
}
|
||||
sowwakeup(so);
|
||||
/* detect una wraparound */
|
||||
if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
|
||||
if ((tcp_do_newreno || tp->sack_enable) &&
|
||||
!IN_FASTRECOVERY(tp) &&
|
||||
SEQ_GT(tp->snd_una, tp->snd_recover) &&
|
||||
SEQ_LEQ(th->th_ack, tp->snd_recover))
|
||||
tp->snd_recover = th->th_ack - 1;
|
||||
if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
|
||||
if ((tcp_do_newreno || tp->sack_enable) &&
|
||||
IN_FASTRECOVERY(tp) &&
|
||||
SEQ_GEQ(th->th_ack, tp->snd_recover))
|
||||
EXIT_FASTRECOVERY(tp);
|
||||
tp->snd_una = th->th_ack;
|
||||
if (tp->sack_enable) {
|
||||
if (SEQ_GT(tp->snd_una, tp->snd_recover))
|
||||
tp->snd_recover = tp->snd_una;
|
||||
}
|
||||
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
|
||||
tp->snd_nxt = tp->snd_una;
|
||||
|
||||
@ -2327,7 +2371,8 @@ tcp_input(m, off0)
|
||||
thflags = tcp_reass(tp, th, &tlen, m);
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
}
|
||||
|
||||
if (tp->sack_enable)
|
||||
tcp_update_sack_list(tp);
|
||||
/*
|
||||
* Note the amount of data that peer has sent into
|
||||
* our window, in order to estimate the sender's
|
||||
@ -2530,11 +2575,13 @@ tcp_input(m, off0)
|
||||
* Parse TCP options and place in tcpopt.
|
||||
*/
|
||||
static void
|
||||
tcp_dooptions(to, cp, cnt, is_syn)
|
||||
tcp_dooptions(tp, to, cp, cnt, is_syn, th)
|
||||
struct tcpcb *tp;
|
||||
struct tcpopt *to;
|
||||
u_char *cp;
|
||||
u_char *cp;
|
||||
int cnt;
|
||||
int is_syn;
|
||||
struct tcphdr *th;
|
||||
{
|
||||
int opt, optlen;
|
||||
|
||||
@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn)
|
||||
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
|
||||
break;
|
||||
#endif
|
||||
case TCPOPT_SACK_PERMITTED:
|
||||
if (!tcp_do_sack ||
|
||||
optlen != TCPOLEN_SACK_PERMITTED)
|
||||
continue;
|
||||
if (is_syn) {
|
||||
/* MUST only be set on SYN */
|
||||
to->to_flags |= TOF_SACK;
|
||||
}
|
||||
break;
|
||||
|
||||
case TCPOPT_SACK:
|
||||
if (!tp || tcp_sack_option(tp, th, cp, optlen))
|
||||
continue;
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
592
sys/netinet/tcp_sack.c
Normal file
592
sys/netinet/tcp_sack.c
Normal file
@ -0,0 +1,592 @@
|
||||
/*
|
||||
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 4. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
|
||||
* $FreeBSD$
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
|
||||
*
|
||||
* NRL grants permission for redistribution and use in source and binary
|
||||
* forms, with or without modification, of the software and documentation
|
||||
* created at NRL provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgements:
|
||||
* This product includes software developed by the University of
|
||||
* California, Berkeley and its contributors.
|
||||
* This product includes software developed at the Information
|
||||
* Technology Division, US Naval Research Laboratory.
|
||||
* 4. Neither the name of the NRL nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
|
||||
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* The views and conclusions contained in the software and documentation
|
||||
* are those of the authors and should not be interpreted as representing
|
||||
* official policies, either expressed or implied, of the US Naval
|
||||
* Research Laboratory (NRL).
|
||||
*/
|
||||
#include "opt_ipfw.h" /* for ipfw_fwd */
|
||||
#include "opt_inet.h"
|
||||
#include "opt_inet6.h"
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_input.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/mbuf.h>
|
||||
#include <sys/proc.h> /* for proc0 declaration */
|
||||
#include <sys/protosw.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/socketvar.h>
|
||||
#include <sys/syslog.h>
|
||||
#include <sys/systm.h>
|
||||
|
||||
#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
|
||||
|
||||
#include <vm/uma.h>
|
||||
|
||||
#include <net/if.h>
|
||||
#include <net/route.h>
|
||||
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/in_systm.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
|
||||
#include <netinet/in_var.h>
|
||||
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
|
||||
#include <netinet/in_pcb.h>
|
||||
#include <netinet/ip_var.h>
|
||||
#include <netinet/ip6.h>
|
||||
#include <netinet/icmp6.h>
|
||||
#include <netinet6/nd6.h>
|
||||
#include <netinet6/ip6_var.h>
|
||||
#include <netinet6/in6_pcb.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/tcp_fsm.h>
|
||||
#include <netinet/tcp_seq.h>
|
||||
#include <netinet/tcp_timer.h>
|
||||
#include <netinet/tcp_var.h>
|
||||
#include <netinet6/tcp6_var.h>
|
||||
#include <netinet/tcpip.h>
|
||||
#ifdef TCPDEBUG
|
||||
#include <netinet/tcp_debug.h>
|
||||
|
||||
u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
|
||||
struct tcphdr tcp_savetcp;
|
||||
#endif /* TCPDEBUG */
|
||||
|
||||
#ifdef FAST_IPSEC
|
||||
#include <netipsec/ipsec.h>
|
||||
#include <netipsec/ipsec6.h>
|
||||
#endif
|
||||
|
||||
#ifdef IPSEC
|
||||
#include <netinet6/ipsec.h>
|
||||
#include <netinet6/ipsec6.h>
|
||||
#include <netkey/key.h>
|
||||
#endif /*IPSEC*/
|
||||
#include <machine/in_cksum.h>
|
||||
|
||||
extern struct uma_zone *sack_hole_zone;
|
||||
|
||||
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 *
 * Input: tp->rcv_laststart/tp->rcv_lastend describe the segment just
 * received (presumably set by the caller before this runs -- TODO confirm
 * against tcp_input()).  On return, tp->sackblks[0] holds the block
 * covering that segment, per the RFC 2018 "most recent first" rule.
 */
void
tcp_update_sack_list(tp)
	struct tcpcb *tp;
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	int i, j = 0, count = 0, lastpos = -1;
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * First clean up current list of sacks: drop zeroed (deleted)
	 * entries and any block already covered by the cumulative ack
	 * (end <= rcv_nxt); surviving blocks are staged in temp[].
	 */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (sack.start == 0 && sack.end == 0) {
			count++;	/* count = number of blocks to be discarded */
			continue;
		}
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = tp->rcv_laststart;
			tp->sackblks[0].end = tp->rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
		return;		/* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 * Overlapping entries are zeroed out in place; firstsack grows to
	 * absorb them.
	 */
	firstsack.start = tp->rcv_laststart;
	firstsack.end = tp->rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end){
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i;	/* last posn with a zero entry */
			continue;
		}
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start;	/* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end;	/* merge blocks */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i;	/* last posn with a zero entry */
	}
	if (lastpos != -1) {	/* at least one merge */
		/* Compact the list: skip zeroed slots, leave slot 0 free. */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++)	/* now copy back */
			tp->sackblks[i] = temp[i];
	} else {	/* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	/* Newest (possibly merged) block always reported first. */
	tp->sackblks[0] = firstsack;
	return;
}
|
||||
|
||||
/*
|
||||
* Delete all receiver-side SACK information.
|
||||
*/
|
||||
void
|
||||
tcp_clean_sackreport(tp)
|
||||
struct tcpcb *tp;
|
||||
{
|
||||
int i;
|
||||
|
||||
INP_LOCK_ASSERT(tp->t_inpcb);
|
||||
tp->rcv_numsacks = 0;
|
||||
for (i = 0; i < MAX_SACK_BLKS; i++)
|
||||
tp->sackblks[i].start = tp->sackblks[i].end=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Process the TCP SACK option. Returns 1 if tcp_dooptions() should continue,
|
||||
* and 0 otherwise, if the option was fine. tp->snd_holes is an ordered list
|
||||
* of holes (oldest to newest, in terms of the sequence space).
|
||||
*/
|
||||
int
|
||||
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
|
||||
{
|
||||
int tmp_olen;
|
||||
u_char *tmp_cp;
|
||||
struct sackhole *cur, *p, *temp;
|
||||
|
||||
INP_LOCK_ASSERT(tp->t_inpcb);
|
||||
if (!tp->sack_enable)
|
||||
return (1);
|
||||
|
||||
/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
|
||||
if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
|
||||
return (1);
|
||||
tmp_cp = cp + 2;
|
||||
tmp_olen = optlen - 2;
|
||||
tcpstat.tcps_sack_rcv_blocks++;
|
||||
if (tp->snd_numholes < 0)
|
||||
tp->snd_numholes = 0;
|
||||
if (tp->t_maxseg == 0)
|
||||
panic("tcp_sack_option"); /* Should never happen */
|
||||
while (tmp_olen > 0) {
|
||||
struct sackblk sack;
|
||||
|
||||
bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
|
||||
sack.start = ntohl(sack.start);
|
||||
bcopy(tmp_cp + sizeof(tcp_seq),
|
||||
(char *) &(sack.end), sizeof(tcp_seq));
|
||||
sack.end = ntohl(sack.end);
|
||||
tmp_olen -= TCPOLEN_SACK;
|
||||
tmp_cp += TCPOLEN_SACK;
|
||||
if (SEQ_LEQ(sack.end, sack.start))
|
||||
continue; /* bad SACK fields */
|
||||
if (SEQ_LEQ(sack.end, tp->snd_una))
|
||||
continue; /* old block */
|
||||
if (SEQ_GT(th->th_ack, tp->snd_una)) {
|
||||
if (SEQ_LT(sack.start, th->th_ack))
|
||||
continue;
|
||||
}
|
||||
if (SEQ_GT(sack.end, tp->snd_max))
|
||||
continue;
|
||||
if (tp->snd_holes == NULL) { /* first hole */
|
||||
tp->snd_holes = (struct sackhole *)
|
||||
uma_zalloc(sack_hole_zone,M_NOWAIT);
|
||||
if (tp->snd_holes == NULL) {
|
||||
/* ENOBUFS, so ignore SACKed block for now*/
|
||||
continue;
|
||||
}
|
||||
cur = tp->snd_holes;
|
||||
cur->start = th->th_ack;
|
||||
cur->end = sack.start;
|
||||
cur->rxmit = cur->start;
|
||||
cur->next = NULL;
|
||||
tp->snd_numholes = 1;
|
||||
tp->rcv_lastsack = sack.end;
|
||||
continue; /* with next sack block */
|
||||
}
|
||||
/* Go thru list of holes: p = previous, cur = current */
|
||||
p = cur = tp->snd_holes;
|
||||
while (cur) {
|
||||
if (SEQ_LEQ(sack.end, cur->start))
|
||||
/* SACKs data before the current hole */
|
||||
break; /* no use going through more holes */
|
||||
if (SEQ_GEQ(sack.start, cur->end)) {
|
||||
/* SACKs data beyond the current hole */
|
||||
p = cur;
|
||||
cur = cur->next;
|
||||
continue;
|
||||
}
|
||||
if (SEQ_LEQ(sack.start, cur->start)) {
|
||||
/* Data acks at least the beginning of hole */
|
||||
if (SEQ_GEQ(sack.end, cur->end)) {
|
||||
/* Acks entire hole, so delete hole */
|
||||
if (p != cur) {
|
||||
p->next = cur->next;
|
||||
uma_zfree(sack_hole_zone, cur);
|
||||
cur = p->next;
|
||||
} else {
|
||||
cur = cur->next;
|
||||
uma_zfree(sack_hole_zone, p);
|
||||
p = cur;
|
||||
tp->snd_holes = p;
|
||||
}
|
||||
tp->snd_numholes--;
|
||||
continue;
|
||||
}
|
||||
/* otherwise, move start of hole forward */
|
||||
cur->start = sack.end;
|
||||
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
|
||||
p = cur;
|
||||
cur = cur->next;
|
||||
continue;
|
||||
}
|
||||
/* move end of hole backward */
|
||||
if (SEQ_GEQ(sack.end, cur->end)) {
|
||||
cur->end = sack.start;
|
||||
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
|
||||
p = cur;
|
||||
cur = cur->next;
|
||||
continue;
|
||||
}
|
||||
if (SEQ_LT(cur->start, sack.start) &&
|
||||
SEQ_GT(cur->end, sack.end)) {
|
||||
/*
|
||||
* ACKs some data in middle of a hole; need to
|
||||
* split current hole
|
||||
*/
|
||||
temp = (struct sackhole *)
|
||||
uma_zalloc(sack_hole_zone,M_NOWAIT);
|
||||
if (temp == NULL)
|
||||
continue; /* ENOBUFS */
|
||||
temp->next = cur->next;
|
||||
temp->start = sack.end;
|
||||
temp->end = cur->end;
|
||||
temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
|
||||
cur->end = sack.start;
|
||||
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
|
||||
cur->next = temp;
|
||||
p = temp;
|
||||
cur = p->next;
|
||||
tp->snd_numholes++;
|
||||
}
|
||||
}
|
||||
/* At this point, p points to the last hole on the list */
|
||||
if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
|
||||
/*
|
||||
* Need to append new hole at end.
|
||||
* Last hole is p (and it's not NULL).
|
||||
*/
|
||||
temp = (struct sackhole *)
|
||||
uma_zalloc(sack_hole_zone,M_NOWAIT);
|
||||
if (temp == NULL)
|
||||
continue; /* ENOBUFS */
|
||||
temp->start = tp->rcv_lastsack;
|
||||
temp->end = sack.start;
|
||||
temp->rxmit = temp->start;
|
||||
temp->next = 0;
|
||||
p->next = temp;
|
||||
tp->rcv_lastsack = sack.end;
|
||||
tp->snd_numholes++;
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
 * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
 * it is completely acked; otherwise, tcp_sack_option(), called from
 * tcp_dooptions(), will fix up the hole.
 *
 * The hole list is ordered in sequence space, so only a prefix of the
 * list can be below the cumulative ack; we free that prefix, trim the
 * first partially-acked hole (if any), and re-anchor tp->snd_holes.
 */
void
tcp_del_sackholes(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	INP_LOCK_ASSERT(tp->t_inpcb);
	if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
			th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev;
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				/* Hole fully acked: free and keep walking. */
				prev = cur;
				cur = cur->next;
				uma_zfree(sack_hole_zone, prev);
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				/* Partially acked: trim front and stop. */
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;	/* rest of list is above lastack */
		/* cur is the first surviving hole (or NULL). */
		tp->snd_holes = cur;
	}
}
|
||||
|
||||
void
|
||||
tcp_free_sackholes(struct tcpcb *tp)
|
||||
{
|
||||
struct sackhole *p, *q;
|
||||
|
||||
INP_LOCK_ASSERT(tp->t_inpcb);
|
||||
q = tp->snd_holes;
|
||||
while (q != NULL) {
|
||||
p = q;
|
||||
q = q->next;
|
||||
uma_zfree(sack_hole_zone, p);
|
||||
}
|
||||
tp->snd_holes = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks for partial ack. If partial ack arrives, turn off retransmission
|
||||
* timer, deflate the window, do not clear tp->t_dupacks, and return 1.
|
||||
* If the ack advances at least to tp->snd_recover, return 0.
|
||||
*/
|
||||
void
|
||||
tcp_sack_partialack(tp, th)
|
||||
struct tcpcb *tp;
|
||||
struct tcphdr *th;
|
||||
{
|
||||
INP_LOCK_ASSERT(tp->t_inpcb);
|
||||
u_long ocwnd = tp->snd_cwnd;
|
||||
|
||||
callout_stop(tp->tt_rexmt);
|
||||
tp->t_rtttime = 0;
|
||||
/*
|
||||
* Set snd_cwnd to one segment beyond acknowledged offset
|
||||
* (tp->snd_una has not yet been updated when this function is called.)
|
||||
*/
|
||||
/*
|
||||
* Should really be
|
||||
* min(tp->snd_cwnd, tp->t_maxseg + (th->th_ack - tp->snd_una))
|
||||
*/
|
||||
tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
(void) tcp_output(tp);
|
||||
tp->snd_cwnd = ocwnd;
|
||||
/*
|
||||
* Partial window deflation. Relies on fact that tp->snd_una
|
||||
* not updated yet.
|
||||
*/
|
||||
tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
|
||||
}
|
||||
|
||||
#ifdef TCP_SACK_DEBUG
/*
 * Debug helper: dump the sender's SACK hole list (start, end, and next
 * retransmit point of each hole) to the console.
 */
void
tcp_print_holes(struct tcpcb *tp)
{
	struct sackhole *hole;

	if (tp->snd_holes == 0)
		return;
	printf("Hole report: start--end dups rxmit\n");
	for (hole = tp->snd_holes; hole != NULL; hole = hole->next)
		printf("%x--%x r %x\n", hole->start, hole->end, hole->rxmit);
	printf("\n");
}
#endif /* TCP_SACK_DEBUG */
|
||||
|
||||
/*
 * Returns pointer to a sackhole if there are any pending retransmissions;
 * NULL otherwise.
 *
 * A hole is "pending" when its rxmit point has not yet reached its end.
 * Holes whose rxmit point fell below snd_una are stale (already covered
 * by the cumulative ack) and are skipped; tcp_del_sackholes() will
 * eventually reap them.
 */
struct sackhole *
tcp_sack_output(struct tcpcb *tp)
{
	struct sackhole *p;

	INP_LOCK_ASSERT(tp->t_inpcb);
	if (!tp->sack_enable)
		return (NULL);
	p = tp->snd_holes;
	while (p) {
		if (SEQ_LT(p->rxmit, p->end)) {
			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
				p = p->next;
				continue;
			}
#ifdef TCP_SACK_DEBUG
			if (p)
				tcp_print_holes(tp);
#endif
			return (p);	/* first hole with data to resend */
		}
		p = p->next;
	}
	return (NULL);
}
|
||||
|
||||
/*
|
||||
* After a timeout, the SACK list may be rebuilt. This SACK information
|
||||
* should be used to avoid retransmitting SACKed data. This function
|
||||
* traverses the SACK list to see if snd_nxt should be moved forward.
|
||||
*/
|
||||
void
|
||||
tcp_sack_adjust(struct tcpcb *tp)
|
||||
{
|
||||
INP_LOCK_ASSERT(tp->t_inpcb);
|
||||
struct sackhole *cur = tp->snd_holes;
|
||||
if (cur == NULL)
|
||||
return; /* No holes */
|
||||
if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
|
||||
return; /* We're already beyond any SACKed blocks */
|
||||
/*
|
||||
* Two cases for which we want to advance snd_nxt:
|
||||
* i) snd_nxt lies between end of one hole and beginning of another
|
||||
* ii) snd_nxt lies between end of last hole and rcv_lastsack
|
||||
*/
|
||||
while (cur->next) {
|
||||
if (SEQ_LT(tp->snd_nxt, cur->end))
|
||||
return;
|
||||
if (SEQ_GEQ(tp->snd_nxt, cur->next->start))
|
||||
cur = cur->next;
|
||||
else {
|
||||
tp->snd_nxt = cur->next->start;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (SEQ_LT(tp->snd_nxt, cur->end))
|
||||
return;
|
||||
tp->snd_nxt = tp->rcv_lastsack;
|
||||
return;
|
||||
}
|
||||
|
@ -42,6 +42,9 @@
|
||||
#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
|
||||
#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
|
||||
|
||||
#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
|
||||
#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
|
||||
|
||||
/* for modulo comparisons of timestamps */
|
||||
#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
|
||||
#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
|
||||
|
@ -36,6 +36,7 @@
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_mac.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20;
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
|
||||
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
|
||||
|
||||
|
||||
int tcp_do_sack = 1;
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
|
||||
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
|
||||
|
||||
int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
|
||||
&tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
|
||||
|
||||
uma_zone_t sack_hole_zone;
|
||||
|
||||
static struct inpcb *tcp_notify(struct inpcb *, int);
|
||||
static void tcp_discardcb(struct tcpcb *);
|
||||
static void tcp_isn_tick(void *);
|
||||
@ -292,6 +304,8 @@ tcp_init()
|
||||
tcp_isn_tick(NULL);
|
||||
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
|
||||
SHUTDOWN_PRI_DEFAULT);
|
||||
sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
|
||||
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
||||
}
|
||||
|
||||
void
|
||||
@ -606,6 +620,7 @@ tcp_newtcpcb(inp)
|
||||
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
|
||||
if (tcp_do_rfc1644)
|
||||
tp->t_flags |= TF_REQ_CC;
|
||||
tp->sack_enable = tcp_do_sack;
|
||||
tp->t_inpcb = inp; /* XXX */
|
||||
/*
|
||||
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
|
||||
@ -739,6 +754,7 @@ tcp_discardcb(tp)
|
||||
tp->t_segqlen--;
|
||||
tcp_reass_qsize--;
|
||||
}
|
||||
tcp_free_sackholes(tp);
|
||||
inp->inp_ppcb = NULL;
|
||||
tp->t_inpcb = NULL;
|
||||
uma_zfree(tcpcb_zone, tp);
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_mac.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
@ -704,7 +705,10 @@ syncache_socket(sc, lso, m)
|
||||
if (sc->sc_flags & SCF_SIGNATURE)
|
||||
tp->t_flags |= TF_SIGNATURE;
|
||||
#endif
|
||||
|
||||
if (sc->sc_flags & SCF_SACK) {
|
||||
tp->sack_enable = 1;
|
||||
tp->t_flags |= TF_SACK_PERMIT;
|
||||
}
|
||||
/*
|
||||
* Set up MSS and get cached values from tcp_hostcache.
|
||||
* This might overwrite some of the defaults we just set.
|
||||
@ -991,6 +995,9 @@ syncache_add(inc, to, th, sop, m)
|
||||
sc->sc_flags = SCF_SIGNATURE;
|
||||
#endif
|
||||
|
||||
if (to->to_flags & TOF_SACK)
|
||||
sc->sc_flags |= SCF_SACK;
|
||||
|
||||
/*
|
||||
* XXX
|
||||
* We have the option here of not doing TAO (even if the segment
|
||||
@ -1107,6 +1114,7 @@ syncache_respond(sc, m)
|
||||
optlen += (sc->sc_flags & SCF_SIGNATURE) ?
|
||||
TCPOLEN_SIGNATURE + 2 : 0;
|
||||
#endif
|
||||
optlen += ((sc->sc_flags & SCF_SACK) ? 4 : 0);
|
||||
}
|
||||
tlen = hlen + sizeof(struct tcphdr) + optlen;
|
||||
|
||||
@ -1244,6 +1252,11 @@ syncache_respond(sc, m)
|
||||
optp += TCPOLEN_SIGNATURE + 2;
|
||||
}
|
||||
#endif /* TCP_SIGNATURE */
|
||||
|
||||
if (sc->sc_flags & SCF_SACK) {
|
||||
*(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR);
|
||||
optp += 4;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INET6
|
||||
|
@ -32,6 +32,7 @@
|
||||
|
||||
#include "opt_inet6.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/kernel.h>
|
||||
@ -217,6 +218,7 @@ tcp_timer_2msl(xtp)
|
||||
return;
|
||||
}
|
||||
INP_LOCK(inp);
|
||||
tcp_free_sackholes(tp);
|
||||
if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) {
|
||||
INP_UNLOCK(tp->t_inpcb);
|
||||
INP_INFO_WUNLOCK(&tcbinfo);
|
||||
@ -497,6 +499,7 @@ tcp_timer_rexmt(xtp)
|
||||
return;
|
||||
}
|
||||
callout_deactivate(tp->tt_rexmt);
|
||||
tcp_free_sackholes(tp);
|
||||
/*
|
||||
* Retransmission timer went off. Message has not
|
||||
* been acked within retransmit interval. Back off
|
||||
|
@ -36,6 +36,7 @@
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_mac.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_tcp_sack.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20;
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
|
||||
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
|
||||
|
||||
|
||||
int tcp_do_sack = 1;
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
|
||||
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
|
||||
|
||||
int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
|
||||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
|
||||
&tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
|
||||
|
||||
uma_zone_t sack_hole_zone;
|
||||
|
||||
static struct inpcb *tcp_notify(struct inpcb *, int);
|
||||
static void tcp_discardcb(struct tcpcb *);
|
||||
static void tcp_isn_tick(void *);
|
||||
@ -292,6 +304,8 @@ tcp_init()
|
||||
tcp_isn_tick(NULL);
|
||||
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
|
||||
SHUTDOWN_PRI_DEFAULT);
|
||||
sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
|
||||
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
||||
}
|
||||
|
||||
void
|
||||
@ -606,6 +620,7 @@ tcp_newtcpcb(inp)
|
||||
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
|
||||
if (tcp_do_rfc1644)
|
||||
tp->t_flags |= TF_REQ_CC;
|
||||
tp->sack_enable = tcp_do_sack;
|
||||
tp->t_inpcb = inp; /* XXX */
|
||||
/*
|
||||
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
|
||||
@ -739,6 +754,7 @@ tcp_discardcb(tp)
|
||||
tp->t_segqlen--;
|
||||
tcp_reass_qsize--;
|
||||
}
|
||||
tcp_free_sackholes(tp);
|
||||
inp->inp_ppcb = NULL;
|
||||
tp->t_inpcb = NULL;
|
||||
uma_zfree(tcpcb_zone, tp);
|
||||
|
@ -52,6 +52,17 @@ LIST_HEAD(tsegqe_head, tseg_qent);
|
||||
extern int tcp_reass_qsize;
|
||||
extern struct uma_zone *tcp_reass_zone;
|
||||
|
||||
struct sackblk {
|
||||
tcp_seq start; /* start seq no. of sack block */
|
||||
tcp_seq end; /* end seq no. */
|
||||
};
|
||||
|
||||
struct sackhole {
|
||||
tcp_seq start; /* start seq no. of hole */
|
||||
tcp_seq end; /* end seq no. */
|
||||
tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
|
||||
struct sackhole *next; /* next in list */
|
||||
};
|
||||
struct tcptemp {
|
||||
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
|
||||
struct tcphdr tt_t;
|
||||
@ -179,6 +190,16 @@ struct tcpcb {
|
||||
u_long rcv_second; /* start of interval second */
|
||||
u_long rcv_pps; /* received packets per second */
|
||||
u_long rcv_byps; /* received bytes per second */
|
||||
/* SACK related state */
|
||||
int sack_enable; /* enable SACK for this connection */
|
||||
int snd_numholes; /* number of holes seen by sender */
|
||||
struct sackhole *snd_holes; /* linked list of holes (sorted) */
|
||||
|
||||
tcp_seq rcv_laststart; /* start of last segment recd. */
|
||||
tcp_seq rcv_lastend; /* end of ... */
|
||||
tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
|
||||
int rcv_numsacks; /* # distinct sack blks present */
|
||||
struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
|
||||
};
|
||||
|
||||
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
|
||||
@ -216,6 +237,7 @@ struct tcpopt {
|
||||
#define TOF_SCALE 0x0020
|
||||
#define TOF_SIGNATURE 0x0040 /* signature option present */
|
||||
#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */
|
||||
#define TOF_SACK 0x0100 /* Peer sent SACK option */
|
||||
u_int32_t to_tsval;
|
||||
u_int32_t to_tsecr;
|
||||
tcp_cc to_cc; /* holds CC or CCnew */
|
||||
@ -249,6 +271,7 @@ struct syncache {
|
||||
#define SCF_CC 0x08 /* negotiated CC */
|
||||
#define SCF_UNREACH 0x10 /* icmp unreachable received */
|
||||
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
|
||||
#define SCF_SACK 0x80 /* send SACK option */
|
||||
TAILQ_ENTRY(syncache) sc_hash;
|
||||
TAILQ_ENTRY(syncache) sc_timerq;
|
||||
};
|
||||
@ -434,6 +457,13 @@ struct tcpstat {
|
||||
|
||||
u_long tcps_hc_added; /* entry added to hostcache */
|
||||
u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
|
||||
|
||||
/* SACK related stats */
|
||||
u_long tcps_sack_recovery_episode; /* SACK recovery episodes */
|
||||
u_long tcps_sack_rexmits; /* SACK rexmit segments */
|
||||
u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
|
||||
u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
|
||||
u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
|
||||
};
|
||||
|
||||
/*
|
||||
@ -467,7 +497,8 @@ struct xtcpcb {
|
||||
#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
|
||||
#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
|
||||
#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
|
||||
#define TCPCTL_MAXID 14
|
||||
#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */
|
||||
#define TCPCTL_MAXID 15
|
||||
|
||||
#define TCPCTL_NAMES { \
|
||||
{ 0, 0 }, \
|
||||
@ -505,6 +536,8 @@ extern int path_mtu_discovery;
|
||||
extern int ss_fltsz;
|
||||
extern int ss_fltsz_local;
|
||||
|
||||
extern int tcp_do_sack; /* SACK enabled/disabled */
|
||||
|
||||
void tcp_canceltimers(struct tcpcb *);
|
||||
struct tcpcb *
|
||||
tcp_close(struct tcpcb *);
|
||||
@ -578,6 +611,20 @@ extern u_long tcp_sendspace;
|
||||
extern u_long tcp_recvspace;
|
||||
tcp_seq tcp_new_isn(struct tcpcb *);
|
||||
|
||||
int tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int);
|
||||
void tcp_update_sack_list(struct tcpcb *tp);
|
||||
void tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
|
||||
void tcp_clean_sackreport(struct tcpcb *tp);
|
||||
void tcp_sack_adjust(struct tcpcb *tp);
|
||||
struct sackhole *tcp_sack_output(struct tcpcb *tp);
|
||||
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
|
||||
void tcp_free_sackholes(struct tcpcb *tp);
|
||||
int tcp_newreno(struct tcpcb *, struct tcphdr *);
|
||||
u_long tcp_seq_subtract(u_long, u_long );
|
||||
#ifdef TCP_SACK_DEBUG
|
||||
void tcp_print_holes(struct tcpcb *tp);
|
||||
#endif /* TCP_SACK_DEBUG */
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
||||
#endif /* _NETINET_TCP_VAR_H_ */
|
||||
|
Loading…
Reference in New Issue
Block a user