- Estimate the amount of data in flight in sack recovery and use it
  to control the packets injected while in sack recovery (for both
  retransmissions and new data).
- Cleanups to the sack codepaths in tcp_output.c and tcp_sack.c.
- Add a new sysctl (net.inet.tcp.sack.initburst) that controls the
  number of sack retransmissions done upon initiation of sack recovery.

Submitted by:	Mohan Srinivasan <mohans@yahoo-inc.com>
This commit is contained in:
ps 2004-10-05 18:36:24 +00:00
parent 9536269a6d
commit c8e4aa1cd5
7 changed files with 83 additions and 59 deletions

View File

@ -155,6 +155,12 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
&tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
static int tcp_sack_recovery_initburst = 3;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO,
initburst, CTLFLAG_RW,
&tcp_sack_recovery_initburst, 0,
"Initial Number of Rexmits when sack recovery is set up");
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
@ -1980,9 +1986,9 @@ trimthenstep6:
tp->t_rtttime = 0;
if (tp->sack_enable) {
tcpstat.tcps_sack_recovery_episode++;
tp->snd_cwnd =
tp->t_maxseg *
tp->t_dupacks;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd =
tp->t_maxseg * tcp_sack_recovery_initburst;
(void) tcp_output(tp);
tp->snd_cwnd +=
tp->snd_ssthresh;

View File

@ -124,6 +124,7 @@ tcp_output(struct tcpcb *tp)
unsigned ipoptlen, optlen, hdrlen;
int idle, sendalot;
int i, sack_rxmit;
int sack_bytes_rxmt;
struct sackhole *p;
#if 0
int maxburst = TCP_MAXBURST;
@ -198,12 +199,16 @@ again:
* Still in sack recovery , reset rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
(p = tcp_sack_output(tp))) {
KASSERT(tp->snd_cwnd >= 0,
("%s: CWIN is negative : %ld", __func__, tp->snd_cwnd));
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
long cwin;
cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
@ -222,10 +227,10 @@ again:
goto after_sack_rexmit;
} else
/* Can rexmit part of the current hole */
len = ((long)ulmin(tp->snd_cwnd,
tp->snd_recover - p->rxmit));
len = ((long)ulmin(cwin,
tp->snd_recover - p->rxmit));
} else
len = ((long)ulmin(tp->snd_cwnd, p->end - p->rxmit));
len = ((long)ulmin(cwin, p->end - p->rxmit));
sack_rxmit = 1;
sendalot = 1;
off = p->rxmit - tp->snd_una;
@ -295,8 +300,25 @@ after_sack_rexmit:
* If sack_rxmit is true we are retransmitting from the scoreboard
* in which case len is already set.
*/
if (!sack_rxmit)
len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
if (sack_rxmit == 0) {
if (sack_bytes_rxmt == 0)
len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
else {
long cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
len = so->so_snd.sb_cc - off;
cwin = sendwin - (tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
len = lmin(len, cwin);
}
}
/*
* Lop off SYN bit if it has already been sent. However, if this
@ -850,12 +872,13 @@ send:
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (len || (flags & (TH_SYN|TH_FIN))
|| callout_active(tp->tt_persist))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
if (sack_rxmit) {
if (sack_rxmit == 0) {
if (len || (flags & (TH_SYN|TH_FIN))
|| callout_active(tp->tt_persist))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
} else {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
}
@ -956,7 +979,7 @@ send:
tp->t_flags |= TF_SENTFIN;
}
}
if (tp->sack_enable && sack_rxmit)
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
@ -981,18 +1004,9 @@ send:
* of retransmit time.
*/
timer:
if (tp->sack_enable && sack_rxmit &&
!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_max) {
callout_reset(tp->tt_rexmt, tp->t_rxtcur,
tcp_timer_rexmt, tp);
if (callout_active(tp->tt_persist)) {
callout_stop(tp->tt_persist);
tp->t_rxtshift = 0;
}
}
if (!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_una) {
((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
(tp->snd_nxt != tp->snd_una))) {
if (callout_active(tp->tt_persist)) {
callout_stop(tp->tt_persist);
tp->t_rxtshift = 0;

View File

@ -155,6 +155,12 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
&tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
static int tcp_sack_recovery_initburst = 3;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO,
initburst, CTLFLAG_RW,
&tcp_sack_recovery_initburst, 0,
"Initial Number of Rexmits when sack recovery is set up");
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
@ -1980,9 +1986,9 @@ trimthenstep6:
tp->t_rtttime = 0;
if (tp->sack_enable) {
tcpstat.tcps_sack_recovery_episode++;
tp->snd_cwnd =
tp->t_maxseg *
tp->t_dupacks;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd =
tp->t_maxseg * tcp_sack_recovery_initburst;
(void) tcp_output(tp);
tp->snd_cwnd +=
tp->snd_ssthresh;

View File

@ -164,6 +164,11 @@ struct tcphdr tcp_savetcp;
extern struct uma_zone *sack_hole_zone;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
int tcp_do_sack = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
/*
* This function is called upon receipt of new valid data (while not in header
* prediction mode), and it updates the ordered list of sacks.
@ -486,18 +491,19 @@ tcp_sack_partialack(tp, th)
{
INP_LOCK_ASSERT(tp->t_inpcb);
u_long ocwnd = tp->snd_cwnd;
int sack_bytes_rexmt = 0;
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
/*
* Set snd_cwnd to one segment beyond acknowledged offset
* (tp->snd_una has not yet been updated when this function is called.)
* Set cwnd so we can send one more segment (either rexmit based on
* scoreboard or new segment). Set cwnd to the amount of data
* rexmitted from scoreboard plus the amount of new data transmitted
* in this sack recovery episode plus one segment.
*/
/*
* Should really be
* min(tp->snd_cwnd, tp->t_maxseg + (th->th_ack - tp->snd_una))
*/
tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
(void)tcp_sack_output(tp, &sack_bytes_rexmt);
tp->snd_cwnd = sack_bytes_rexmt + (tp->snd_nxt - tp->sack_newdata) +
tp->t_maxseg;
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
@ -529,29 +535,29 @@ tcp_print_holes(struct tcpcb *tp)
* NULL otherwise.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp)
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *p;
struct sackhole *p = NULL;
INP_LOCK_ASSERT(tp->t_inpcb);
if (!tp->sack_enable)
return (NULL);
p = tp->snd_holes;
while (p) {
*sack_bytes_rexmt = 0;
for (p = tp->snd_holes; p ; p = p->next) {
if (SEQ_LT(p->rxmit, p->end)) {
if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
p = p->next;
continue;
}
#ifdef TCP_SACK_DEBUG
if (p)
tcp_print_holes(tp);
#endif
return (p);
*sack_bytes_rexmt += (p->rxmit - p->start);
break;
}
p = p->next;
*sack_bytes_rexmt += (p->rxmit - p->start);
}
return (NULL);
return (p);
}
/*
@ -588,4 +594,3 @@ tcp_sack_adjust(struct tcpcb *tp)
tp->snd_nxt = tp->rcv_lastsack;
return;
}

View File

@ -206,11 +206,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
int tcp_do_sack = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);

View File

@ -206,11 +206,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
int tcp_do_sack = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
&tcp_do_sack, 0, "Enable/Disable TCP SACK support");
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);

View File

@ -200,6 +200,8 @@ struct tcpcb {
tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
int rcv_numsacks; /* # distinct sack blks present */
struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
tcp_seq sack_newdata; /* New data xmitted in this recovery
episode starts at this seq number */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
@ -523,6 +525,7 @@ struct xtcpcb {
#ifdef _KERNEL
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_tcp);
SYSCTL_DECL(_net_inet_tcp_sack);
#endif
extern struct inpcbhead tcb; /* head of queue of active tcpcb's */
@ -617,7 +620,7 @@ void tcp_update_sack_list(struct tcpcb *tp);
void tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
void tcp_clean_sackreport(struct tcpcb *tp);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);