When looking for the next hole to retransmit from the scoreboard,

or to compute the total retransmitted bytes in this sack recovery
episode, the scoreboard is traversed. While in sack recovery, this
traversal occurs on every call to tcp_output(), every dupack and
every partial ack. The scoreboard could potentially get quite large,
making this traversal expensive.

This change optimizes this by storing hints (for the next hole to
retransmit and the total retransmitted bytes in this sack recovery
episode) reducing the complexity to find these values from O(n) to
constant time.

The debug code that sanity checks the hints against the computed
value will be removed eventually.

Submitted by:   Mohan Srinivasan, Noritoshi Demizu, Raja Mukerji.
This commit is contained in:
ps 2005-05-11 21:37:42 +00:00
parent b2be2c6b08
commit 0ee2317201
5 changed files with 134 additions and 82 deletions

View File

@ -1843,7 +1843,6 @@ trimthenstep6:
if (tp->sack_enable && IN_FASTRECOVERY(tp)) {
int data_in_pipe;
int sacked, lost_not_rexmitted;
/*
* Compute the amount of data in flight first.
@ -1851,9 +1850,8 @@ trimthenstep6:
* we have less than 1/2 the original window's
* worth of data in flight.
*/
sacked = tcp_sacked_bytes(tp, &lost_not_rexmitted);
data_in_pipe = (tp->snd_nxt - tp->snd_una) -
(sacked + lost_not_rexmitted);
data_in_pipe = (tp->snd_nxt - tp->rcv_lastsack) +
tp->sackhint.sack_bytes_rexmit;
if (data_in_pipe < tp->snd_ssthresh) {
tp->snd_cwnd += tp->t_maxseg;
if (tp->snd_cwnd > tp->snd_ssthresh)
@ -1896,6 +1894,9 @@ trimthenstep6:
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
if (tp->sack_enable) {
KASSERT(tp->sackhint.
sack_bytes_rexmit == 0,
("sackhint rexmit == 0"));
tcpstat.tcps_sack_recovery_episode++;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_maxseg;

View File

@ -863,6 +863,7 @@ send:
} else {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
tp->sackhint.sack_bytes_rexmit += len;
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
@ -1091,9 +1092,13 @@ timer:
* the TF_SENTFIN flag handles that case.
*/
if ((flags & TH_SYN) == 0) {
if (sack_rxmit)
if (sack_rxmit) {
p->rxmit -= len;
else
tp->sackhint.sack_bytes_rexmit -= len;
KASSERT(tp->sackhint.sack_bytes_rexmit
>= 0,
("sackhint bytes rtx >= 0"));
} else
tp->snd_nxt -= len;
}
}

View File

@ -1843,7 +1843,6 @@ trimthenstep6:
if (tp->sack_enable && IN_FASTRECOVERY(tp)) {
int data_in_pipe;
int sacked, lost_not_rexmitted;
/*
* Compute the amount of data in flight first.
@ -1851,9 +1850,8 @@ trimthenstep6:
* we have less than 1/2 the original window's
* worth of data in flight.
*/
sacked = tcp_sacked_bytes(tp, &lost_not_rexmitted);
data_in_pipe = (tp->snd_nxt - tp->snd_una) -
(sacked + lost_not_rexmitted);
data_in_pipe = (tp->snd_nxt - tp->rcv_lastsack) +
tp->sackhint.sack_bytes_rexmit;
if (data_in_pipe < tp->snd_ssthresh) {
tp->snd_cwnd += tp->t_maxseg;
if (tp->snd_cwnd > tp->snd_ssthresh)
@ -1896,6 +1894,9 @@ trimthenstep6:
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
if (tp->sack_enable) {
KASSERT(tp->sackhint.
sack_bytes_rexmit == 0,
("sackhint rexmit == 0"));
tcpstat.tcps_sack_recovery_episode++;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_maxseg;

View File

@ -352,6 +352,8 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
cur->rxmit = cur->start;
tp->snd_numholes = 1;
tcp_sack_globalholes++;
/* Update the sack scoreboard "cache" */
tp->sackhint.nexthole = cur;
tp->rcv_lastsack = sack.end;
TAILQ_INSERT_HEAD(&tp->snd_holes, cur, scblink);
continue; /* with next sack block */
@ -367,10 +369,17 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
cur = TAILQ_NEXT(cur, scblink);
continue;
}
tp->sackhint.sack_bytes_rexmit -=
(cur->rxmit - cur->start);
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
("sackhint bytes rtx >= 0"));
if (SEQ_LEQ(sack.start, cur->start)) {
/* Data acks at least the beginning of hole */
if (SEQ_GEQ(sack.end, cur->end)) {
/* Acks entire hole, so delete hole */
if (tp->sackhint.nexthole == cur)
tp->sackhint.nexthole =
TAILQ_NEXT(cur, scblink);
temp = cur;
cur = TAILQ_NEXT(cur, scblink);
TAILQ_REMOVE(&tp->snd_holes,
@ -379,22 +388,16 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
tp->snd_numholes--;
tcp_sack_globalholes--;
continue;
} else {
/* Move start of hole forward */
cur->start = sack.end;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
}
/* otherwise, move start of hole forward */
cur->start = sack.end;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
cur = TAILQ_NEXT(cur, scblink);
continue;
}
/* move end of hole backward */
if (SEQ_GEQ(sack.end, cur->end)) {
} else if (SEQ_GEQ(sack.end, cur->end)) {
/* Move end of hole backward */
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = TAILQ_NEXT(cur, scblink);
continue;
}
if (SEQ_LT(cur->start, sack.start) &&
SEQ_GT(cur->end, sack.end)) {
} else {
/*
* ACKs some data in middle of a hole; need to
* split current hole
@ -403,25 +406,34 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
tcp_sack_globalholes >=
tcp_sack_globalmaxholes) {
tcpstat.tcps_sack_sboverflow++;
continue;
temp = NULL;
} else {
temp = (struct sackhole *)
uma_zalloc(sack_hole_zone,
M_NOWAIT);
}
if (temp != NULL) {
temp->start = sack.end;
temp->end = cur->end;
temp->rxmit = SEQ_MAX(cur->rxmit,
temp->start);
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit,
cur->end);
tp->sackhint.sack_bytes_rexmit +=
(cur->rxmit - cur->start);
TAILQ_INSERT_AFTER(&tp->snd_holes,
cur, temp, scblink);
cur = temp;
tp->snd_numholes++;
tcp_sack_globalholes++;
}
temp = (struct sackhole *)
uma_zalloc(sack_hole_zone,M_NOWAIT);
if (temp == NULL)
continue; /* ENOBUFS */
temp->start = sack.end;
temp->end = cur->end;
temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
TAILQ_INSERT_AFTER(&tp->snd_holes,
cur, temp, scblink);
cur = TAILQ_NEXT(temp, scblink);
tp->snd_numholes++;
tcp_sack_globalholes++;
}
tp->sackhint.sack_bytes_rexmit +=
(cur->rxmit - cur->start);
cur = TAILQ_NEXT(cur, scblink);
}
/* At this point, we are at the tail of the scoreboard. */
/* At this point, we have iterated the whole scoreboard. */
if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
/* Need to append new hole at end. */
if (tp->snd_numholes >= tcp_sack_maxholes ||
@ -467,15 +479,25 @@ tcp_del_sackholes(tp, th)
while (cur)
if (SEQ_LEQ(cur->end, lastack)) {
prev = cur;
tp->sackhint.sack_bytes_rexmit -=
(cur->rxmit - cur->start);
if (tp->sackhint.nexthole == cur)
tp->sackhint.nexthole =
TAILQ_NEXT(cur, scblink);
cur = TAILQ_NEXT(cur, scblink);
TAILQ_REMOVE(&tp->snd_holes, prev, scblink);
uma_zfree(sack_hole_zone, prev);
tp->snd_numholes--;
tcp_sack_globalholes--;
} else if (SEQ_LT(cur->start, lastack)) {
if (SEQ_LT(cur->rxmit, lastack)) {
tp->sackhint.sack_bytes_rexmit -=
(cur->rxmit - cur->start);
cur->rxmit = lastack;
} else
tp->sackhint.sack_bytes_rexmit -=
(lastack - cur->start);
cur->start = lastack;
if (SEQ_LT(cur->rxmit, cur->start))
cur->rxmit = cur->start;
break;
} else
break;
@ -494,6 +516,8 @@ tcp_free_sackholes(struct tcpcb *tp)
tcp_sack_globalholes--;
}
tp->snd_numholes = 0;
tp->sackhint.nexthole = NULL;
tp->sackhint.sack_bytes_rexmit = 0;
}
/*
@ -512,7 +536,6 @@ tcp_sack_partialack(tp, th)
struct tcphdr *th;
{
int num_segs = 1;
int sack_bytes_rxmt = 0;
INP_LOCK_ASSERT(tp->t_inpcb);
callout_stop(tp->tt_rexmt);
@ -520,9 +543,9 @@ tcp_sack_partialack(tp, th)
/* send one or 2 segments based on how much new data was acked */
if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2)
num_segs = 2;
(void)tcp_sack_output(tp, &sack_bytes_rxmt);
tp->snd_cwnd = sack_bytes_rxmt + (tp->snd_nxt - tp->sack_newdata) +
num_segs * tp->t_maxseg;
tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
(tp->snd_nxt - tp->sack_newdata) +
num_segs * tp->t_maxseg);
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_flags |= TF_ACKNOW;
@ -530,17 +553,15 @@ tcp_sack_partialack(tp, th)
}
/*
* Returns pointer to a sackhole if there are any pending retransmissions;
* NULL otherwise.
* Debug version of tcp_sack_output() that walks the scoreboard. Used for
* now to sanity check the hint.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
static struct sackhole *
tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *p;
INP_LOCK_ASSERT(tp->t_inpcb);
if (!tp->sack_enable)
return (NULL);
*sack_bytes_rexmt = 0;
TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
if (SEQ_LT(p->rxmit, p->end)) {
@ -555,6 +576,55 @@ tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
return (p);
}
/*
* Returns the next hole to retransmit and the number of retransmitted bytes
* from the scoreboard. We store both the next hole and the number of
* retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
* reception). This avoids scoreboard traversals completely.
*
* The loop here will traverse *at most* one link. Here's the argument.
* For the loop to traverse more than 1 link before finding the next hole to
* retransmit, we would need to have at least 1 node following the current hint
* with (rxmit == end). But, for all holes following the current hint,
* (start == rxmit), since we have not yet retransmitted from them. Therefore,
* in order to traverse more 1 link in the loop below, we need to have at least
* one node following the current hint with (start == rxmit == end).
* But that can't happen, (start == end) means that all the data in that hole
* has been sacked, in which case, the hole would have been removed from the
* scoreboard.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *hole = NULL, *dbg_hole = NULL;
int dbg_bytes_rexmt;
INP_LOCK_ASSERT(tp->t_inpcb);
dbg_hole = tcp_sack_output_debug(tp, &dbg_bytes_rexmt);
*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
hole = tp->sackhint.nexthole;
if (hole == NULL || SEQ_LT(hole->rxmit, hole->end))
goto out;
while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
if (SEQ_LT(hole->rxmit, hole->end)) {
tp->sackhint.nexthole = hole;
break;
}
}
out:
if (dbg_hole != hole) {
printf("%s: Computed sack hole not the same as cached value\n", __func__);
hole = dbg_hole;
}
if (*sack_bytes_rexmt != dbg_bytes_rexmt) {
printf("%s: Computed sack_bytes_retransmitted (%d) not"
"the same as cached value (%d)\n",
__func__, dbg_bytes_rexmt, *sack_bytes_rexmt);
*sack_bytes_rexmt = dbg_bytes_rexmt;
}
return (hole);
}
/*
* After a timeout, the SACK list may be rebuilt. This SACK information
* should be used to avoid retransmitting SACKed data. This function
@ -590,33 +660,3 @@ tcp_sack_adjust(struct tcpcb *tp)
tp->snd_nxt = tp->rcv_lastsack;
return;
}
/*
* Calculate the number of SACKed bytes in the scoreboard by
* subtracting the amount of data accounted for in sackholes
* from the total span of the scoreboard. Also returns the
* amount of data that is "lost" and has not yet been retransmitted.
*/
int
tcp_sacked_bytes(struct tcpcb *tp, int *lost_not_rexmitted)
{
INP_LOCK_ASSERT(tp->t_inpcb);
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
int sacked = 0;
int lost = 0;
if (cur == NULL) /* Scoreboard empty. */
goto out;
if (SEQ_GEQ(tp->snd_una, tp->rcv_lastsack)) /* Scoreboard is stale. */
goto out;
sacked = tp->rcv_lastsack - cur->start;
while (cur) {
lost += (cur->end - cur->rxmit);
sacked -= (cur->end - cur->start);
cur = TAILQ_NEXT(cur, scblink);
}
out:
if (lost_not_rexmitted)
*lost_not_rexmitted = lost;
return (sacked);
}

View File

@ -63,6 +63,11 @@ struct sackhole {
TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */
};
struct sackhint {
struct sackhole *nexthole;
int sack_bytes_rexmit;
};
struct tcptemp {
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tt_t;
@ -192,6 +197,7 @@ struct tcpcb {
struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
tcp_seq sack_newdata; /* New data xmitted in this recovery
episode starts at this seq number */
struct sackhint sackhint; /* SACK scoreboard hint */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
@ -577,7 +583,6 @@ void tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
void tcp_clean_sackreport(struct tcpcb *tp);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
int tcp_sacked_bytes(struct tcpcb *tp, int *lost_not_rexmitted);
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);