tcp: Rack rwnd collapse.

Currently, when the peer collapses its rwnd, we mark the collapsed packets
for retransmission and set the must_retran flags, just as we do when the path
MTU shrinks. However, this causes a problem with some middleboxes that
manipulate the rwnd to control flow: as soon as the rwnd increases we start
resending, possibly less than an RTT after the original send, and the peer
may in fact already have received the packets. This means we gratuitously
retransmit packets we should not.

The fix here is to make sure that a RACK threshold of time has passed before
retransmitting the packets. This ensures that the rwnd collapse was real and
that the packets actually need retransmission.
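In outline, the gate works like the sketch below (illustrative names only,
not the committed API; the real check is rack_check_collapsed() in the diff
that follows): a segment beyond the collapse point is retransmitted only once
a RACK threshold has elapsed since its last send.

#include <stdint.h>

/*
 * Illustrative sketch only -- not the committed code.  A segment that
 * fell beyond the peer's collapsed rwnd becomes eligible for retransmit
 * only after a RACK threshold has elapsed since it was last sent; see
 * rack_check_collapsed() below for the actual logic.
 */
int
collapsed_segment_rxt_ok(uint32_t cts,	/* current time, usecs */
    uint32_t last_sent_ts,		/* time of last (re)send, usecs */
    uint32_t rack_thresh)		/* RACK reorder threshold, usecs */
{
	/* Unsigned subtraction handles timestamp wrap. */
	return ((cts - last_sent_ts) > rack_thresh);
}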

Reviewed by: tuexen
Sponsored by: Netflix Inc
Differential Revision: https://reviews.freebsd.org/D35166
Randall Stewart  2022-08-23 09:17:05 -04:00
commit 62ce18fc9a (parent 4e0ce82b53)
3 changed files with 300 additions and 126 deletions


@@ -236,7 +236,9 @@ enum tcp_log_events {
TCP_LOG_FSB, /* FSB information 63 */
RACK_DSACK_HANDLING, /* Handling of DSACK in rack for reordering window 64 */
TCP_HYSTART, /* TCP Hystart logging 65 */
TCP_LOG_END /* End (keep at end) 66 */
TCP_CHG_QUERY, /* Change query during fnc_init() 66 */
TCP_RACK_LOG_COLLAPSE, /* Window collapse by peer 67 */
TCP_LOG_END /* End (keep at end) 68 */
};
enum tcp_log_states {


@@ -385,6 +385,9 @@ counter_u64_t rack_move_some;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_collapsed_win_seen;
counter_u64_t rack_collapsed_win_rxt;
counter_u64_t rack_collapsed_win_rxt_bytes;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
@@ -790,6 +793,9 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_move_some);
counter_u64_zero(rack_try_scwnd);
counter_u64_zero(rack_collapsed_win);
counter_u64_zero(rack_collapsed_win_rxt);
counter_u64_zero(rack_collapsed_win_seen);
counter_u64_zero(rack_collapsed_win_rxt_bytes);
}
rack_clear_counter = 0;
return (0);
@@ -1757,12 +1763,31 @@ rack_init_sysctls(void)
OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
&rack_input_idle_reduces,
"Total number of idle reductions on input");
rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
&rack_collapsed_win_seen,
"Total number of collapsed window events seen (where our window shrinks)");
rack_collapsed_win = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "collapsed_win", CTLFLAG_RD,
&rack_collapsed_win,
"Total number of collapsed windows");
"Total number of collapsed window events where we mark packets");
rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
&rack_collapsed_win_rxt,
"Total number of packets that were retransmitted");
rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
&rack_collapsed_win_rxt_bytes,
"Total number of bytes that were retransmitted");
rack_try_scwnd = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -2772,6 +2797,9 @@ rack_counter_destroy(void)
counter_u64_free(rack_sack_splits);
counter_u64_free(rack_input_idle_reduces);
counter_u64_free(rack_collapsed_win);
counter_u64_free(rack_collapsed_win_rxt);
counter_u64_free(rack_collapsed_win_rxt_bytes);
counter_u64_free(rack_collapsed_win_seen);
counter_u64_free(rack_try_scwnd);
counter_u64_free(rack_persists_sends);
counter_u64_free(rack_persists_acks);
@@ -5295,7 +5323,9 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_
goto activate_rxt;
}
/* Convert from ms to usecs */
if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
if ((rsm->r_flags & RACK_SACK_PASSED) ||
(rsm->r_flags & RACK_RWND_COLLAPSED) ||
(rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
if ((tp->t_flags & TF_SENTFIN) &&
((tp->snd_max - tp->snd_una) == 1) &&
(rsm->r_flags & RACK_HAS_FIN)) {
@@ -5757,7 +5787,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* real pacing. And the tlp or rxt is smaller
* than the pacing calculation. Lets not
* pace that long since we know the calculation
* so far is not accurate.
*/
slot = hpts_timeout;
}
@@ -6501,7 +6531,7 @@ rack_remxt_tmr(struct tcpcb *tp)
trsm = rsm;
if (rsm->r_flags & RACK_ACKED)
rsm->r_flags |= RACK_WAS_ACKED;
rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
rsm->r_flags |= RACK_MUST_RXT;
}
/* Clear the count (we just un-acked them) */
@@ -8040,6 +8070,13 @@ rack_log_sack_passed(struct tcpcb *tp,
*/
continue;
}
if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
/*
* If the peer dropped the rwnd on
* these then we don't worry about them.
*/
continue;
}
if (nrsm->r_flags & RACK_SACK_PASSED) {
/*
* We found one that is already marked
@@ -9797,7 +9834,7 @@ rack_strike_dupack(struct tcp_rack *rack)
/* Sendmap entries that are marked to
* be retransmitted do not need dupack's
* struck. We get these marks for a number
* of reasons (rxt timeout with no sack,
* mtu change, or rwnd collapses). When
* these events occur, we know we must retransmit
* them and mark the sendmap entries. Dupack counting
@@ -10308,47 +10345,83 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (0);
}
static void
rack_collapsed_window(struct tcp_rack *rack)
rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
int dir, uint32_t flags, struct rack_sendmap *rsm)
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
struct timeval tv;
memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = cnt;
log.u_bbr.flex2 = split;
log.u_bbr.flex3 = out;
log.u_bbr.flex4 = line;
log.u_bbr.flex5 = rack->r_must_retran;
log.u_bbr.flex6 = flags;
log.u_bbr.flex7 = rack->rc_has_collapsed;
log.u_bbr.flex8 = dir; /*
* 1 is collapsed, 0 is uncollapsed,
* 2 is log of a rsm being marked, 3 is a split.
*/
if (rsm == NULL)
log.u_bbr.rttProp = 0;
else
log.u_bbr.rttProp = (uint64_t)rsm;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
TCP_RACK_LOG_COLLAPSE, 0,
0, &log, false, &tv);
}
}
static void
rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
{
/*
* Now we must walk the
* send map and divide the
* ones left stranded. These
* guys can't cause us to abort
* the connection and are really
* "unsent". However if a buggy
* client actually did keep some
* of the data i.e. collapsed the win
* and refused to ack and then opened
* the win and acked that data. We would
* get into an ack war, the simplier
* method then of just pretending we
* did not send those segments something
* won't work.
* Here all we do is mark the collapsed point and set the flag.
* This may happen again and again, but there is no
* sense splitting our map until we know where the
* peer finally lands in the collapse.
*/
struct rack_sendmap *rsm, *nrsm, fe;
rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
if ((rack->rc_has_collapsed == 0) ||
(rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)))
counter_u64_add(rack_collapsed_win_seen, 1);
rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
rack->rc_has_collapsed = 1;
rack->r_collapse_point_valid = 1;
rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
}
static void
rack_un_collapse_window(struct tcp_rack *rack, int line)
{
struct rack_sendmap *nrsm, *rsm, fe;
int cnt = 0, split = 0;
#ifdef INVARIANTS
struct rack_sendmap *insret;
#endif
tcp_seq max_seq;
rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
memset(&fe, 0, sizeof(fe));
fe.r_start = max_seq;
/* Find the first seq past or at maxseq */
rack->rc_has_collapsed = 0;
fe.r_start = rack->r_ctl.last_collapse_point;
rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
if (rsm == NULL) {
/* Nothing to do strange */
rack->rc_has_collapsed = 0;
/* Nothing to do maybe the peer ack'ed it all */
rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
return;
}
/*
* Now do we need to split at
* the collapse point?
*/
if (SEQ_GT(max_seq, rsm->r_start)) {
/* Now do we need to split this one? */
if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
rack_log_collapse(rack, rsm->r_start, rsm->r_end,
rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
if (nrsm == NULL) {
/* We can't get a rsm, mark all? */
@@ -10356,7 +10429,8 @@ rack_collapsed_window(struct tcp_rack *rack)
goto no_split;
}
/* Clone it */
rack_clone_rsm(rack, nrsm, rsm, max_seq);
split = 1;
rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
#ifndef INVARIANTS
(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
#else
@@ -10366,7 +10440,8 @@ rack_collapsed_window(struct tcp_rack *rack)
nrsm, insret, rack, rsm);
}
#endif
rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
rack->r_ctl.last_collapse_point, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
@@ -10378,38 +10453,15 @@ rack_collapsed_window(struct tcp_rack *rack)
rsm = nrsm;
}
no_split:
counter_u64_add(rack_collapsed_win, 1);
RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
nrsm->r_flags |= RACK_RWND_COLLAPSED;
rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
cnt++;
}
rack->rc_has_collapsed = 1;
}
static void
rack_un_collapse_window(struct tcp_rack *rack)
{
struct rack_sendmap *rsm;
int cnt = 0;
rack->r_ctl.rc_out_at_rto = 0;
rack->r_ctl.rc_snd_max_at_rto = rack->rc_tp->snd_una;
RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
if (rsm->r_flags & RACK_RWND_COLLAPSED) {
rsm->r_flags &= ~RACK_RWND_COLLAPSED;
rsm->r_flags |= RACK_MUST_RXT;
if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
rack->r_ctl.rc_snd_max_at_rto = rsm->r_end;
rack->r_ctl.rc_out_at_rto += (rsm->r_end - rsm->r_start);
}
cnt++;
}
else
break;
}
rack->rc_has_collapsed = 0;
if (cnt) {
rack->r_must_retran = 1;
counter_u64_add(rack_collapsed_win, 1);
}
rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
}
static void
@@ -10518,9 +10570,12 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
if (tp->snd_wnd < ctf_outstanding(tp))
/* The peer collapsed the window */
rack_collapsed_window(rack);
rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack);
rack_un_collapse_window(rack, __LINE__);
if ((rack->r_collapse_point_valid) &&
(SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
rack->r_collapse_point_valid = 0;
/* Was persist timer active and now we have window space? */
if ((rack->rc_in_persist != 0) &&
(tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -11076,10 +11131,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (tp->snd_wnd < ctf_outstanding(tp)) {
/* The peer collapsed the window */
rack_collapsed_window(rack);
rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
} else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack);
rack_un_collapse_window(rack, __LINE__);
if ((rack->r_collapse_point_valid) &&
(SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
rack->r_collapse_point_valid = 0;
/*
* Pull snd_wl2 up to prevent seq wrap relative to th_ack.
*/
@@ -13066,13 +13123,6 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin
/* Not a valid win update */
return;
}
if (tp->snd_wnd > tp->max_sndwnd)
tp->max_sndwnd = tp->snd_wnd;
if (tp->snd_wnd < (tp->snd_max - high_seq)) {
/* The peer collapsed the window */
rack_collapsed_window(rack);
} else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack);
/* Do we exit persists? */
if ((rack->rc_in_persist != 0) &&
(tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -13609,6 +13659,15 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
#ifdef TCP_ACCOUNTING
ts_val = get_cyclecount();
#endif
/* Tend to any collapsed window */
if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
/* The peer collapsed the window */
rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
} else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack, __LINE__);
if ((rack->r_collapse_point_valid) &&
(SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
rack->r_collapse_point_valid = 0;
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
/*
@@ -15930,6 +15989,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
if (rsm->r_flags & RACK_RWND_COLLAPSED) {
rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
counter_u64_add(rack_collapsed_win_rxt, 1);
counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
}
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
if (rack->rack_no_prr)
@@ -16538,6 +16602,58 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
return (-1);
}
static struct rack_sendmap *
rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
{
struct rack_sendmap *rsm = NULL;
struct rack_sendmap fe;
int thresh;
restart:
fe.r_start = rack->r_ctl.last_collapse_point;
rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
/* Nothing, strange turn off validity */
rack->r_collapse_point_valid = 0;
return (NULL);
}
/* Can we send it yet? */
if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
/*
* Receiver window has not grown enough for
* the segment to be put on the wire.
*/
return (NULL);
}
if (rsm->r_flags & RACK_ACKED) {
/*
* It has been sacked, lets move to the
* next one if possible.
*/
rack->r_ctl.last_collapse_point = rsm->r_end;
/* Are we done? */
if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
rack->r_ctl.high_collapse_point)) {
rack->r_collapse_point_valid = 0;
return (NULL);
}
goto restart;
}
/* Now has it been long enough ? */
thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
rack_log_collapse(rack, rsm->r_start,
(cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
thresh, __LINE__, 6, rsm->r_flags, rsm);
return (rsm);
}
/* Not enough time */
rack_log_collapse(rack, rsm->r_start,
(cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
thresh, __LINE__, 7, rsm->r_flags, rsm);
return (NULL);
}
static int
rack_output(struct tcpcb *tp)
{
@@ -16598,7 +16714,6 @@ rack_output(struct tcpcb *tp)
struct ip6_hdr *ip6 = NULL;
int32_t isipv6;
#endif
uint8_t filled_all = 0;
bool hw_tls = false;
/* setup and take the cache hits here */
@@ -16863,6 +16978,29 @@ rack_output(struct tcpcb *tp)
sb_offset = rsm->r_start - tp->snd_una;
if (len >= segsiz)
len = segsiz;
} else if (rack->r_collapse_point_valid &&
((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
/*
* If an RSM is returned then enough time has passed
* for us to retransmit it. Move up the collapse point,
* since this rsm has its chance to retransmit now.
*/
rack_trace_point(rack, RACK_TP_COLLAPSED_RXT);
rack->r_ctl.last_collapse_point = rsm->r_end;
/* Are we done? */
if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
rack->r_ctl.high_collapse_point))
rack->r_collapse_point_valid = 0;
sack_rxmit = 1;
/* We are not doing a TLP */
doing_tlp = 0;
len = rsm->r_end - rsm->r_start;
sb_offset = rsm->r_start - tp->snd_una;
sendalot = 0;
if ((rack->full_size_rxt == 0) &&
(rack->shape_rxt_to_pacing_min == 0) &&
(len >= segsiz))
len = segsiz;
} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
/* We have a retransmit that takes precedence */
if ((!IN_FASTRECOVERY(tp->t_flags)) &&
@@ -16921,53 +17059,72 @@ rack_output(struct tcpcb *tp)
}
if (rack->r_must_retran &&
(doing_tlp == 0) &&
(SEQ_GT(tp->snd_max, tp->snd_una)) &&
(rsm == NULL)) {
/*
* Non-Sack and we had a RTO or Sack/non-Sack and a
* MTU change, we need to retransmit until we reach
* the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
* There are two different ways that we
* can get into this block:
* a) This is a non-sack connection, we had a time-out
* and thus r_must_retran was set and everything
* left outstanding as been marked for retransmit.
* b) The MTU of the path shrank, so that everything
* was marked to be retransmitted with the smaller
* mtu and r_must_retran was set.
*
* This means that we expect the sendmap (outstanding)
* to all be marked must. We can use the tmap to
* look at them.
*
*/
if (SEQ_GT(tp->snd_max, tp->snd_una)) {
int sendwin, flight;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
if (flight >= sendwin) {
so = inp->inp_socket;
sb = &so->so_snd;
goto just_return_nolock;
}
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if (rsm == NULL) {
/* TSNH */
rack->r_must_retran = 0;
rack->r_ctl.rc_out_at_rto = 0;
so = inp->inp_socket;
sb = &so->so_snd;
goto just_return_nolock;
}
if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
/* It does not have the flag, we are done */
rack->r_must_retran = 0;
rack->r_ctl.rc_out_at_rto = 0;
} else {
sack_rxmit = 1;
len = rsm->r_end - rsm->r_start;
sendalot = 0;
sb_offset = rsm->r_start - tp->snd_una;
if (len >= segsiz)
len = segsiz;
/*
* Delay removing the flag RACK_MUST_RXT so
* that the fastpath for retransmit will
* work with this rsm.
*/
}
} else {
/* We must be done if there is nothing outstanding */
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
if (flight >= sendwin) {
/*
* We can't send yet.
*/
so = inp->inp_socket;
sb = &so->so_snd;
goto just_return_nolock;
}
/*
* This is the case a/b mentioned above. All
* outstanding/not-acked should be marked.
* We can use the tmap to find them.
*/
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if (rsm == NULL) {
/* TSNH */
rack->r_must_retran = 0;
rack->r_ctl.rc_out_at_rto = 0;
so = inp->inp_socket;
sb = &so->so_snd;
goto just_return_nolock;
}
if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
/*
* The first one does not have the flag, did we collapse
* further up in our list?
*/
rack->r_must_retran = 0;
rack->r_ctl.rc_out_at_rto = 0;
rsm = NULL;
sack_rxmit = 0;
} else {
sack_rxmit = 1;
len = rsm->r_end - rsm->r_start;
sb_offset = rsm->r_start - tp->snd_una;
sendalot = 0;
if ((rack->full_size_rxt == 0) &&
(rack->shape_rxt_to_pacing_min == 0) &&
(len >= segsiz))
len = segsiz;
/*
* Delay removing the flag RACK_MUST_RXT so
* that the fastpath for retransmit will
* work with this rsm.
*/
}
}
/*
@@ -18177,7 +18334,7 @@ rack_output(struct tcpcb *tp)
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
((rsm == NULL) ? hw_tls : 0)
#ifdef NETFLIX_COPY_ARGS
, &filled_all
, &s_mb, &s_moff
#endif
);
if (len <= (tp->t_maxseg - optlen)) {
@@ -18548,15 +18705,17 @@ rack_output(struct tcpcb *tp)
log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
log.u_bbr.flex4 = orig_len;
if (filled_all)
log.u_bbr.flex5 = 0x80000000;
else
log.u_bbr.flex5 = 0;
/* Save off the early/late values */
log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
log.u_bbr.bw_inuse = rack_get_bw(rack);
if (rsm || sack_rxmit) {
log.u_bbr.flex8 = 0;
if (rsm) {
if (rsm->r_flags & RACK_RWND_COLLAPSED) {
rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
counter_u64_add(rack_collapsed_win_rxt, 1);
counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
}
if (doing_tlp)
log.u_bbr.flex8 = 2;
else


@@ -275,7 +275,7 @@ struct rack_opts_stats {
* non-zero, the default is 4 for continuous tracing.
* You also set the number of connections you want
* to get BB logs in net.inet.tcp.<stack>.tp.count.
*
* Count will decrement every time BB logging is assigned
* to a connection that hit your tracepoint.
*
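For illustration, the count knob described above could be armed from
userland with sysctlbyname(3). A minimal sketch; substituting "rack" for
<stack> is an assumption of this example, not something the diff establishes:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	/* Let the next two connections that hit the configured
	 * tracepoint be assigned BB logging.  The OID uses "rack"
	 * for <stack>, which is assumed here. */
	uint32_t cnt = 2;

	if (sysctlbyname("net.inet.tcp.rack.tp.count", NULL, NULL,
	    &cnt, sizeof(cnt)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	return (0);
}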
@@ -291,6 +291,7 @@ struct rack_opts_stats {
#define RACK_TP_HWENOBUF 0x00000001 /* When we are doing hardware pacing and hit enobufs */
#define RACK_TP_ENOBUF 0x00000002 /* When we hit enobufs with software pacing */
#define RACK_TP_COLLAPSED_WND 0x00000003 /* When a peer collapses its rwnd on us */
#define RACK_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
#define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */
#ifdef _KERNEL
@@ -472,6 +473,8 @@ struct rack_control {
uint32_t roundends; /* acked value above which round ends */
uint32_t num_dsack; /* Count of dsack's seen (1 per window)*/
uint32_t forced_ack_ts;
uint32_t last_collapse_point; /* Last point peer collapsed too */
uint32_t high_collapse_point;
uint32_t rc_lower_rtt_us_cts; /* Time our GP rtt was last lowered */
uint32_t rc_time_probertt_entered;
uint32_t rc_time_probertt_starts;
@@ -546,7 +549,15 @@ struct tcp_rack {
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
uint8_t client_bufferlvl; /* 0 - 5 normally, less than or at 2 means it's really low */
uint8_t client_bufferlvl : 4, /* Expected range [0,5]: 0=unset, 1=low/empty */
rack_deferred_inited : 1,
/* ******************************************************************** */
/* Note for details of next two fields see rack_init_retransmit_rate() */
/* ******************************************************************** */
full_size_rxt: 1,
shape_rxt_to_pacing_min : 1,
/* ******************************************************************** */
spare : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
@@ -647,7 +658,9 @@ struct tcp_rack {
r_late : 1,
r_wanted_output: 1,
r_rr_config : 2,
rc_avail_bit : 3;
r_persist_lt_bw_off : 1,
r_collapse_point_valid : 1,
rc_avail_bit : 2;
uint16_t rc_init_win : 8,
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,