tcp: Inconsistent use of hpts_calling flag

Gleb noticed some inconsistencies in the way the inp_hpts_calls flag was being used. One
such inconsistency results in a bug when we can't allocate enough sendmap entries to satisfy a call to
rack_output(): a timer won't get started like it should. Also, in cleaning this up I found that the
"no_output" side of input needs to be adjusted to make sure we don't try to re-pace more quickly than
the 250 microsecond minimum granularity that hpts assures.
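
To illustrate the "no_output" adjustment, here is the check in condensed form (from the
rack_do_segment_nounlock() hunk below; slot_remaining is the time left on the current pacing
timer and tcp_min_hptsi_time is the pacer's minimum granularity):

    /*
     * hpts cannot wake us up again in less than its minimum
     * granularity (default 250usec). If less than that remains
     * on the pacing timer, let output go "early" rather than
     * suppressing it; the pacing time applied to the outbound
     * packet makes up the difference.
     */
    if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time))
            no_output = 0;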

Another problem is that we end up with duplicate calls to tcp_output(), which we should not. When packets
come from hpts for processing, the input side of tcp will call the output side of tcp on the last packet if
output is needed. When that occurs, a second call to tcp_output() from hpts is unnecessary and, if pacing
is going on, may be harmful.
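
With the fix, hpts makes exactly one output-triggering call per connection, as in this condensed
sketch of the tcp_hptsi() hunk below:

    inp->inp_hpts_calls = 1;
    if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) &&
        !STAILQ_EMPTY(&tp->t_inqueue)) {
            /*
             * Input processing calls output on the last queued
             * segment itself if output is needed; calling
             * tcp_output() again here would be the duplicate.
             */
            error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0);
            if (error)
                    goto skip_pacing;
    } else {
            error = tcp_output(tp);
            if (error < 0)
                    goto skip_pacing;
    }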

Let's fix all of this and explicitly state the contract that hpts makes with transports that care about the
flag.
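
In condensed form, the contract is (full code in the hunks below):

    /* hpts side: set the flag before any possible output call. */
    inp->inp_hpts_calls = 1;

    /*
     * Transport side (e.g. rack_output()/bbr_output_wtime()): a
     * transport that cares latches and clears the flag on entry,
     * so a later non-hpts call is not mistaken for an hpts one.
     */
    hpts_calling = inp->inp_hpts_calls;
    inp->inp_hpts_calls = 0;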

Reviewed by: tuexen, glebius
Sponsored by: Netflix Inc
Differential Revision: https://reviews.freebsd.org/D39653
Randall Stewart 2023-04-17 17:10:26 -04:00
parent fb5ff7384c
commit 2ad584c555
3 changed files with 39 additions and 12 deletions

sys/netinet/tcp_hpts.c

@@ -1330,6 +1330,17 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
/*
* We set inp_hpts_calls to 1 before any possible output.
* The contract with the transport is that if it cares about
* hpts calling it should clear the flag. That way next time
* it is called it will know it is hpts.
*
* We also only call tfb_do_queued_segments() <or> tcp_output().
* It is expected that if segments are queued and come in, the
* final input mbuf will cause a call to output if it is needed.
*/
inp->inp_hpts_calls = 1;
if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) &&
!STAILQ_EMPTY(&tp->t_inqueue)) {
error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0);
@@ -1337,12 +1348,11 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
/* The input killed the connection */
goto skip_pacing;
}
} else {
error = tcp_output(tp);
if (error < 0)
goto skip_pacing;
}
inp->inp_hpts_calls = 1;
error = tcp_output(tp);
if (error < 0)
goto skip_pacing;
inp->inp_hpts_calls = 0;
if (ninp) {
/*
* If we have a nxt inp, see if we can

sys/netinet/tcp_stacks/bbr.c

@@ -11581,6 +11581,9 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
/* Do we have the correct timer running? */
bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
}
/* Clear the flag; output may already have cleared it, but we may not have. */
if ((nxt_pkt == 0) && (inp->inp_hpts_calls))
inp->inp_hpts_calls = 0;
/* Do we have a new state */
if (bbr->r_state != tp->t_state)
bbr_set_state(tp, bbr, tiwin);
@@ -11850,6 +11853,8 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
cts = tcp_tv_to_usectick(&bbr->rc_tv);
inp = bbr->rc_inp;
hpts_calling = inp->inp_hpts_calls;
inp->inp_hpts_calls = 0;
so = inp->inp_socket;
sb = &so->so_snd;
if (tp->t_nic_ktls_xmit)
@@ -11985,7 +11990,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
merged_val = bbr->rc_pacer_started;
merged_val <<= 32;
merged_val |= bbr->r_ctl.rc_last_delay_val;
bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls,
bbr_log_pacing_delay_calc(bbr, hpts_calling,
bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val,
bbr->r_agg_early_set, 3);
bbr->r_ctl.rc_last_delay_val = 0;
@@ -12028,8 +12033,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
* <and> c) We had room to send something.
*
*/
hpts_calling = inp->inp_hpts_calls;
inp->inp_hpts_calls = 0;
if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
int retval;

sys/netinet/tcp_stacks/rack.c

@@ -16427,6 +16427,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
}
did_out = 1;
}
if (rack->rc_inp->inp_hpts_calls)
rack->rc_inp->inp_hpts_calls = 0;
rack_free_trim(rack);
#ifdef TCP_ACCOUNTING
sched_unpin();
@@ -16525,6 +16527,17 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
(*ts_ptr == TCP_LRO_TS_OPTION)))
no_output = 1;
}
if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) {
/*
* It is unrealistic to think we can pace in less than
* the minimum granularity of the pacer (def:250usec). So
* if we have less than that time remaining we should go
* ahead and allow output to be "early". We will attempt to
* make up for it in any pacing time we try to apply on
* the outbound packet.
*/
no_output = 0;
}
}
if (m->m_flags & M_ACKCMP) {
/*
@@ -16984,6 +16997,9 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0);
rack_free_trim(rack);
}
/* Clear the flag; output may already have cleared it, but we may not have. */
if ((nxt_pkt == 0) && (inp->inp_hpts_calls))
inp->inp_hpts_calls = 0;
/* Update any rounds needed */
if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
rack_log_hystart_event(rack, high_seq, 8);
@@ -19637,6 +19653,7 @@ rack_output(struct tcpcb *tp)
ts_val = get_cyclecount();
#endif
hpts_calling = inp->inp_hpts_calls;
rack->rc_inp->inp_hpts_calls = 0;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
#ifdef TCP_ACCOUNTING
@@ -19759,7 +19776,6 @@ rack_output(struct tcpcb *tp)
counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
return (0);
}
rack->rc_inp->inp_hpts_calls = 0;
/* Finish out both pacing early and late accounting */
if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
@@ -19876,7 +19892,7 @@ rack_output(struct tcpcb *tp)
while (rack->rc_free_cnt < rack_free_cache) {
rsm = rack_alloc(rack);
if (rsm == NULL) {
if (inp->inp_hpts_calls)
if (hpts_calling)
/* Retry in a ms */
slot = (1 * HPTS_USEC_IN_MSEC);
so = inp->inp_socket;
@@ -19887,8 +19903,6 @@ rack_output(struct tcpcb *tp)
rack->rc_free_cnt++;
rsm = NULL;
}
if (inp->inp_hpts_calls)
inp->inp_hpts_calls = 0;
sack_rxmit = 0;
len = 0;
rsm = NULL;