Add TCP feature Proportional Rate Reduction (PRR) - RFC6937

PRR improves loss recovery and avoids RTOs in a wide range of
scenarios (e.g., ACK thinning) compared with regular SACK loss
recovery.

PRR is enabled by default (net.inet.tcp.do_prr = 1); set the sysctl
to 0 to disable it. Performance may be impeded by token bucket rate
policers at the bottleneck, in which case
net.inet.tcp.do_prr_conservative = 1 should be enabled in addition.

Submitted by:	Aris Angelogiannopoulos
Sponsored by:	NetApp, Inc.
Differential Revision:	https://reviews.freebsd.org/D18892
Richard Scheffenegger 2020-12-04 11:29:27 +00:00
parent 34af05ead3
commit 0e1d7c25c5
2 changed files with 131 additions and 7 deletions
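
The heart of the change is the RFC 6937 snd_cnt computation performed on
every ACK while in fast recovery. The following self-contained sketch
restates that arithmetic outside the kernel; the function name prr_snd_cnt
and its flattened parameter list are illustrative only (the patch itself
keeps this state in struct sackhint and struct tcpcb):

#include <stdio.h>

/*
 * RFC 6937 segment-count computation, mirroring the logic this commit
 * adds to tcp_do_segment() and tcp_prr_partialack().  All byte counts
 * are per-connection state in the kernel; here they are parameters.
 */
static long
prr_snd_cnt(long prr_delivered, long del_data, long rexmit_bytes,
    long pipe, long ssthresh, long recover_fs, long maxseg,
    int conservative)
{
	long snd_cnt, limit;

	if (pipe > ssthresh) {
		/* Proportional part: pace new data out relative to ssthresh. */
		snd_cnt = (prr_delivered * ssthresh / recover_fs) +
		    1 - rexmit_bytes;
	} else {
		/* Reduction bound: conservative (CRB) or slow-start-like (SSRB). */
		if (conservative)
			limit = prr_delivered - rexmit_bytes;
		else if ((prr_delivered - rexmit_bytes) > del_data)
			limit = prr_delivered - rexmit_bytes + maxseg;
		else
			limit = del_data + maxseg;
		snd_cnt = (ssthresh - pipe < limit) ? ssthresh - pipe : limit;
	}
	/* Convert bytes to whole segments and never go negative. */
	snd_cnt /= maxseg;
	return (snd_cnt > 0 ? snd_cnt : 0);
}

int
main(void)
{
	/*
	 * One dupack during recovery: 3000 bytes delivered so far, pipe
	 * (20440) still above ssthresh (14600), recover_fs of 29200
	 * bytes -> roughly one new segment per two delivered.
	 */
	printf("segments to send: %ld\n",
	    prr_snd_cnt(3000, 1460, 0, 20440, 14600, 29200, 1460, 0));
	return (0);
}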

sys/netinet/tcp_input.c

@@ -153,6 +153,16 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
+VNET_DEFINE(int, tcp_do_prr_conservative) = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW,
+    &VNET_NAME(tcp_do_prr_conservative), 0,
+    "Do conservative Proportional Rate Reduction");
+
+VNET_DEFINE(int, tcp_do_prr) = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
+    &VNET_NAME(tcp_do_prr), 1,
+    "Enable Proportional Rate Reduction per RFC 6937");
+
 VNET_DEFINE(int, tcp_do_newcwv) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_newcwv), 0,
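
Both knobs are plain VNET sysctls, so they can be flipped at runtime with
sysctl(8) (e.g. `sysctl net.inet.tcp.do_prr=1`) or from C via
sysctlbyname(3); a minimal sketch, assuming a FreeBSD host and sufficient
privilege to set the value:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int enable = 1, old;
	size_t oldlen = sizeof(old);

	/* Read the previous value and enable PRR in one call. */
	if (sysctlbyname("net.inet.tcp.do_prr", &old, &oldlen,
	    &enable, sizeof(enable)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("net.inet.tcp.do_prr: %d -> %d\n", old, enable);
	return (0);
}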
@@ -2554,7 +2564,55 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		    IN_FASTRECOVERY(tp->t_flags)) {
 			cc_ack_received(tp, th, nsegs,
 			    CC_DUPACK);
-			if ((tp->t_flags & TF_SACK_PERMIT) &&
+			if (V_tcp_do_prr &&
+			    IN_FASTRECOVERY(tp->t_flags) &&
+			    (tp->t_flags & TF_SACK_PERMIT)) {
+				long snd_cnt = 0, limit = 0;
+				long del_data = 0, pipe = 0;
+				/*
+				 * In a duplicate ACK del_data is only the
+				 * diff_in_sack. If no SACK is used del_data
+				 * will be 0. Pipe is the amount of data we
+				 * estimate to be in the network.
+				 */
+				del_data = tp->sackhint.delivered_data;
+				pipe = (tp->snd_nxt - tp->snd_fack) +
+				    tp->sackhint.sack_bytes_rexmit;
+				tp->sackhint.prr_delivered += del_data;
+				if (pipe > tp->snd_ssthresh) {
+					snd_cnt = (tp->sackhint.prr_delivered *
+					    tp->snd_ssthresh /
+					    tp->sackhint.recover_fs) +
+					    1 - tp->sackhint.sack_bytes_rexmit;
+				} else {
+					if (V_tcp_do_prr_conservative)
+						limit = tp->sackhint.prr_delivered -
+						    tp->sackhint.sack_bytes_rexmit;
+					else
+						if ((tp->sackhint.prr_delivered -
+						    tp->sackhint.sack_bytes_rexmit) >
+						    del_data)
+							limit = tp->sackhint.prr_delivered -
+							    tp->sackhint.sack_bytes_rexmit +
+							    maxseg;
+						else
+							limit = del_data + maxseg;
+					if ((tp->snd_ssthresh - pipe) < limit)
+						snd_cnt = tp->snd_ssthresh - pipe;
+					else
+						snd_cnt = limit;
+				}
+				snd_cnt = max((snd_cnt / maxseg), 0);
+				/*
+				 * Send snd_cnt new data into the network in
+				 * response to this ACK. If there is going
+				 * to be a SACK retransmission, adjust snd_cwnd
+				 * accordingly.
+				 */
+				tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
+				    tp->sackhint.sack_bytes_rexmit +
+				    (snd_cnt * maxseg);
+			} else if ((tp->t_flags & TF_SACK_PERMIT) &&
 			    IN_FASTRECOVERY(tp->t_flags)) {
 				int awnd;
 
@@ -2583,13 +2641,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 			tcp_seq onxt = tp->snd_nxt;
 
 			/*
-			 * If we're doing sack, check to
-			 * see if we're already in sack
+			 * If we're doing sack, or prr, check
+			 * to see if we're already in sack
 			 * recovery. If we're not doing sack,
 			 * check to see if we're in newreno
 			 * recovery.
 			 */
-			if (tp->t_flags & TF_SACK_PERMIT) {
+			if (V_tcp_do_prr ||
+			    (tp->t_flags & TF_SACK_PERMIT)) {
 				if (IN_FASTRECOVERY(tp->t_flags)) {
 					tp->t_dupacks = 0;
 					break;
@@ -2607,6 +2666,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 				    CC_DUPACK);
 				tcp_timer_activate(tp, TT_REXMT, 0);
 				tp->t_rtttime = 0;
+				if (V_tcp_do_prr) {
+					/*
+					 * snd_ssthresh is already updated by
+					 * cc_cong_signal.
+					 */
+					tp->sackhint.prr_delivered = 0;
+					tp->sackhint.sack_bytes_rexmit = 0;
+					if (!(tp->sackhint.recover_fs = tp->snd_nxt - tp->snd_una))
+						tp->sackhint.recover_fs = 1;
+				}
 				if (tp->t_flags & TF_SACK_PERMIT) {
 					TCPSTAT_INC(
 					    tcps_sack_recovery_episode);
@@ -2713,7 +2782,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (tp->t_flags & TF_SACK_PERMIT)
-					tcp_sack_partialack(tp, th);
+					if (V_tcp_do_prr)
+						tcp_prr_partialack(tp, th);
+					else
+						tcp_sack_partialack(tp, th);
 				else
 					tcp_newreno_partial_ack(tp, th);
 			} else
@@ -3839,6 +3911,54 @@ tcp_mssopt(struct in_conninfo *inc)
 	return (mss);
 }
 
+void
+tcp_prr_partialack(struct tcpcb *tp, struct tcphdr *th)
+{
+	long snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
+	int maxseg = tcp_maxseg(tp);
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tcp_timer_activate(tp, TT_REXMT, 0);
+	tp->t_rtttime = 0;
+	/*
+	 * Compute the amount of data that this ACK is indicating
+	 * (del_data) and an estimate of how many bytes are in the
+	 * network.
+	 */
+	if (SEQ_GEQ(th->th_ack, tp->snd_una))
+		del_data = BYTES_THIS_ACK(tp, th);
+	del_data += tp->sackhint.delivered_data;
+	pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit;
+	tp->sackhint.prr_delivered += del_data;
+	/*
+	 * Proportional Rate Reduction
+	 */
+	if (pipe > tp->snd_ssthresh)
+		snd_cnt = (tp->sackhint.prr_delivered * tp->snd_ssthresh / tp->sackhint.recover_fs) -
+		    tp->sackhint.sack_bytes_rexmit;
+	else {
+		if (V_tcp_do_prr_conservative)
+			limit = tp->sackhint.prr_delivered - tp->sackhint.sack_bytes_rexmit;
+		else
+			if ((tp->sackhint.prr_delivered - tp->sackhint.sack_bytes_rexmit) > del_data)
+				limit = tp->sackhint.prr_delivered - tp->sackhint.sack_bytes_rexmit + maxseg;
+			else
+				limit = del_data + maxseg;
+		snd_cnt = min((tp->snd_ssthresh - pipe), limit);
+	}
+	snd_cnt = max((snd_cnt / maxseg), 0);
+	/*
+	 * Send snd_cnt new data into the network in response to this ack.
+	 * If there is going to be a SACK retransmission, adjust snd_cwnd
+	 * accordingly.
+	 */
+	tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
+	    tp->sackhint.sack_bytes_rexmit + (snd_cnt * maxseg);
+	tp->t_flags |= TF_ACKNOW;
+	(void) tcp_output(tp);
+}
+
 /*
  * On a partial ack arrives, force the retransmission of the
  * next unacknowledged segment.  Do not clear tp->t_dupacks.
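
As a numeric illustration of the pipe > ssthresh branch of
tcp_prr_partialack() above (all values invented; only the snd_cnt formula
is taken from the function, the scaffolding around it is not kernel code):
with a flight size of 10 segments at the start of recovery and ssthresh
halved to 5 segments, a partial ACK covering 2 segments clocks out 1 new
segment, i.e. roughly one packet sent per two delivered until pipe drains
down to ssthresh:

#include <stdio.h>

int
main(void)
{
	long maxseg = 1460;
	long recover_fs = 10 * maxseg;	/* snd_nxt - snd_una at loss */
	long ssthresh = 5 * maxseg;	/* halved by cc_cong_signal() */
	long pipe = 8 * maxseg;		/* estimated bytes in the network */
	long prr_delivered = 0, rexmit = 0;
	long del_data = 2 * maxseg;	/* this partial ACK covers 2 MSS */
	long snd_cnt;

	prr_delivered += del_data;
	/* pipe > ssthresh, so take the proportional branch. */
	snd_cnt = (prr_delivered * ssthresh / recover_fs - rexmit) / maxseg;
	printf("ACKed 2 segments -> send %ld new segment(s)\n", snd_cnt);
	return (0);
}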

sys/netinet/tcp_var.h

@@ -113,8 +113,9 @@ struct sackhint {
 	int32_t		sacked_bytes;	/* Total sacked bytes reported by the
					 * receiver via sack option
					 */
-	uint32_t	_pad1[1];	/* TBD */
-	uint64_t	_pad[1];	/* TBD */
+	uint32_t	recover_fs;	/* Flight Size at the start of Loss recovery */
+	uint32_t	prr_delivered;	/* Total bytes delivered using PRR */
+	uint32_t	_pad[1];	/* TBD */
 };
 
 #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
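
The two new counters are deliberately carved out of the reserved pad space
(uint32_t _pad1[1] plus uint64_t _pad[1] become recover_fs, prr_delivered,
and a remaining uint32_t _pad[1]), so sizeof(struct sackhint) does not
change. A compile-time sketch of that invariant follows; the fields ahead
of sacked_bytes are abbreviated from the surrounding header and are an
assumption here, not part of the patch:

#include <stdint.h>

typedef uint32_t tcp_seq;

struct sackhint_old {			/* layout before this commit */
	void		*nexthole;
	int32_t		sack_bytes_rexmit;
	tcp_seq		last_sack_ack;
	int32_t		delivered_data;
	int32_t		sacked_bytes;
	uint32_t	_pad1[1];
	uint64_t	_pad[1];
};

struct sackhint_new {			/* layout after this commit */
	void		*nexthole;
	int32_t		sack_bytes_rexmit;
	tcp_seq		last_sack_ack;
	int32_t		delivered_data;
	int32_t		sacked_bytes;
	uint32_t	recover_fs;
	uint32_t	prr_delivered;
	uint32_t	_pad[1];
};

/* The PRR state must fit in the previously reserved pad bytes. */
_Static_assert(sizeof(struct sackhint_old) == sizeof(struct sackhint_new),
    "struct sackhint size changed");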
@@ -866,6 +867,8 @@ VNET_DECLARE(int, tcp_sendspace);
 VNET_DECLARE(struct inpcbhead, tcb);
 VNET_DECLARE(struct inpcbinfo, tcbinfo);
 
+#define	V_tcp_do_prr			VNET(tcp_do_prr)
+#define	V_tcp_do_prr_conservative	VNET(tcp_do_prr_conservative)
 #define	V_tcp_do_newcwv			VNET(tcp_do_newcwv)
 #define	V_drop_synfin			VNET(drop_synfin)
 #define	V_path_mtu_discovery		VNET(path_mtu_discovery)
@@ -1051,6 +1054,7 @@ void	tcp_clean_dsack_blocks(struct tcpcb *tp);
 void	tcp_clean_sackreport(struct tcpcb *tp);
 void	tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
+void	tcp_prr_partialack(struct tcpcb *, struct tcphdr *);
 void	tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void	tcp_free_sackholes(struct tcpcb *tp);
 int	tcp_newreno(struct tcpcb *, struct tcphdr *);