This brings FreeBSD into sync with the Netflix versions of rack and bbr.

This fixes several breakages (panics) that have been reported since
the tcp_lro code was committed. Quite a few new features are now in
rack (perfecting of DGP -- Dynamic Goodput Pacing -- being among the
largest). There is also support for ack-war prevention. Documents on
rack are coming soon.

Sponsored by:           Netflix
Reviewed by:		rscheff, mtuexen
Differential Revision:	https://reviews.freebsd.org/D30036
Randall Stewart 2021-05-06 11:22:26 -04:00
parent 0ec3e99111
commit 5d8fd932e4
18 changed files with 7282 additions and 1885 deletions

View File

@ -91,15 +91,20 @@ struct cc_var {
struct sctp_nets *sctp;
} ccvc;
uint16_t nsegs; /* # segments coalesced into current chain. */
uint8_t labc; /* Dont use system abc use passed in */
};
/* cc_var flags. */
#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */
#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */
#define CCF_UNUSED1 0x0004 /* unused */
#define CCF_USE_LOCAL_ABC 0x0004 /* Dont use the system l_abc val */
#define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */
#define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */
#define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */
#define CCF_MAX_CWND 0x0040 /* Have we reached maximum cwnd? */
#define CCF_CHG_MAX_CWND 0x0080 /* Cubic max_cwnd changed, for K */
#define CCF_USR_IWND 0x0100 /* User specified initial window */
#define CCF_USR_IWND_INIT_NSEG 0x0200 /* Convert segs to bytes on conn init */
/* ACK types passed to the ack_received() hook. */
#define CC_ACK 0x0001 /* Regular in sequence ACK. */
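
For illustration only, a minimal sketch (hypothetical helper, not part of this commit) of how a stack could use the new labc field and CCF_USE_LOCAL_ABC flag to hand the CC module a per-call ABC multiplier instead of the global V_tcp_abc_l_var:

static void
example_ack_with_local_abc(struct cc_var *ccv, uint16_t acktype, uint8_t abc)
{
	ccv->labc = abc;			/* stack-chosen ABC multiplier */
	ccv->flags |= CCF_USE_LOCAL_ABC;	/* use labc, not V_tcp_abc_l_var */
	if (CC_ALGO(ccv->ccvc.tcp)->ack_received != NULL)
		CC_ALGO(ccv->ccvc.tcp)->ack_received(ccv, acktype);
	ccv->flags &= ~CCF_USE_LOCAL_ABC;
}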

View File

@ -86,8 +86,8 @@ static void newreno_cong_signal(struct cc_var *ccv, uint32_t type);
static void newreno_post_recovery(struct cc_var *ccv);
static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf);
VNET_DEFINE_STATIC(uint32_t, newreno_beta) = 50;
VNET_DEFINE_STATIC(uint32_t, newreno_beta_ecn) = 80;
VNET_DEFINE(uint32_t, newreno_beta) = 50;
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
@ -101,11 +101,6 @@ struct cc_algo newreno_cc_algo = {
.ctl_output = newreno_ctl_output,
};
struct newreno {
uint32_t beta;
uint32_t beta_ecn;
};
static inline struct newreno *
newreno_malloc(struct cc_var *ccv)
{
@ -182,9 +177,15 @@ newreno_ack_received(struct cc_var *ccv, uint16_t type)
* XXXLAS: Find a way to signal SS after RTO that
* doesn't rely on tcpcb vars.
*/
uint16_t abc_val;
if (ccv->flags & CCF_USE_LOCAL_ABC)
abc_val = ccv->labc;
else
abc_val = V_tcp_abc_l_var;
if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
incr = min(ccv->bytes_this_ack,
ccv->nsegs * V_tcp_abc_l_var *
ccv->nsegs * abc_val *
CCV(ccv, t_maxseg));
else
incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
@ -237,11 +238,19 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
u_int mss;
cwin = CCV(ccv, snd_cwnd);
mss = tcp_maxseg(ccv->ccvc.tcp);
mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
nreno = ccv->cc_data;
beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;
beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn;
if (V_cc_do_abe && type == CC_ECN)
/*
* Note that we only change the backoff for ECN if the
* global sysctl V_cc_do_abe is set <or> the stack itself
* has set a flag in our newreno_flags (due to pacing) telling
* us to use the lower valued back-off.
*/
if (V_cc_do_abe ||
(nreno && (nreno->newreno_flags & CC_NEWRENO_BETA_ECN) && (type == CC_ECN)))
factor = beta_ecn;
else
factor = beta;
@ -260,8 +269,7 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
V_cc_do_abe && V_cc_abe_frlossreduce)) {
CCV(ccv, snd_ssthresh) =
((uint64_t)CCV(ccv, snd_ssthresh) *
(uint64_t)beta) /
(100ULL * (uint64_t)beta_ecn);
(uint64_t)beta) / (uint64_t)beta_ecn;
}
if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
CCV(ccv, snd_ssthresh) = cwin;
@ -344,7 +352,7 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
nreno->beta = opt->val;
break;
case CC_NEWRENO_BETA_ECN:
if (!V_cc_do_abe)
if ((!V_cc_do_abe) && ((nreno->newreno_flags & CC_NEWRENO_BETA_ECN) == 0))
return (EACCES);
nreno->beta_ecn = opt->val;
break;

View File

@ -31,12 +31,17 @@
#define CCALGONAME_NEWRENO "newreno"
struct newreno {
uint32_t beta;
uint32_t beta_ecn;
uint32_t newreno_flags;
};
struct cc_newreno_opts {
int name;
int name;
uint32_t val;
};
#define CC_NEWRENO_BETA 1
#define CC_NEWRENO_BETA_ECN 2
#define CC_NEWRENO_BETA 1 /* Beta for normal DUP-ACK/Sack recovery */
#define CC_NEWRENO_BETA_ECN 2 /* ECN Beta for Abe */
#endif /* _CC_NEWRENO_H */
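
As a hedged illustration (hypothetical function name; the real plumbing lives in the rack changes elsewhere in this commit), a stack could feed its ECN beta into newreno through the generic ctl_output hook using the cc_newreno_opts layout above:

static int
example_set_newreno_beta_ecn(struct tcpcb *tp, uint32_t beta_ecn)
{
	struct cc_newreno_opts opt;
	struct sockopt sopt;

	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
		return (EINVAL);	/* only newreno understands these options */
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = beta_ecn;
	return (CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt));
}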

View File

@ -181,13 +181,24 @@ struct tcphdr {
#define TCP_TXTLS_MODE 40 /* Transmit TLS mode */
#define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */
#define TCP_RXTLS_MODE 42 /* Receive TLS mode */
#define TCP_IWND_NB 43 /* Override initial window (units: bytes) */
#define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */
#define TCP_LOGID_CNT 46 /* get number of connections with the same ID */
#define TCP_LOG_TAG 47 /* configure tag for grouping logs */
#define TCP_USER_LOG 48 /* userspace log event */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */
#define TCP_MAXPEAKRATE 69 /* maximum peak rate allowed (kbps) */
#define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */
#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */
#define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */
#define TCP_SHARED_CWND_ALLOWED 75 /* Use of a shared cwnd is allowed */
#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */
#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */
#define TCP_PERF_INFO 78 /* retrieve accounting counters */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
@ -201,7 +212,7 @@ struct tcphdr {
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
@ -284,6 +295,16 @@ struct tcphdr {
#define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */
#define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */
#define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */
#define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */
#define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */
#define TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw */
#define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different then the system default */
#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */
#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
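
A userspace sketch (assuming the connection is already attached to the rack stack) of driving one of the new options, here capping the pacing rate via TCP_PACING_RATE_CAP:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>

static int
example_cap_pacing_rate(int fd)
{
	uint64_t cap = 100000000 / 8;	/* ~100 Mbit/s expressed in bytes per second */

	return (setsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP,
	    &cap, sizeof(cap)));
}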
@ -295,6 +316,7 @@ struct tcphdr {
#define TCPI_OPT_WSCALE 0x04
#define TCPI_OPT_ECN 0x08
#define TCPI_OPT_TOE 0x10
#define TCPI_OPT_TFO 0x20
/* Maximum length of log ID. */
#define TCP_LOG_ID_LEN 64

View File

@ -0,0 +1,39 @@
#ifndef __tcp_accounting_h__
#define __tcp_accounting_h__
/*
* Return values from tcp_do_ack_accounting
* and indexs to the into the tcp_proc_time[]
* array.
*/
#define ACK_BEHIND 0
#define ACK_SACK 1
#define ACK_CUMACK 2
#define ACK_CUMACK_SACK 3
#define ACK_DUPACK 4
#define ACK_RWND 5
/* Added values for tracking output too */
#define SND_BLOCKED 6
#define SND_LIMITED 7
#define SND_OUT_DATA 8
#define SND_OUT_ACK 9
#define SND_OUT_FAIL 10
/* We also count in the counts array two added (MSS sent and ACKS In) */
#define CNT_OF_MSS_OUT 11
#define CNT_OF_ACKS_IN 12
/* for the tcpcb we add two more cycle counters */
#define CYC_HANDLE_MAP 11
#define CYC_HANDLE_ACK 12
/* Should the tp->xxx array's be alloc'ed? */
/* #define TCP_NUM_PROC_COUNTERS 11 defined in tcp_var.h */
/* #define TCP_NUM_CNT_COUNTERS 13 defined in tcp_var.h */
#ifdef _KERNEL
#ifdef TCP_ACCOUNTING
extern counter_u64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
extern counter_u64_t tcp_proc_time[TCP_NUM_PROC_COUNTERS];
#endif
#endif
#endif
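
A small, hypothetical kernel-side reader of these counters (only meaningful when the kernel is built with TCP_ACCOUNTING), shown only to illustrate how the indices above are meant to be used:

#ifdef TCP_ACCOUNTING
static uint64_t
example_acks_seen(void)
{
	/* Total ACKs counted on input across all accounted connections. */
	return (counter_u64_fetch(tcp_cnt_counters[CNT_OF_ACKS_IN]));
}
#endif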

View File

@ -526,7 +526,7 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@ -544,7 +544,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
break;
}
if (th->th_flags & TH_CWR)
if (flags & TH_CWR)
tp->ccv->flags |= CCF_TCPHDR_CWR;
else
tp->ccv->flags &= ~CCF_TCPHDR_CWR;
@ -558,6 +558,12 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
}
}
void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
cc_ecnpkt_handler_flags(tp, th->th_flags, iptos);
}
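
The split exists so callers that no longer hold a struct tcphdr (for example a compressed-ack path) can still drive ECN processing; a hedged one-line sketch, where ae_flags and ae_iptos are hypothetical values saved from the ack entry:

	cc_ecnpkt_handler_flags(tp, ae_flags, ae_iptos);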
/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended

View File

@ -174,7 +174,7 @@ enum tcp_log_events {
TCP_LOG_IN = 1, /* Incoming packet 1 */
TCP_LOG_OUT, /* Transmit (without other event) 2 */
TCP_LOG_RTO, /* Retransmit timeout 3 */
TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */
TCP_LOG_SB_WAKE, /* Awaken socket buffer 4 */
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER, /* Detected reorder 7 */
@ -200,7 +200,7 @@ enum tcp_log_events {
BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
TCP_LOG_MAPCHG, /* Map Changes to the sendmap 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
@ -232,7 +232,9 @@ enum tcp_log_events {
TCP_LOG_USER_EVENT, /* User space event data 59 */
TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */
TCP_LOG_HTTP_T, /* logging of http request tracking 61 */
TCP_LOG_END /* End (keep at end) 62 */
TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */
TCP_LOG_FSB, /* FSB information 63 */
TCP_LOG_END /* End (keep at end) 64 */
};
enum tcp_log_states {

View File

@ -367,11 +367,22 @@ rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
OID_AUTO, "pacetime", CTLFLAG_RD,
&rs->rs_rlt[i].time_between, 0,
"Time hardware inserts between 1500 byte sends");
SYSCTL_ADD_U64(&rs->sysctl_ctx,
SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "rate", CTLFLAG_RD,
&rs->rs_rlt[i].rate, 0,
&rs->rs_rlt[i].rate,
"Rate in bytes per second");
SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "using", CTLFLAG_RD,
&rs->rs_rlt[i].using,
"Number of flows using");
SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "enobufs", CTLFLAG_RD,
&rs->rs_rlt[i].rs_num_enobufs,
"Number of enobufs logged on this rate");
}
}
#endif
@ -667,6 +678,8 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
*/
rs->rs_rlt[i].ptbl = rs;
rs->rs_rlt[i].tag = NULL;
rs->rs_rlt[i].using = 0;
rs->rs_rlt[i].rs_num_enobufs = 0;
/*
* Calculate the time between.
*/
@ -1063,16 +1076,28 @@ rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
struct tcp_hwrate_limit_table *decon_rte;
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
atomic_add_long(&decon_rte->using, 1);
}
static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
struct tcp_hwrate_limit_table *decon_rte;
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
atomic_subtract_long(&decon_rte->using, 1);
}
void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
struct tcp_hwrate_limit_table *decon_rte;
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}
/*

View File

@ -43,7 +43,9 @@ struct m_snd_tag;
struct tcp_hwrate_limit_table {
const struct tcp_rate_set *ptbl; /* Pointer to parent table */
struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
uint64_t rate; /* Rate we get in Bytes per second (Bps) */
long rate; /* Rate we get in Bytes per second (Bps) */
long using; /* How many flows are using this hdwr rate. */
long rs_num_enobufs;
uint32_t time_between; /* Time-Gap between packets at this rate */
uint32_t flags;
};

View File

@ -156,6 +156,17 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
&VNET_NAME(tcp_sack_globalholes), 0,
"Global number of TCP SACK holes currently allocated");
int
tcp_dsack_block_exists(struct tcpcb *tp)
{
/* Return true if a DSACK block exists */
if (tp->rcv_numsacks == 0)
return (0);
if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt))
return(1);
return (0);
}
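
A hedged example of the kind of caller this helper is meant for (illustrative only, not code from this commit): while a DSACK block is still queued, make sure an ACK actually goes out so the peer sees it.

	if (tcp_dsack_block_exists(tp))
		tp->t_flags |= TF_ACKNOW;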
/*
* This function will find overlaps with the currently stored sackblocks
* and add any overlap as a dsack block upfront

View File

@ -3930,6 +3930,9 @@ bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_s
struct tcp_bbr *bbr;
INP_WLOCK_ASSERT(tp->t_inpcb);
#ifdef STATS
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
switch (type) {
case CC_NDUPACK:
@ -4403,6 +4406,7 @@ bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap
nrsm->r_start = start;
nrsm->r_end = rsm->r_end;
nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
nrsm-> r_rtt_not_allowed = rsm->r_rtt_not_allowed;
nrsm->r_flags = rsm->r_flags;
/* We don't transfer forward the SYN flag */
nrsm->r_flags &= ~BBR_HAS_SYN;
@ -6429,65 +6433,6 @@ tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
bbr->r_ctl.bbr_smallest_srtt_this_state = rtt;
}
static void
bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm,
uint32_t t, uint32_t cts, int ack_type)
{
/*
* For this RSM, we acknowledged the data from a previous
* transmission, not the last one we made. This means we did a false
* retransmit.
*/
if (rsm->r_flags & BBR_HAS_FIN) {
/*
* The sending of the FIN often is multiple sent when we
* have everything outstanding ack'd. We ignore this case
* since its over now.
*/
return;
}
if (rsm->r_flags & BBR_TLP) {
/*
* We expect TLP's to have this occur often
*/
bbr->rc_tlp_rtx_out = 0;
return;
}
if (ack_type != BBR_CUM_ACKED) {
/*
* If it was not a cum-ack we
* don't really know for sure since
* the timestamp could be from some
* other transmission.
*/
return;
}
if (rsm->r_flags & BBR_WAS_SACKPASS) {
/*
* We retransmitted based on a sack and the earlier
* retransmission ack'd it - re-ordering is occuring.
*/
BBR_STAT_INC(bbr_reorder_seen);
bbr->r_ctl.rc_reorder_ts = cts;
}
/* Back down the loss count */
if (rsm->r_flags & BBR_MARKED_LOST) {
bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
rsm->r_flags &= ~BBR_MARKED_LOST;
if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
/* LT sampling also needs adjustment */
bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
}
/***** RRS HERE ************************/
/* Do we need to do this??? */
/* bbr_reset_lt_bw_sampling(bbr, cts); */
/***** RRS HERE ************************/
BBR_STAT_INC(bbr_badfr);
BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start));
}
static void
bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
{
@ -6869,6 +6814,10 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
/* Already done */
return (0);
}
if (rsm->r_rtt_not_allowed) {
/* Not allowed */
return (0);
}
if (rsm->r_rtr_cnt == 1) {
/*
* Only one transmit. Hopefully the normal case.
@ -6926,7 +6875,7 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
rsm->r_tim_lastsent[i], ack_type, to);
if ((i + 1) < rsm->r_rtr_cnt) {
/* Likely */
bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
return (0);
} else if (rsm->r_flags & BBR_TLP) {
bbr->rc_tlp_rtx_out = 0;
}
@ -6974,7 +6923,7 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
t = 1;
bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
rsm->r_tim_lastsent[i], ack_type, to);
bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
return (0);
} else {
/*
* Too many prior transmissions, just
@ -10207,7 +10156,7 @@ bbr_init(struct tcpcb *tp)
tp->t_fb_ptr = NULL;
return (ENOMEM);
}
rsm->r_flags = BBR_OVERMAX;
rsm->r_rtt_not_allowed = 1;
rsm->r_tim_lastsent[0] = cts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
@ -10320,6 +10269,10 @@ bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
counter_u64_add(bbr_flows_whdwr_pacing, -1);
else
counter_u64_add(bbr_flows_nohdwr_pacing, -1);
if (bbr->r_ctl.crte != NULL) {
tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
bbr->r_ctl.crte = NULL;
}
rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
while (rsm) {
TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
@ -13463,15 +13416,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
th->th_seq = htonl(tp->snd_max);
bbr_seq = tp->snd_max;
}
} else if (flags & TH_RST) {
/*
* For a Reset send the last cum ack in sequence
* (this like any other choice may still generate a
* challenge ack, if a ack-update packet is in
* flight).
*/
th->th_seq = htonl(tp->snd_una);
bbr_seq = tp->snd_una;
} else {
/*
* len == 0 and not persist we use snd_max, sending
@ -14536,9 +14480,9 @@ bbr_set_sockopt(struct socket *so, struct sockopt *sopt,
} else {
bbr->bbr_hdw_pace_ena = 0;
#ifdef RATELIMIT
if (bbr->bbr_hdrw_pacing) {
bbr->bbr_hdrw_pacing = 0;
in_pcbdetach_txrtlmt(bbr->rc_inp);
if (bbr->r_ctl.crte != NULL) {
tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
bbr->r_ctl.crte = NULL;
}
#endif
}

File diff suppressed because it is too large

View File

@ -96,6 +96,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#ifdef TCPDEBUG
@ -161,6 +162,130 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
}
#endif
static int
ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
#endif
#ifdef INET
struct ip *ip = NULL; /* Keep compiler happy. */
#endif
int32_t tlen;
uint16_t drop_hdrlen;
uint16_t etype;
uint8_t iptos;
/* Is it the easy way? */
if (m->m_flags & M_LRO_EHDRSTRP)
return (m->m_pkthdr.lro_etype);
/*
* Ok this is the old style call, the ethernet header is here.
* This also means no checksum or BPF were done. This
* can happen if the race to setup the inp fails and
* LRO sees no INP at packet input, but by the time
* we queue the packets an INP gets there. Its rare
* but it can occur so we will handle it. Note that
* this means duplicated work but with the rarity of it
* its not worth worrying about.
*/
/* Let the BPF see the packet */
if (bpf_peers_present(ifp->if_bpf))
ETHER_BPF_MTAP(ifp, m);
/* Now the csum */
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
m_adj(m, sizeof(*eh));
switch (etype) {
#ifdef INET6
case ETHERTYPE_IPV6:
{
if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
if (m == NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
return (-1);
}
}
ip6 = (struct ip6_hdr *)(eh + 1);
th = (struct tcphdr *)(ip6 + 1);
drop_hdrlen = sizeof(*ip6);
tlen = ntohs(ip6->ip6_plen);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in6_cksum_pseudo(ip6, tlen,
IPPROTO_TCP,
m->m_pkthdr.csum_data);
th->th_sum ^= 0xffff;
} else
th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
return (-1);
}
return (etype);
}
#endif
#ifdef INET
case ETHERTYPE_IP:
{
if (m->m_len < sizeof (struct tcpiphdr)) {
m = m_pullup(m, sizeof (struct tcpiphdr));
if (m == NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
return (-1);
}
}
ip = (struct ip *)(eh + 1);
th = (struct tcphdr *)(ip + 1);
drop_hdrlen = sizeof(*ip);
iptos = ip->ip_tos;
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr,
htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP));
th->th_sum ^= 0xffff;
} else {
int len;
struct ipovly *ipov = (struct ipovly *)ip;
/*
* Checksum extended TCP header and data.
*/
len = drop_hdrlen + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = htons(tlen);
th->th_sum = in_cksum(m, len);
/* Reset length for SDT probes. */
ip->ip_len = htons(len);
/* Reset TOS bits */
ip->ip_tos = iptos;
/* Re-initialization for later version check */
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
}
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
return (-1);
}
break;
}
#endif
};
return (etype);
}
/*
* The function ctf_process_inbound_raw() is used by
* transport developers to do the steps needed to
@ -170,6 +295,7 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
* - INP_SUPPORTS_MBUFQ
* - INP_MBUF_QUEUE_READY
* - INP_DONT_SACK_QUEUE
* - INP_MBUF_ACKCMP
*
* These flags help control how LRO will deliver
* packets to the transport. You first set in inp_flags2
@ -186,6 +312,18 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
* In some transport designs this is important since knowing
* the actual time we got the packet is useful information.
*
* A new special type of mbuf may also be supported by the transport
* if it has set the INP_MBUF_ACKCMP flag. If its set, LRO will
* possibly create a M_ACKCMP type mbuf. This is a mbuf with
* an array of "acks". One thing also to note is that when this
* occurs a subsequent LRO may find at the back of the untouched
* mbuf queue chain a M_ACKCMP and append on to it. This means
* that until the transport pulls in the mbuf chain queued
* for it more ack's may get on the mbufs that were already
* delivered. There currently is a limit of 6 acks condensed
* into 1 mbuf which means often when this is occuring, we
* don't get that effect but it does happen.
*
* Now there are some interesting Caveats that the transport
* designer needs to take into account when using this feature.
*
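
To make the M_ACKCMP description above concrete, a minimal sketch of how a transport that set INP_MBUF_ACKCMP might walk such an mbuf; the per-entry processing is left out and the loop shape is an assumption, not code from this diff:

	if (m->m_flags & M_ACKCMP) {
		struct tcp_ackent *ae = mtod(m, struct tcp_ackent *);
		int i, cnt = m->m_len / sizeof(struct tcp_ackent);

		for (i = 0; i < cnt; i++, ae++) {
			/* consume one compressed ack entry */
		}
	}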
@ -247,7 +385,6 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
* shipped in, the tcb has been destroyed (or about to be destroyed).
*/
struct mbuf *m_save;
struct ether_header *eh;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
@ -257,20 +394,18 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
#endif
struct ifnet *ifp;
struct timeval tv;
struct inpcb *inp;
int32_t retval, nxt_pkt, tlen, off;
uint16_t etype;
int etype = 0;
uint16_t drop_hdrlen;
uint8_t iptos, no_vn=0, bpf_req=0;
uint8_t iptos, no_vn=0;
NET_EPOCH_ASSERT();
if (m && m->m_pkthdr.rcvif)
ifp = m->m_pkthdr.rcvif;
if (m)
ifp = m_rcvif(m);
else
ifp = NULL;
if (ifp) {
bpf_req = bpf_peers_present(ifp->if_bpf);
} else {
if (ifp == NULL) {
/*
* We probably should not work around
* but kassert, since lro alwasy sets rcvif.
@ -280,147 +415,86 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
}
CURVNET_SET(ifp->if_vnet);
skip_vnet:
tcp_get_usecs(&tv);
while (m) {
m_save = m->m_nextpkt;
m->m_nextpkt = NULL;
/* Now lets get the ether header */
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
/* Let the BPF see the packet */
if (bpf_req && ifp)
ETHER_BPF_MTAP(ifp, m);
m_adj(m, sizeof(*eh));
/* Trim off the ethernet header */
switch (etype) {
if ((m->m_flags & M_ACKCMP) == 0) {
/* Now lets get the ether header */
etype = ctf_get_enet_type(ifp, m);
if (etype == -1) {
/* Skip this packet it was freed by checksum */
goto skipped_pkt;
}
KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)),
("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype));
/* Trim off the ethernet header */
switch (etype) {
#ifdef INET6
case ETHERTYPE_IPV6:
{
if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
if (m == NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
goto skipped_pkt;
}
}
ip6 = (struct ip6_hdr *)(eh + 1);
th = (struct tcphdr *)(ip6 + 1);
tlen = ntohs(ip6->ip6_plen);
drop_hdrlen = sizeof(*ip6);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in6_cksum_pseudo(ip6, tlen,
IPPROTO_TCP, m->m_pkthdr.csum_data);
th->th_sum ^= 0xffff;
} else
th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
goto skipped_pkt;
}
/*
* Be proactive about unspecified IPv6 address in source.
* As we use all-zero to indicate unbounded/unconnected pcb,
* unspecified IPv6 address can be used to confuse us.
*
* Note that packets with unspecified IPv6 destination is
* already dropped in ip6_input.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
/* XXX stat */
m_freem(m);
goto skipped_pkt;
}
iptos = IPV6_TRAFFIC_CLASS(ip6);
break;
}
case ETHERTYPE_IPV6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tlen = ntohs(ip6->ip6_plen);
drop_hdrlen = sizeof(*ip6);
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
break;
#endif
#ifdef INET
case ETHERTYPE_IP:
{
if (m->m_len < sizeof (struct tcpiphdr)) {
if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
== NULL) {
KMOD_TCPSTAT_INC(tcps_rcvshort);
m_freem(m);
goto skipped_pkt;
}
}
ip = (struct ip *)(eh + 1);
th = (struct tcphdr *)(ip + 1);
drop_hdrlen = sizeof(*ip);
iptos = ip->ip_tos;
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr,
htonl(m->m_pkthdr.csum_data + tlen +
IPPROTO_TCP));
th->th_sum ^= 0xffff;
} else {
int len;
struct ipovly *ipov = (struct ipovly *)ip;
/*
* Checksum extended TCP header and data.
*/
len = drop_hdrlen + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = htons(tlen);
th->th_sum = in_cksum(m, len);
/* Reset length for SDT probes. */
ip->ip_len = htons(len);
/* Reset TOS bits */
ip->ip_tos = iptos;
/* Re-initialization for later version check */
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
}
if (th->th_sum) {
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
m_freem(m);
goto skipped_pkt;
}
break;
}
case ETHERTYPE_IP:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
drop_hdrlen = sizeof(*ip);
iptos = ip->ip_tos;
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
break;
#endif
}
/*
* Convert TCP protocol specific fields to host format.
*/
tcp_fields_to_host(th);
off = th->th_off << 2;
if (off < sizeof (struct tcphdr) || off > tlen) {
KMOD_TCPSTAT_INC(tcps_rcvbadoff);
} /* end switch */
/*
* Convert TCP protocol specific fields to host format.
*/
tcp_fields_to_host(th);
off = th->th_off << 2;
if (off < sizeof (struct tcphdr) || off > tlen) {
printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n",
off,
sizeof(struct tcphdr),
tlen);
KMOD_TCPSTAT_INC(tcps_rcvbadoff);
m_freem(m);
goto skipped_pkt;
}
tlen -= off;
drop_hdrlen += off;
/*
* Now lets setup the timeval to be when we should
* have been called (if we can).
*/
m->m_pkthdr.lro_nsegs = 1;
if (m->m_flags & M_TSTMP_LRO) {
tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
}
tlen -= off;
drop_hdrlen += off;
/*
* Now lets setup the timeval to be when we should
* have been called (if we can).
*/
m->m_pkthdr.lro_nsegs = 1;
/* Now what about next packet? */
} else {
/* Should not be should we kassert instead? */
tcp_get_usecs(&tv);
/*
* This mbuf is an array of acks that have
* been compressed. We assert the inp has
* the flag set to enable this!
*/
KASSERT((tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP),
("tp:%p inp:%p no INP_MBUF_ACKCMP flags?", tp, tp->t_inpcb));
tlen = 0;
drop_hdrlen = 0;
th = NULL;
iptos = 0;
}
/* Now what about next packet? */
tcp_get_usecs(&tv);
if (m_save || has_pkt)
nxt_pkt = 1;
else
nxt_pkt = 0;
KMOD_TCPSTAT_INC(tcps_rcvtotal);
if ((m->m_flags & M_ACKCMP) == 0)
KMOD_TCPSTAT_INC(tcps_rcvtotal);
else
KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent)));
inp = tp->t_inpcb;
INP_WLOCK_ASSERT(inp);
retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
iptos, nxt_pkt, &tv);
if (retval) {
@ -434,6 +508,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
}
if (no_vn == 0)
CURVNET_RESTORE();
INP_UNLOCK_ASSERT(inp);
return(retval);
}
skipped_pkt:
@ -482,11 +557,6 @@ ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
if (rc_sacked <= ctf_outstanding(tp))
return(ctf_outstanding(tp) - rc_sacked);
else {
/* TSNH */
#ifdef INVARIANTS
panic("tp:%p rc_sacked:%d > out:%d",
tp, rc_sacked, ctf_outstanding(tp));
#endif
return (0);
}
}
@ -502,6 +572,36 @@ ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
}
void
ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt)
{
if ((ts != NULL) && (cnt != NULL) &&
(tcp_ack_war_time_window > 0) &&
(tcp_ack_war_cnt > 0)) {
/* We are possibly doing ack war prevention */
uint32_t cts;
/*
* We use a msec tick here which gives us
* roughly 49 days. We don't need the
* precision of a microsecond timestamp which
* would only give us hours.
*/
cts = tcp_ts_getticks();
if (TSTMP_LT((*ts), cts)) {
/* Timestamp is in the past */
*cnt = 0;
*ts = (cts + tcp_ack_war_time_window);
}
if (*cnt < tcp_ack_war_cnt) {
*cnt = (*cnt + 1);
tp->t_flags |= TF_ACKNOW;
} else
tp->t_flags &= ~TF_ACKNOW;
} else
tp->t_flags |= TF_ACKNOW;
}
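
A sketch of the intended calling pattern: where a stack used to set TF_ACKNOW unconditionally, it now passes its per-connection window state (the challenge_ack_ts/challenge_ack_cnt fields that rack grows later in this commit) so repeated forced ACKs get rate limited; rack->r_ctl is assumed here:

	ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts,
	    &rack->r_ctl.challenge_ack_cnt);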
/*
* ctf_drop_checks returns 1 for you should not proceed. It places
* in ret_val what should be returned 1/0 by the caller. The 1 indicates
@ -509,7 +609,10 @@ ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
* TCB is still valid and locked.
*/
int
ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t *tlenp,
int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
uint32_t *ts, uint32_t *cnt)
{
int32_t todrop;
int32_t thflags;
@ -543,7 +646,7 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* Send an ACK to resynchronize and drop any data.
* But keep on processing for RST or ACK.
*/
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
todrop = tlen;
KMOD_TCPSTAT_INC(tcps_rcvduppack);
KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
@ -555,13 +658,14 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* DSACK - add SACK block for dropped range
*/
if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
tcp_update_sack_list(tp, th->th_seq,
th->th_seq + todrop);
/*
* ACK now, as the next in-sequence segment
* will clear the DSACK block again
*/
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
if (tp->t_flags & TF_ACKNOW)
tcp_update_sack_list(tp, th->th_seq,
th->th_seq + todrop);
}
*drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
@ -590,10 +694,10 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* ack.
*/
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
KMOD_TCPSTAT_INC(tcps_rcvwinprobe);
} else {
ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
__ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, ts, cnt);
return (1);
}
} else
@ -614,7 +718,7 @@ ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcp
* and valid.
*/
void
ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val, uint32_t *ts, uint32_t *cnt)
{
/*
* Generate an ACK dropping incoming segment if it occupies sequence
@ -638,7 +742,7 @@ ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t
return;
} else
*ret_val = 0;
tp->t_flags |= TF_ACKNOW;
ctf_ack_war_checks(tp, ts, cnt);
if (m)
m_freem(m);
}
@ -671,7 +775,7 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcp
*/
int dropped = 0;
if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
(tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
KASSERT(tp->t_state != TCPS_SYN_SENT,
@ -680,8 +784,7 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcp
if (V_tcp_insecure_rst ||
(tp->last_ack_sent == th->th_seq) ||
(tp->rcv_nxt == th->th_seq) ||
((tp->last_ack_sent - 1) == th->th_seq)) {
(tp->rcv_nxt == th->th_seq)) {
KMOD_TCPSTAT_INC(tcps_drops);
/* Drop the connection. */
switch (tp->t_state) {
@ -748,7 +851,7 @@ ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t *
}
/*
* bbr_ts_check returns 1 for you should not proceed, the state
* ctf_ts_check returns 1 for you should not proceed, the state
* machine should return. It places in ret_val what should
* be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
* that the TCB is unlocked and probably dropped. The 0 indicates the
@ -786,6 +889,32 @@ ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
return (0);
}
int
ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags)
{
if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
/*
* Invalidate ts_recent. If this segment updates ts_recent,
* the age will be reset later and ts_recent will get a
* valid value. If it does not, setting ts_recent to zero
* will at least satisfy the requirement that zero be placed
* in the timestamp echo reply when ts_recent isn't valid.
* The age isn't reset until we get a valid ts_recent
* because we don't want out-of-order segments to be dropped
* when ts_recent is old.
*/
tp->ts_recent = 0;
} else {
KMOD_TCPSTAT_INC(tcps_rcvduppack);
KMOD_TCPSTAT_INC(tcps_pawsdrop);
return (1);
}
return (0);
}
void
ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
{
@ -817,45 +946,7 @@ ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
uint32_t
ctf_fixed_maxseg(struct tcpcb *tp)
{
int optlen;
if (tp->t_flags & TF_NOOPT)
return (tp->t_maxseg);
/*
* Here we have a simplified code from tcp_addoptions(),
* without a proper loop, and having most of paddings hardcoded.
* We only consider fixed options that we would send every
* time I.e. SACK is not considered.
*
*/
#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
if (tp->t_flags & TF_RCVD_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = 0;
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
} else {
if (tp->t_flags & TF_REQ_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = PAD(TCPOLEN_MAXSEG);
if (tp->t_flags & TF_REQ_SCALE)
optlen += PAD(TCPOLEN_WINDOW);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
if (tp->t_flags & TF_SACK_PERMIT)
optlen += PAD(TCPOLEN_SACK_PERMITTED);
}
#undef PAD
optlen = min(optlen, TCP_MAXOLEN);
return (tp->t_maxseg - optlen);
return (tcp_fixed_maxseg(tp));
}
void

View File

@ -98,12 +98,20 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
uint32_t ctf_outstanding(struct tcpcb *tp);
uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
int
ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
int32_t * drop_hdrlen, int32_t * ret_val);
_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t *tlenp,
int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
uint32_t *ts, uint32_t *cnt);
void ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt);
#define ctf_drop_checks(a, b, c, d, e, f, g, h) _ctf_drop_checks(a, b, c, d, e, f, g, h, NULL, NULL)
void
ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t thflags, int32_t tlen,
int32_t *ret_val, uint32_t *ts, uint32_t *cnt);
#define ctf_do_dropafterack(a, b, c, d, e, f) __ctf_do_dropafterack(a, b, c, d, e, f, NULL, NULL)
void
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t rstreason, int32_t tlen);
@ -122,6 +130,9 @@ int
ctf_ts_check(struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
int
ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags);
void
ctf_calc_rwin(struct socket *so, struct tcpcb *tp);

View File

@ -71,7 +71,7 @@ struct bbr_sendmap {
uint32_t r_del_time; /* The time of the last delivery update */
uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time
* sent */
unused_bit:1,
r_rtt_not_allowed:1, /* No rtt measurement allowed */
r_is_drain:1, /* In a draining cycle */
r_app_limited:1,/* We went app limited */
r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */
@ -588,9 +588,9 @@ struct bbr_control {
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */
/*- ---
/*- ---
* used only initial and close
*/
*/
uint32_t rc_high_rwnd; /* Highest rwnd seen */
uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */

View File

@ -29,7 +29,7 @@
#define _NETINET_TCP_RACK_H_
#define RACK_ACKED 0x0001/* The remote endpoint acked this */
#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order - not used */
#define RACK_TO_REXT 0x0002/* A timeout occured on this sendmap entry */
#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */
#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */
#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
@ -39,37 +39,94 @@
#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */
#define RACK_APP_LIMITED 0x0200/* We went app limited after this send */
#define RACK_WAS_ACKED 0x0400/* a RTO undid the ack, but it already had a rtt calc done */
#define RACK_HAS_SIN 0x0800/* SIN is on this guy */
#define RACK_HAS_SYN 0x0800/* SYN is on this guy */
#define RACK_SENT_W_DSACK 0x1000/* Sent with a dsack */
#define RACK_SENT_SP 0x2000/* sent in slow path */
#define RACK_SENT_FP 0x4000/* sent in fast path */
#define RACK_HAD_PUSH 0x8000/* Push was sent on original send */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
#define RACK_REQ_AVG 4 /* Must be less than 256 */
#define RACK_REQ_AVG 3 /* Must be less than 256 */
struct rack_sendmap {
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time
* sent */
uint16_t r_flags; /* Flags as defined above */
uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint32_t usec_orig_send; /* time of orginal send in useconds */
struct mbuf *m;
uint32_t soff;
uint32_t orig_m_len;
uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */
uint32_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
uint8_t r_dupack; /* Dup ack count */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
uint8_t r_limit_type; /* is this entry counted against a limit? */
uint8_t r_just_ret : 1, /* After sending, the next pkt was just returned, i.e. limited */
r_one_out_nr : 1, /* Special case 1 outstanding and not in recovery */
r_avail : 6;
uint8_t r_resv[36];
r_no_rtt_allowed : 1, /* No rtt measurement allowed */
r_avail : 5;
uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
};
struct deferred_opt_list {
TAILQ_ENTRY(deferred_opt_list) next;
int optname;
uint64_t optval;
};
/*
* Timestamps in the rack sendmap are now moving to be
* uint64_t's. This means that if you want a uint32_t
* usec timestamp (the old usecond timestamp) you simply have
* to cast it to uint32_t. The reason we do this is not for
* wrap, but we need to get back, at times, to the millisecond
* timestamp that is used in the TSTMP option. To do this we
* can use the rack_ts_to_msec() inline below which can take
* the 64bit ts and make into the correct timestamp millisecond
* wise. Thats not possible with the 32bit usecond timestamp since
* the seconds wrap too quickly to cover all bases.
*
* There are quite a few places in rack where I simply cast
* back to uint32_t and then end up using the TSTMP_XX()
* macros. This is ok, but we could do simple compares if
* we ever decided to move all of those variables to 64 bits
* as well.
*/
inline uint64_t
rack_to_usec_ts(struct timeval *tv)
{
return ((tv->tv_sec * HPTS_USEC_IN_SEC) + tv->tv_usec);
}
inline uint32_t
rack_ts_to_msec(uint64_t ts)
{
return((uint32_t)(ts / HPTS_MSEC_IN_SEC));
}
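
A short usage sketch of the helpers above (rsm is a hypothetical struct rack_sendmap pointer): keep the full 64-bit value in the map, truncate when an old-style 32-bit usec comparison is needed, and convert when the millisecond TS-option granularity is wanted:

	uint64_t ts64 = rsm->r_tim_lastsent[0];
	uint32_t ts_usec = (uint32_t)ts64;		/* old-style usec timestamp */
	uint32_t ts_msec = rack_ts_to_msec(ts64);	/* TS-option granularity */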
RB_HEAD(rack_rb_tree_head, rack_sendmap);
TAILQ_HEAD(rack_head, rack_sendmap);
TAILQ_HEAD(def_opt_head, deferred_opt_list);
/* Map change logging */
#define MAP_MERGE 0x01
#define MAP_SPLIT 0x02
#define MAP_NEW 0x03
#define MAP_SACK_M1 0x04
#define MAP_SACK_M2 0x05
#define MAP_SACK_M3 0x06
#define MAP_SACK_M4 0x07
#define MAP_SACK_M5 0x08
#define MAP_FREE 0x09
#define MAP_TRIM_HEAD 0x0a
#define RACK_LIMIT_TYPE_SPLIT 1
@ -128,10 +185,7 @@ struct rack_log {
#define RACK_TO_FRM_DELACK 6
struct rack_opts_stats {
uint64_t tcp_rack_prop_rate;
uint64_t tcp_rack_prop;
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_early_recov;
uint64_t tcp_rack_pace_always;
uint64_t tcp_rack_pace_reduce;
uint64_t tcp_rack_max_seg;
@ -177,6 +231,20 @@ struct rack_opts_stats {
uint64_t tcp_npush;
uint64_t tcp_lscwnd;
uint64_t tcp_profile;
uint64_t tcp_hdwr_rate_cap;
uint64_t tcp_pacing_rate_cap;
uint64_t tcp_pacing_up_only;
uint64_t tcp_use_cmp_acks;
uint64_t tcp_rack_abc_val;
uint64_t tcp_rec_abc_val;
uint64_t tcp_rack_measure_cnt;
uint64_t tcp_rack_delayed_ack;
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
uint64_t tcp_rack_fastrsm_hack;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
};
/* RTT shrink reasons */
@ -247,6 +315,23 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
#define RACK_NUM_FSB_DEBUG 16
struct rack_fast_send_blk {
uint32_t left_to_send;
uint16_t tcp_ip_hdr_len;
uint8_t tcp_flags;
uint8_t hoplimit;
uint8_t *tcp_ip_hdr;
uint32_t recwin;
uint32_t off;
struct tcphdr *th;
struct udphdr *udp;
struct mbuf *m;
uint32_t o_m_len;
uint32_t rfo_apply_push : 1,
unused : 31;
};
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */
@ -255,6 +340,7 @@ struct rack_control {
* tlp_sending Lock(a) */
struct rack_sendmap *rc_resend; /* something we have been asked to
* resend */
struct rack_fast_send_blk fsb; /* The fast-send block */
uint32_t input_pkt;
uint32_t saved_input_pkt;
uint32_t rc_hpts_flags;
@ -268,6 +354,9 @@ struct rack_control {
/* Third Cache line 0x80 */
struct rack_head rc_free; /* Allocation array */
uint64_t last_hw_bw_req;
uint64_t crte_prev_rate;
uint64_t bw_rate_cap;
uint32_t rc_time_last_sent; /* Time we last sent some data and
* logged it Lock(a). */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
@ -342,8 +431,8 @@ struct rack_control {
uint32_t rc_agg_delayed;
uint32_t rc_tlp_rxt_last_time;
uint32_t rc_saved_cwnd;
uint32_t rc_gp_output_ts;
uint32_t rc_gp_cumack_ts;
uint64_t rc_gp_output_ts; /* chg*/
uint64_t rc_gp_cumack_ts; /* chg*/
struct timeval act_rcv_time;
struct timeval rc_last_time_decay; /* SAD time decay happened here */
uint64_t gp_bw;
@ -354,6 +443,7 @@ struct rack_control {
uint64_t last_gp_comp_bw;
uint64_t last_max_bw; /* Our calculated max b/w last */
struct time_filter_small rc_gp_min_rtt;
struct def_opt_head opt_list;
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
@ -370,21 +460,40 @@ struct rack_control {
uint32_t rc_time_of_last_probertt;
uint32_t rc_target_probertt_flight;
uint32_t rc_probertt_sndmax_atexit; /* Highest sent to in probe-rtt */
uint32_t rc_cwnd_at_erec;
uint32_t rc_ssthresh_at_erec;
uint32_t dsack_byte_cnt;
uint32_t retran_during_recovery;
uint32_t rc_gp_lowrtt; /* Lowest rtt seen during GPUT measurement */
uint32_t rc_gp_high_rwnd; /* Highest rwnd seen during GPUT measurement */
uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occured what was snd-max */
uint32_t rc_out_at_rto;
int32_t rc_scw_index;
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint32_t rc_last_timeout_snduna;
uint32_t challenge_ack_ts;
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
* set by the user, if pacing is not happening
* (i.e. its early and we have not turned on yet
* or it was turned off). The minute pacing
* is turned on we pull out the values currently
* being used by newreno and replace them with
* these values, then save off the old values here,
* we also set the flag (if ecn_beta is set) to make
* new_reno do less of a backoff for ecn (think abe).
*/
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_avg; /* average count before we go to normal decay */
uint8_t rc_prop_rate; /* Socket option value Lock(a) */
uint8_t rc_prop_reduce; /* Socket option value Lock(a) */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
uint8_t req_measurements; /* How many measurements are required? */
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_early_recovery; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_min_to; /* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
uint8_t rc_gp_hist_idx;
};
@ -402,21 +511,57 @@ struct tcp_rack {
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint32_t rc_free_cnt; /* Number of free entries on the rc_free list
uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
uint8_t client_bufferlvl; /* 0 - 5 normaly, less than or at 2 means its real low */
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
fast_rsm_hack: 1,
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
* and an ACK will send data. Default is off and its only used
* without pacing when we are doing 5G speed up for there
* ack filtering.
*/
rc_pacing_cc_set: 1, /*
* If we are pacing (pace_always=1) and we have reached the
* point where we start pacing (fixed or gp has reached its
* magic gp_ready state) this flag indicates we have set in
* values to effect CC's backoff's. If pacing is turned off
* then we must restore the values saved in rc_saved_beta,
* if its going to gp_ready we need to copy the values into
* the CC module and set our flags.
*
* Note this only happens if the cc name is newreno (CCALGONAME_NEWRENO).
*/
avail :2;
uint8_t avail_bytes;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
r_limit_scw : 1,
r_avail_bits : 10; /* Available */
r_must_retran : 1, /* For non-sack customers we hit an RTO and new data should be resends */
r_use_cmp_ack: 1, /* Do we use compressed acks */
r_ent_rec_ns: 1, /* We entered recovery and have not sent */
r_might_revert: 1, /* Flag to find out if we might need to revert */
r_fast_output: 1, /* Fast output is in progress we can skip the bulk of rack_output */
r_fsb_inited: 1,
r_rack_hw_rate_caps: 1,
r_up_only: 1,
r_via_fill_cw : 1,
r_fill_less_agg : 1;
uint16_t rc_user_set_max_segs; /* Socket option value Lock(a) */
uint8_t rc_user_set_max_segs; /* Socket option value Lock(a) */
uint8_t rc_labc; /* Appropriate Byte Counting Value */
uint16_t forced_ack : 1,
rc_gp_incr : 1,
rc_gp_bwred : 1,
rc_gp_timely_inc_cnt : 3,
rc_gp_timely_dec_cnt : 3,
rc_not_backing_off: 1,
r_use_labc_for_rec: 1,
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_dack_mode : 1, /* Mac O/S emulation of d-ack */
@ -435,7 +580,7 @@ struct tcp_rack {
rc_always_pace : 1, /* Socket option value Lock(a) */
rc_pace_to_cwnd : 1,
rc_pace_fill_if_rttin_range : 1,
xxx_avail_bits : 1;
rc_srtt_measure_made : 1;
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,

View File

@ -193,6 +193,16 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps,
&tcp_sad_low_pps, 100,
"What is the input pps that below which we do not decay?");
#endif
uint32_t tcp_ack_war_time_window = 1000;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
CTLFLAG_RW,
&tcp_ack_war_time_window, 1000,
"If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
uint32_t tcp_ack_war_cnt = 5;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
CTLFLAG_RW,
&tcp_ack_war_cnt, 5,
"If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
struct rwlock tcp_function_lock;
@ -268,6 +278,18 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_R
&VNET_NAME(tcp_ts_offset_per_conn), 0,
"Initialize TCP timestamps per connection instead of per host pair");
/* How many connections are pacing */
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;
static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
&tcp_pacing_limit, 1000,
"If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
&shadow_num_connections, 0, "Number of TCP connections being paced");
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
@ -3511,6 +3533,54 @@ tcp_maxseg(const struct tcpcb *tp)
return (tp->t_maxseg - optlen);
}
u_int
tcp_fixed_maxseg(const struct tcpcb *tp)
{
int optlen;
if (tp->t_flags & TF_NOOPT)
return (tp->t_maxseg);
/*
* Here we have a simplified code from tcp_addoptions(),
* without a proper loop, and having most of paddings hardcoded.
* We only consider fixed options that we would send every
* time I.e. SACK is not considered. This is important
* for cc modules to figure out what the modulo of the
* cwnd should be.
*/
#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
if (tp->t_flags & TF_RCVD_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = 0;
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
} else {
if (tp->t_flags & TF_REQ_TSTMP)
optlen = TCPOLEN_TSTAMP_APPA;
else
optlen = PAD(TCPOLEN_MAXSEG);
if (tp->t_flags & TF_REQ_SCALE)
optlen += PAD(TCPOLEN_WINDOW);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tp->t_flags & TF_SIGNATURE)
optlen += PAD(TCPOLEN_SIGNATURE);
#endif
if (tp->t_flags & TF_SACK_PERMIT)
optlen += PAD(TCPOLEN_SACK_PERMITTED);
}
#undef PAD
optlen = min(optlen, TCP_MAXOLEN);
return (tp->t_maxseg - optlen);
}
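
Worked example (illustrative): an established connection that negotiated timestamps and nothing else has optlen = TCPOLEN_TSTAMP_APPA = 12 bytes, so with t_maxseg = 1460 a cc module doing its cwnd modulo arithmetic sees a fixed segment size of 1448 bytes.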
static int
sysctl_drop(SYSCTL_HANDLER_ARGS)
{
@ -3972,3 +4042,38 @@ tcp_log_end_status(struct tcpcb *tp, uint8_t status)
}
}
}
int
tcp_can_enable_pacing(void)
{
if ((tcp_pacing_limit == -1) ||
(tcp_pacing_limit > number_of_tcp_connections_pacing)) {
atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
shadow_num_connections = number_of_tcp_connections_pacing;
return (1);
} else {
return (0);
}
}
static uint8_t tcp_pacing_warning = 0;
void
tcp_decrement_paced_conn(void)
{
uint32_t ret;
ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
shadow_num_connections = number_of_tcp_connections_pacing;
KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?"));
if (ret == 0) {
if (tcp_pacing_limit != -1) {
printf("Warning all pacing is now disabled, count decrements invalidly!\n");
tcp_pacing_limit = 0;
} else if (tcp_pacing_warning == 0) {
printf("Warning pacing count is invalid, invalid decrement\n");
tcp_pacing_warning = 1;
}
}
}
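
A sketch of the expected pairing (struct example_stack and its flag are hypothetical): reserve a pacing slot before enabling pacing, and return it exactly once when the connection stops pacing or is destroyed:

static void
example_start_pacing(struct example_stack *st)
{
	/* 0 means we are over tcp_pacing_limit; keep running unpaced. */
	st->pacing_enabled = tcp_can_enable_pacing();
}

static void
example_stop_pacing(struct example_stack *st)
{
	if (st->pacing_enabled) {
		tcp_decrement_paced_conn();
		st->pacing_enabled = 0;
	}
}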

View File

@ -258,6 +258,10 @@ struct tcpcb {
tcp_seq gput_seq; /* Outbound measurement seq */
tcp_seq gput_ack; /* Inbound measurement ack */
int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */
uint32_t t_maxpeakrate; /* max peak rate set by user, in bytes/s */
uint32_t t_sndtlppack; /* tail loss probe packets sent */
uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
uint32_t t_end_info_status; /* Status flag of end info */
unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */
@ -974,6 +978,7 @@ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
void cc_conn_init(struct tcpcb *tp);
void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos);
void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos);
void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
#ifdef TCP_HHOOK
void hhook_run_tcp_est_in(struct tcpcb *tp,
@ -1022,10 +1027,13 @@ extern int32_t tcp_sad_low_pps;
extern int32_t tcp_map_minimum;
extern int32_t tcp_attack_on_turns_on_logging;
#endif
extern uint32_t tcp_ack_war_time_window;
extern uint32_t tcp_ack_war_cnt;
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
u_int tcp_maxseg(const struct tcpcb *);
u_int tcp_fixed_maxseg(const struct tcpcb *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
struct tcp_ifcap *);
void tcp_mss(struct tcpcb *, int);
@ -1075,6 +1083,7 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *);
tcp_seq tcp_new_isn(struct in_conninfo *);
int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
int tcp_dsack_block_exists(struct tcpcb *);
void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
void tcp_clean_dsack_blocks(struct tcpcb *tp);
@ -1090,6 +1099,9 @@ uint32_t tcp_compute_initwnd(uint32_t);
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
int tcp_can_enable_pacing(void);
void tcp_decrement_paced_conn(void);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);