From 3b0b41e6132bb1ba529d20a9ecf4817f0478d6d5 Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Wed, 10 Jul 2019 20:40:39 +0000 Subject: [PATCH] This commit updates rack to what is basically being used at NF as well as sets in some of the groundwork for committing BBR. The hpts system is updated as well as some other needed utilities for the entrance of BBR. This is actually part 1 of 3 more needed commits which will finally complete with BBRv1 being added as a new tcp stack. Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D20834 --- sys/modules/tcp/rack/Makefile | 2 +- sys/netinet/in_pcb.h | 4 +- sys/netinet/tcp.h | 27 +- sys/netinet/tcp_hpts.c | 1337 ++++++++++++---------- sys/netinet/tcp_hpts.h | 130 +-- sys/netinet/tcp_log_buf.h | 29 +- sys/netinet/tcp_stacks/rack.c | 769 ++++++------- sys/netinet/tcp_stacks/rack_bbr_common.c | 859 ++++++++++++++ sys/netinet/tcp_stacks/rack_bbr_common.h | 69 +- sys/netinet/tcp_var.h | 8 +- sys/sys/mbuf.h | 1 + 11 files changed, 2090 insertions(+), 1145 deletions(-) create mode 100644 sys/netinet/tcp_stacks/rack_bbr_common.c diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile index 468492e753ba..55ca2ad080ac 100644 --- a/sys/modules/tcp/rack/Makefile +++ b/sys/modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c +SRCS= rack.c sack_filter.c rack_bbr_common.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index d074ad7e0a21..1d2ee37ec9dc 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -759,7 +759,9 @@ int inp_so_options(const struct inpcb *inp); #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ - +#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO 
*/ +#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */ +#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */ /* * Flags passed to in_pcblookup*() functions. */ diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 9cd44ce80b74..6531decb0bfe 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -201,9 +201,8 @@ struct tcphdr { #define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ #define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */ #define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ -#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */ #define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ -#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */ +#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ #define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ #define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ #define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ @@ -211,14 +210,18 @@ struct tcphdr { #define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ #define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ #define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ -#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */ -#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */ -#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */ +#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ +#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ +#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ +#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ 
+#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ +#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ #define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ #define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ #define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ -#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */ +#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */ +#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ #define TCP_BBR_PACE_PER_SEC 1086 @@ -227,17 +230,27 @@ struct tcphdr { #define TCP_BBR_PACE_SEG_MIN 1089 #define TCP_BBR_PACE_CROSS 1090 #define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ -#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ #define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ #define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ +#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */ #define TCP_RACK_TLP_USE 1095 #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ +#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 #define TCP_DATA_AFTER_CLOSE 1100 #define TCP_BBR_PROBE_RTT_GAIN 1101 #define TCP_BBR_PROBE_RTT_LEN 1102 +#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? 
*/ +#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */ +#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ +#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ +#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ +#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ +#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ +#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ +#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ /* Start of reserved space for third-party user-settable options. */ diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index f649a73fcf03..a557a2362053 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -37,7 +37,7 @@ __FBSDID("$FreeBSD$"); * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * - * First, and probably the main thing its used by Rack and BBR for, it can + * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The @@ -59,42 +59,57 @@ __FBSDID("$FreeBSD$"); * to prevent output processing until the time alotted has gone by. * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. - * - * Now the tcp_hpts system will call tcp_output in one of two forms, - * it will first check to see if the stack as defined a - * tfb_tcp_output_wtime() function, if so that is the routine it - * will call, if that function is not defined then it will call the - * tfb_tcp_output() function. 
The only difference between these - * two calls is that the former passes the time in to the function - * so the function does not have to access the time (which tcp_hpts - * already has). What these functions do is of course totally up - * to the individual tcp stack. - * + * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort - * a connection (later) or process input on a connection. - * Why would you want to do this? To keep processor locality. + * a connection (later) or process input on a connection. + * Why would you want to do this? To keep processor locality + * and/or not have to worry about untangling any recursive + * locks. The input function now is hooked to the new LRO + * system as well. * - * So in order to use the input redirection function the - * stack changes its tcp_do_segment() routine to instead - * of process the data call the function: + * In order to use the input redirection function the + * tcp stack must define an input function for + * tfb_do_queued_segments(). This function understands + * how to dequeue an array of packets that were input and + * knows how to call the correct processing routine. * - * tcp_queue_pkt_to_input() - * - * You will note that the arguments to this function look - * a lot like tcp_do_segments's arguments. This function - * will assure that the tcp_hpts system will - * call the functions tfb_tcp_hpts_do_segment() from the - * correct CPU. Note that multiple calls can get pushed - * into the tcp_hpts system this will be indicated by - * the next to last argument to tfb_tcp_hpts_do_segment() - * (nxt_pkt). If nxt_pkt is a 1 then another packet is - * coming. If nxt_pkt is a 0 then this is the last call - * that the tcp_hpts system has available for the tcp stack. + * Locking in this is important as well so most likely the + * stack will need to define the tfb_do_segment_nounlock() + * splitting tfb_do_segment() into two parts. 
The main processing + * part that does not unlock the INP and returns a value of 1 or 0. + * It returns 0 if all is well and the lock was not released. It + * returns 1 if we had to destroy the TCB (a reset received etc). + * The remains of tfb_do_segment() then become just a simple call + * to the tfb_do_segment_nounlock() function and check the return + * code and possibly unlock. * - * The other point of the input system is to be able to safely - * drop a tcp connection without worrying about the recursive - * locking that may be occuring on the INP_WLOCK. So if + * The stack must also set the flag on the INP that it supports this + * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes + * this flag as well and will queue packets when it is set. + * There are other flags as well INP_MBUF_QUEUE_READY and + * INP_DONT_SACK_QUEUE. The first flag tells the LRO code + * that we are in the pacer for output so there is no + * need to wake up the hpts system to get immediate + * input. The second tells the LRO code that it's okay + * if a SACK arrives you can still defer input and let + * the current hpts timer run (this is usually set when + * a rack timer is up so we know SACK's are happening + * on the connection already and don't want to wakeup yet). + * + * There is a common function within the rack_bbr_common code + * version i.e. ctf_do_queued_segments(). This function + * knows how to take the input queue of packets from + * tp->t_in_pkts and process them digging out + * all the arguments, calling any bpf tap and + * calling into tfb_do_segment_nounlock(). The common + * function (ctf_do_queued_segments()) requires that + * you have defined the tfb_do_segment_nounlock() as + * described above. + * + * The second feature of the input side of hpts is the + * dropping of a connection. This is due to the way that + * locking may have occurred on the INP_WLOCK. 
So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) @@ -156,6 +171,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef tcpdebug #include @@ -168,24 +184,19 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -#include -#include static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 2; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); -static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; - -TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); - static struct tcp_hptsi tcp_pace; +static int hpts_does_tp_logging = 0; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); -static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); +static void tcp_hptsi(struct tcp_hpts_entry *hpts); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); @@ -204,8 +215,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls"); } \ } while (0) -static int32_t logging_on = 0; -static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; struct hpts_domain_info { @@ -219,44 +228,75 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, - &logging_on, 0, - "Turn on logging if compiled in"); +counter_u64_t hpts_hopelessly_behind; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD, + &hpts_hopelessly_behind, + "Number of times hpts could not catch up and was behind hopelessly"); counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); + counter_u64_t back_tosleep; 
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); -static int32_t in_newts_every_tcb = 0; +counter_u64_t combined_wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, - &in_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for input"); -static int32_t in_ts_percision = 0; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, + &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, - &in_ts_percision, 0, - "Do we use percise timestamp for clients on input"); -static int32_t out_newts_every_tcb = 0; +counter_u64_t wheel_wrap; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD, + &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, - &out_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for output"); static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, + &hpts_does_tp_logging, 0, + "Do we add to any tp that has logging on pacer logs"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, +static int32_t max_pacer_loops = 10; +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW, + &max_pacer_loops, 10, + "What is the maximum number of times the pacer will loop trying to catch up"); + +#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2) + +static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED; + + +static int +sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = hpts_sleep_max; + error = sysctl_handle_int(oidp, &new, 0, req); + 
if (error == 0 && req->newptr) { + if ((new < (NUM_OF_HPTSI_SLOTS / 4)) || + (new > HPTS_MAX_SLEEP_ALLOWED)) + error = EINVAL; + else + hpts_sleep_max = new; + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, + CTLTYPE_UINT | CTLFLAG_RW, &hpts_sleep_max, 0, - "The maximum time the hpts will sleep <1 - 254>"); + &sysctl_net_inet_tcp_hpts_max_sleep, "IU", + "Maximum time hpts will sleep"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, @@ -267,55 +307,35 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, "Do we have the callout call directly to the hpts?"); static void -__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, - uint32_t ticknow, int32_t line) +tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, + int ticks_to_run, int idx) { - struct hpts_log *pl; - - HPTS_MTX_ASSERT(hpts); - if (hpts->p_log == NULL) - return; - pl = &hpts->p_log[hpts->p_log_at]; - hpts->p_log_at++; - if (hpts->p_log_at >= hpts->p_logsize) { - hpts->p_log_at = 0; - hpts->p_log_wrapped = 1; - } - pl->inp = inp; - if (inp) { - pl->t_paceslot = inp->inp_hptsslot; - pl->t_hptsreq = inp->inp_hpts_request; - pl->p_onhpts = inp->inp_in_hpts; - pl->p_oninput = inp->inp_in_input; - } else { - pl->t_paceslot = 0; - pl->t_hptsreq = 0; - pl->p_onhpts = 0; - pl->p_oninput = 0; - } - pl->is_notempty = 1; - pl->event = event; - pl->line = line; - pl->cts = tcp_get_usecs(NULL); - pl->p_curtick = hpts->p_curtick; - pl->p_prevtick = hpts->p_prevtick; - pl->p_on_queue_cnt = hpts->p_on_queue_cnt; - pl->ticknow = ticknow; - pl->slot_req = slot; - pl->p_nxt_slot = hpts->p_nxt_slot; - pl->p_cur_slot = hpts->p_cur_slot; - pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; - pl->p_flags = (hpts->p_cpu & 0x7f); - pl->p_flags <<= 7; - pl->p_flags |= (hpts->p_num & 0x7f); - pl->p_flags <<= 2; - if (hpts->p_hpts_active) { - pl->p_flags |= HPTS_HPTS_ACTIVE; - } + union 
tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.use_lt_bw = 1; + log.u_bbr.inflight = ticks_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.cur_del_rate = hpts->p_runningtick; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); } -#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) - static void hpts_timeout_swi(void *arg) { @@ -347,12 +367,6 @@ hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hpt /* We are not on the hpts? */ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } - if (TAILQ_EMPTY(head) && - (hpts->p_on_queue_cnt != 0)) { - /* We should not be empty with a queue count */ - panic("%s hpts:%p hpts bucket empty but cnt:%d", - __FUNCTION__, hpts, hpts->p_on_queue_cnt); - } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; @@ -456,58 +470,13 @@ hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) in_pcbref(inp); } -static int -sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) -{ - struct tcp_hpts_entry *hpts; - size_t sz; - int32_t logging_was, i; - int32_t error = 0; - - /* - * HACK: Turn off logging so no locks are required this really needs - * a memory barrier :) - */ - logging_was = logging_on; - logging_on = 0; - if (!req->oldptr) { - /* How much? 
*/ - sz = 0; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - sz += (sizeof(struct hpts_log) * hpts->p_logsize); - } - error = SYSCTL_OUT(req, 0, sz); - } else { - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - if (hpts->p_log_wrapped) - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - else - sz = (sizeof(struct hpts_log) * hpts->p_log_at); - error = SYSCTL_OUT(req, hpts->p_log, sz); - } - } - logging_on = logging_was; - return error; -} - -SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); - - static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -515,10 +484,9 @@ static void tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -648,8 +616,8 @@ tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. - * Note that you can or both values together and get two - * actions. + * Note that you can use one or both values together + * and get two actions. 
*/ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) @@ -670,53 +638,198 @@ __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) } static inline int -hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) +hpts_tick(uint32_t wheel_tick, uint32_t plus) { - return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); + /* + * Given a slot on the wheel, what slot + * is that plus ticks out? + */ + KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick)); + return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS); +} + +static inline int +tick_to_wheel(uint32_t cts_in_wticks) +{ + /* + * Given a timestamp in wheel ticks (10usec inc's) + * map it to our limited space wheel. + */ + return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); +} + +static inline int +hpts_ticks_diff(int prev_tick, int tick_now) +{ + /* + * Given two ticks that are someplace + * on our wheel. How far are they apart? + */ + if (tick_now > prev_tick) + return (tick_now - prev_tick); + else if (tick_now == prev_tick) + /* + * Special case, same means we can go all of our + * wheel less one slot. + */ + return (NUM_OF_HPTSI_SLOTS - 1); + else + return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now); +} + +/* + * Given a tick on the wheel that is the current time + * mapped to the wheel (wheel_tick), what is the maximum + * distance forward that can be obtained without + * wrapping past either prev_tick or running_tick + * depending on the htps state? Also if passed + * a uint32_t *, fill it with the tick location. + * + * Note if you do not give this function the current + * time (that you think it is) mapped to the wheel + * then the results will not be what you expect and + * could lead to invalid inserts. 
+ */ +static inline int32_t +max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick) +{ + uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel; + + if ((hpts->p_hpts_active == 1) && + (hpts->p_wheel_complete == 0)) { + end_tick = hpts->p_runningtick; + /* Back up one tick */ + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + } else { + /* + * For the case where we are + * not active, or we have + * completed the pass over + * the wheel, we can use the + * prev tick and subtract one from it. This puts us + * as far out as possible on the wheel. + */ + end_tick = hpts->p_prev_slot; + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + /* + * Now we have close to the full wheel left minus the + * time it has been since the pacer went to sleep. Note + * that wheel_tick, passed in, should be the current time + * from the perspective of the caller, mapped to the wheel. + */ + if (hpts->p_prev_slot != wheel_tick) + dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + else + dis_to_travel = 1; + /* + * dis_to_travel in this case is the space from when the + * pacer stopped (p_prev_slot) and where our wheel_tick + * is now. To know how many slots we can put it in we + * subtract from the wheel size. We would not want + * to place something after p_prev_slot or it will + * get ran too soon. + */ + return (NUM_OF_HPTSI_SLOTS - dis_to_travel); + } + /* + * So how many slots are open between p_runningtick -> p_cur_slot + * that is what is currently un-available for insertion. Special + * case when we are at the last slot, this gets 1, so that + * the answer to how many slots are available is all but 1. 
+ */ + if (hpts->p_runningtick == hpts->p_cur_slot) + dis_to_travel = 1; + else + dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + /* + * How long has the pacer been running? + */ + if (hpts->p_cur_slot != wheel_tick) { + /* The pacer is a bit late */ + pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick); + } else { + /* The pacer is right on time, now == pacers start time */ + pacer_to_now = 0; + } + /* + * To get the number left we can insert into we simply + * subtract the distance the pacer has to run from how + * many slots there are. + */ + avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel; + /* + * Now how many of those we will eat due to the pacer's + * time (p_cur_slot) of start being behind the + * real time (wheel_tick)? + */ + if (avail_on_wheel <= pacer_to_now) { + /* + * Wheel wrap, we can't fit on the wheel, that + * is unusual the system must be way overloaded! + * Insert into the assured tick, and return special + * "0". + */ + counter_u64_add(combined_wheel_wrap, 1); + *target_tick = hpts->p_nxt_slot; + return (0); + } else { + /* + * We know how many slots are open + * on the wheel (the reverse of what + * is left to run). Take away the time + * the pacer started to now (wheel_tick) + * and that tells you how many slots are + * open that can be inserted into that won't + * be touched by the pacer until later. 
+ */ + return (avail_on_wheel - pacer_to_now); + } } static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { - int32_t need_wake = 0; - uint32_t ticknow = 0; - + uint32_t need_wake = 0; + HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ - if (hpts->p_hpts_active == 0) { - /* A sleeping hpts we want in next slot to run */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, - hpts_tick(hpts, 1)); - } - inp->inp_hptsslot = hpts_tick(hpts, 1); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); - } - need_wake = 1; + inp->inp_hpts_request = 0; + if ((hpts->p_hpts_active == 0) || + (hpts->p_wheel_complete)) { + /* + * A sleeping hpts we want in next slot to run + * note that in this state p_prev_slot == p_cur_slot + */ + inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1); + if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) + need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* + * The hpts system is running and the caller + * was awoken by the hpts system. * We can't allow you to go into the same slot we - * are in. We must put you out. + * are in (we don't want a loop :-D). */ inp->inp_hptsslot = hpts->p_nxt_slot; } else - inp->inp_hptsslot = hpts->p_cur_slot; + inp->inp_hptsslot = hpts->p_runningtick; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); - } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. 
*/ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } @@ -737,141 +850,129 @@ __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line) return (ret); } +#ifdef INVARIANTS static void -tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, - struct hpts_diag *diag, int32_t noref) +check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line) { - int32_t need_new_to = 0; - int32_t need_wakeup = 0; - uint32_t largest_slot; - uint32_t ticknow = 0; - uint32_t slot_calc; + /* + * Sanity checks for the pacer with invariants + * on insert. + */ + if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS) + panic("hpts:%p inp:%p slot:%d > max", + hpts, inp, inp_hptsslot); + if ((hpts->p_hpts_active) && + (hpts->p_wheel_complete == 0)) { + /* + * If the pacer is processing a arc + * of the wheel, we need to make + * sure we are not inserting within + * that arc. 
+ */ + int distance, yet_to_run; + + distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot); + if (hpts->p_runningtick != hpts->p_cur_slot) + yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + else + yet_to_run = 0; /* processing last slot */ + if (yet_to_run > distance) { + panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", + hpts, inp, inp_hptsslot, + distance, yet_to_run, + hpts->p_runningtick, hpts->p_cur_slot); + } + } +} +#endif + +static void +tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line, + struct hpts_diag *diag, struct timeval *tv) +{ + uint32_t need_new_to = 0; + uint32_t wheel_cts, last_tick; + int32_t wheel_tick, maxticks; + int8_t need_wakeup = 0; HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; + diag->p_prev_slot = hpts->p_prev_slot; + diag->p_runningtick = hpts->p_runningtick; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; + diag->p_curtick = hpts->p_curtick; + diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; + diag->p_on_min_sleep = hpts->p_on_min_sleep; + diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((inp->inp_in_hpts == 0) || noref) { - inp->inp_hpts_request = slot; + if (inp->inp_in_hpts == 0) { if (slot == 0) { /* Immediate */ - tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); + tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); return; } - if (hpts->p_hpts_active) { - /* - * Its slot - 1 since nxt_slot is the next tick that - * will go off since the hpts is awake - */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); - } - /* - * We want to make sure that we don't place a inp in - * the range of p_cur_slot <-> p_nxt_slot. If we - * take from p_nxt_slot to the end, plus p_cur_slot - * and then take away 2, we will know how many is - * the max slots we can use. 
- */ - if (hpts->p_nxt_slot > hpts->p_cur_slot) { - /* - * Non-wrap case nxt_slot <-> cur_slot we - * don't want to land in. So the diff gives - * us what is taken away from the number of - * slots. + /* Get the current time relative to the wheel */ + wheel_cts = tcp_tv_to_hptstick(tv); + /* Map it onto the wheel */ + wheel_tick = tick_to_wheel(wheel_cts); + /* Now what's the max we can place it at? */ + maxticks = max_ticks_available(hpts, wheel_tick, &last_tick); + if (diag) { + diag->wheel_tick = wheel_tick; + diag->maxticks = maxticks; + diag->wheel_cts = wheel_cts; + } + if (maxticks == 0) { + /* The pacer is in a wheel wrap behind, yikes! */ + if (slot > 1) { + /* + * Reduce by 1 to prevent a forever loop in + * case something else is wrong. Note this + * probably does not hurt because the pacer + * if its true is so far behind we will be + * > 1second late calling anyway. */ - largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); - } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - } else { - /* - * Wrap case so the diff gives us the number - * of slots that we can land in. 
- */ - largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; - } - /* - * We take away two so we never have a problem (20 - * usec's) out of 1024000 usecs - */ - largest_slot -= 2; - if (inp->inp_hpts_request > largest_slot) { - /* - * Restrict max jump of slots and remember - * leftover - */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* This one will run when we hit it */ - inp->inp_hpts_request = 0; - } - if (hpts->p_nxt_slot == hpts->p_cur_slot) - slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; - else - slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; - if (slot_calc == hpts->p_cur_slot) { -#ifdef INVARIANTS - /* TSNH */ - panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", - hpts, slot_calc, slot, largest_slot); -#endif - if (slot_calc) - slot_calc--; - else - slot_calc = NUM_OF_HPTSI_SLOTS - 1; - } - inp->inp_hptsslot = slot_calc; - if (diag) { - diag->inp_hptsslot = inp->inp_hptsslot; + slot--; } + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request = slot; + } else if (maxticks >= slot) { + /* It all fits on the wheel */ + inp->inp_hpts_request = 0; + inp->inp_hptsslot = hpts_tick(wheel_tick, slot); } else { + /* It does not fit */ + inp->inp_hpts_request = slot - maxticks; + inp->inp_hptsslot = last_tick; + } + if (diag) { + diag->slot_remaining = inp->inp_hpts_request; + diag->inp_hptsslot = inp->inp_hptsslot; + } +#ifdef INVARIANTS + check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); +#endif + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); + if ((hpts->p_hpts_active == 0) && + (inp->inp_hpts_request == 0) && + (hpts->p_on_min_sleep == 0)) { /* - * The hpts is sleeping, we need to figure out where + * The hpts is sleeping and not on a minimum + * sleep time, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. 
*/ uint32_t have_slept, yet_to_sleep; - uint32_t slot_now; - struct timeval tv; - ticknow = tcp_gethptstick(&tv); - slot_now = ticknow % NUM_OF_HPTSI_SLOTS; - /* - * The user wants to be inserted at (slot_now + - * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. - */ - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - if (inp->inp_hpts_request > largest_slot) { - /* Adjust the residual in inp_hpts_request */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* No residual it all fits */ - inp->inp_hpts_request = 0; - } - inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; - if (diag) { - diag->slot_now = slot_now; - diag->inp_hptsslot = inp->inp_hptsslot; - diag->p_on_min_sleep = hpts->p_on_min_sleep; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); - } /* Now do we need to restart the hpts's timer? */ - if (TSTMP_GT(ticknow, hpts->p_curtick)) - have_slept = ticknow - hpts->p_curtick; - else - have_slept = 0; - if (have_slept < hpts->p_hpts_sleep_time) { - /* This should be what happens */ + have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + if (have_slept < hpts->p_hpts_sleep_time) yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; - } else { + else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; @@ -879,29 +980,22 @@ tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; - diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { + if (yet_to_sleep && + (yet_to_sleep > slot)) { /* - * We need to reschedule the hptss time-out. + * We need to reschedule the hpts's time-out. 
*/ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } - hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); - } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { @@ -944,9 +1038,10 @@ tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t } uint32_t -tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){ +tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag) +{ struct tcp_hpts_entry *hpts; - uint32_t slot_on, cts; + uint32_t slot_on; struct timeval tv; /* @@ -956,12 +1051,8 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts */ INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); - if (in_ts_percision) - microuptime(&tv); - else - getmicrouptime(&tv); - cts = tcp_tv_to_usectick(&tv); - tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); + microuptime(&tv); + tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv); slot_on = hpts->p_nxt_slot; mtx_unlock(&hpts->p_mtx); return (slot_on); @@ -971,7 +1062,6 @@ uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } - int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { @@ -986,9 +1076,6 @@ __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int3 /* * Activate the hpts if it is sleeping. 
*/ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); - } retval = 2; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); @@ -1001,36 +1088,14 @@ __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int3 return (retval); } -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) -{ - /* Setup packet for input first */ - INP_WLOCK_ASSERT(tp->t_inpcb); - m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); - m->m_pkthdr.pace_tlen = (uint16_t) tlen; - m->m_pkthdr.pace_drphdrlen = drop_hdrlen; - m->m_pkthdr.pace_tos = iptos; - m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0); - if (tp->t_in_pkt == NULL) { - tp->t_in_pkt = m; - tp->t_tail_pkt = m; - } else { - tp->t_tail_pkt->m_nextpkt = m; - tp->t_tail_pkt = m; - } -} - - int32_t -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){ +__tcp_queue_to_input(struct inpcb *inp, int line) +{ struct tcp_hpts_entry *hpts; int32_t ret; - tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); - hpts = tcp_input_lock(tp->t_inpcb); - ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); + hpts = tcp_input_lock(inp); + ret = __tcp_queue_to_input_locked(inp, hpts, line); mtx_unlock(&hpts->p_mtx); return (ret); } @@ -1132,6 +1197,25 @@ hpts_cpuid(struct inpcb *inp){ #endif } +static void +tcp_drop_in_pkts(struct tcpcb *tp) +{ + struct mbuf *m, *n; + + m = tp->t_in_pkt; + if (m) + n = m->m_nextpkt; + else + n = NULL; + tp->t_in_pkt = NULL; + while (m) { + m_freem(m); + m = n; + if (m) + n = m->m_nextpkt; + } +} + /* * Do NOT try to optimize the processing of inp's * by first pulling off all the inp's into a temporary @@ -1142,7 +1226,7 @@ hpts_cpuid(struct inpcb *inp){ * but then while you were processing one of the inp's * some other one that you switch will get a new * packet on the different 
CPU. It will insert it - * on the new hptss input list. Creating a temporary + * on the new hpts's input list. Creating a temporary * link in the inp will not fix it either, since * the other hpts will be doing the same thing and * you will both end up using the temporary link. @@ -1155,16 +1239,18 @@ hpts_cpuid(struct inpcb *inp){ static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) { - struct mbuf *m, *n; struct tcpcb *tp; struct inpcb *inp; uint16_t drop_reason; int16_t set_cpu; uint32_t did_prefetch = 0; - int32_t ti_locked = TI_UNLOCKED; + int dropped; struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); +#ifndef VIMAGE + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); @@ -1177,26 +1263,22 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) drop_reason = inp->inp_hpts_drop_reas; inp->inp_in_input = 0; mtx_unlock(&hpts->p_mtx); - CURVNET_SET(inp->inp_vnet); - if (drop_reason) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else { - ti_locked = TI_UNLOCKED; - } INP_WLOCK(inp); +#ifdef VIMAGE + CURVNET_SET(inp->inp_vnet); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out: hpts->p_inp = NULL; - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); } - ti_locked = TI_UNLOCKED; +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); +#endif mtx_lock(&hpts->p_mtx); continue; } @@ -1206,26 +1288,17 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) } if (drop_reason) { /* This tcb is being destroyed for drop_reason */ - m = tp->t_in_pkt; - if (m) - n = m->m_nextpkt; - else - n = NULL; - tp->t_in_pkt = NULL; - while (m) { - m_freem(m); - m = n; - if (m) - n = m->m_nextpkt; - } + tcp_drop_in_pkts(tp); tp = tcp_drop(tp, 
drop_reason); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); +#endif mtx_lock(&hpts->p_mtx); continue; } @@ -1246,220 +1319,184 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) */ tcp_set_hpts(inp); } - m = tp->t_in_pkt; - n = NULL; - if (m != NULL && - (m->m_pkthdr.pace_lock == TI_RLOCKED || - tp->t_state != TCPS_ESTABLISHED)) { - ti_locked = TI_RLOCKED; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - m = tp->t_in_pkt; - } - if (in_newts_every_tcb) { - if (in_ts_percision) - microuptime(tv); - else - getmicrouptime(tv); - } if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - /* Any input work to do, if so do it first */ - if ((m != NULL) && (m == tp->t_in_pkt)) { - struct tcphdr *th; - int32_t tlen, drop_hdrlen, nxt_pkt; - uint8_t iptos; - - n = m->m_nextpkt; - tp->t_in_pkt = tp->t_tail_pkt = NULL; - while (m) { - th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); - tlen = m->m_pkthdr.pace_tlen; - drop_hdrlen = m->m_pkthdr.pace_drphdrlen; - iptos = m->m_pkthdr.pace_tos; - m->m_nextpkt = NULL; - if (n) - nxt_pkt = 1; - else - nxt_pkt = 0; - inp->inp_input_calls = 1; - if (tp->t_fb->tfb_tcp_hpts_do_segment) { - /* Use the hpts specific do_segment */ - (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, - tp, drop_hdrlen, - tlen, iptos, nxt_pkt, tv); - } else { - /* Use the default do_segment */ - (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket, - tp, drop_hdrlen, - tlen, iptos); - } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - /* - * Do segment returns unlocked we need the - * lock again but we also need some kasserts - * here. 
- */ - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - INP_UNLOCK_ASSERT(inp); - m = n; - if (m) - n = m->m_nextpkt; - if (m != NULL && - m->m_pkthdr.pace_lock == TI_RLOCKED) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else - ti_locked = TI_UNLOCKED; + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + if (inp->inp_in_input) + tcp_hpts_remove(inp, HPTS_REMOVE_INPUT); + dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); + if (dropped) { + /* Re-acquire the wlock so we can release the reference */ INP_WLOCK(inp); - /* - * Since we have an opening here we must - * re-check if the tcb went away while we - * were getting the lock(s). - */ - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { - while (m) { - m_freem(m); - m = n; - if (m) - n = m->m_nextpkt; - } - goto out; - } - /* - * Now that we hold the INP lock, check if - * we need to upgrade our lock. - */ - if (ti_locked == TI_UNLOCKED && - (tp->t_state != TCPS_ESTABLISHED)) { - ti_locked = TI_RLOCKED; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - } - } /** end while(m) */ - } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ + } + } else if (tp->t_in_pkt) { + /* + * We reach here only if we had a + * stack that supported INP_SUPPORTS_MBUFQ + * and then somehow switched to a stack that + * does not. The packets are basically stranded + * and would hang with the connection until + * cleanup without this code. Its not the + * best way but I know of no other way to + * handle it since the stack needs functions + * it does not have to handle queued packets. 
+ */ + tcp_drop_in_pkts(tp); + } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); - ti_locked = TI_UNLOCKED; +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + CURVNET_RESTORE(); +#endif mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; - CURVNET_RESTORE(); } -} - -static int -tcp_hpts_est_run(struct tcp_hpts_entry *hpts) -{ - int32_t ticks_to_run; - - if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { - ticks_to_run = hpts->p_curtick - hpts->p_prevtick; - if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { - ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; - } - } else { - if (hpts->p_prevtick == hpts->p_curtick) { - /* This happens when we get woken up right away */ - return (-1); - } - ticks_to_run = 1; - } - /* Set in where we will be when we catch up */ - hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; - if (hpts->p_nxt_slot == hpts->p_cur_slot) { - panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d", - hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); - } - return (ticks_to_run); +#ifndef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); +#endif } static void -tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) +tcp_hptsi(struct tcp_hpts_entry *hpts) { + struct epoch_tracker et; struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; - int32_t ticks_to_run, i, error, tick_now, interum_tick; + int32_t ticks_to_run, i, error; int32_t paced_cnt = 0; + int32_t loop_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; - uint32_t cts; + int32_t wrap_loop_cnt = 0; int16_t set_cpu; HPTS_MTX_ASSERT(hpts); - hpts->p_curtick = tcp_tv_to_hptstick(ctick); - cts = tcp_tv_to_usectick(ctick); - memcpy(&tv, ctick, sizeof(struct timeval)); - hpts->p_cur_slot = hpts_tick(hpts, 1); + /* 
record previous info for any logging */ + hpts->saved_lasttick = hpts->p_lasttick; + hpts->saved_curtick = hpts->p_curtick; + hpts->saved_curslot = hpts->p_cur_slot; + hpts->saved_prev_slot = hpts->p_prev_slot; - /* Figure out if we had missed ticks */ + hpts->p_lasttick = hpts->p_curtick; + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if ((hpts->p_on_queue_cnt == 0) || + (hpts->p_lasttick == hpts->p_curtick)) { + /* + * No time has yet passed, + * or nothing to do. + */ + hpts->p_prev_slot = hpts->p_cur_slot; + hpts->p_lasttick = hpts->p_curtick; + goto no_run; + } again: + hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); - ticks_to_run = tcp_hpts_est_run(hpts); - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); + ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot); + if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) && + (hpts->p_on_queue_cnt != 0)) { + /* + * Wheel wrap is occuring, basically we + * are behind and the distance between + * run's has spread so much it has exceeded + * the time on the wheel (1.024 seconds). This + * is ugly and should NOT be happening. We + * need to run the entire wheel. We last processed + * p_prev_slot, so that needs to be the last slot + * we run. The next slot after that should be our + * reserved first slot for new, and then starts + * the running postion. Now the problem is the + * reserved "not to yet" place does not exist + * and there may be inp's in there that need + * running. We can merge those into the + * first slot at the head. 
+ */ + wrap_loop_cnt++; + hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1); + hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2); + /* + * Adjust p_cur_slot to be where we are starting from + * hopefully we will catch up (fat chance if something + * is broken this bad :( ) + */ + hpts->p_cur_slot = hpts->p_prev_slot; + /* + * The next slot has guys to run too, and that would + * be where we would normally start, lets move them into + * the next slot (p_prev_slot + 2) so that we will + * run them, the extra 10usecs of late (by being + * put behind) does not really matter in this situation. + */ +#ifdef INVARIANTS + /* + * To prevent a panic we need to update the inpslot to the + * new location. This is safe since it takes both the + * INP lock and the pacer mutex to change the inp_hptsslot. + */ + TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) { + inp->inp_hptsslot = hpts->p_runningtick; + } +#endif + TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick], + &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts); + ticks_to_run = NUM_OF_HPTSI_SLOTS - 1; + counter_u64_add(wheel_wrap, 1); + } else { + /* + * Nxt slot is always one after p_runningtick though + * its not used usually unless we are doing wheel wrap. 
+ */ + hpts->p_nxt_slot = hpts->p_prev_slot; + hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts, hpts->p_on_inqueue_cnt); } #endif HPTS_MTX_ASSERT(hpts); - /* Reset the ticks to run and time if we need too */ - interum_tick = tcp_gethptstick(&tv); - if (interum_tick != hpts->p_curtick) { - /* Save off the new time we execute to */ - *ctick = tv; - hpts->p_curtick = interum_tick; - cts = tcp_tv_to_usectick(&tv); - hpts->p_cur_slot = hpts_tick(hpts, 1); - ticks_to_run = tcp_hpts_est_run(hpts); - } - if (ticks_to_run == -1) { - goto no_run; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); - } if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); +#ifndef VIMAGE + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there - * was not any + * was not any (i.e. if ticks_to_run == 1, no delay). 
*/ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; HPTS_MTX_ASSERT(hpts); - while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { + while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { /* For debugging */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); - } hpts->p_inp = inp; paced_cnt++; - if (hpts->p_cur_slot != inp->inp_hptsslot) { +#ifdef INVARIANTS + if (hpts->p_runningtick != inp->inp_hptsslot) { panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", - hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); + hpts, inp, hpts->p_runningtick, inp->inp_hptsslot); } +#endif /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } - hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); - if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { + hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0); + if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; @@ -1467,25 +1504,36 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) if (inp->inp_hpts_request) { /* * This guy is deferred out further in time - * then our wheel had on it. Push him back - * on the wheel. + * then our wheel had available on it. + * Push him back on the wheel or run it + * depending. */ - int32_t remaining_slots; - + uint32_t maxticks, last_tick, remaining_slots; + remaining_slots = ticks_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* - * Keep INVARIANTS happy by clearing - * the flag + * How far out can we go? 
*/ - tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); + maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick); + if (maxticks >= inp->inp_hpts_request) { + /* we can place it finally to be processed */ + inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request); + inp->inp_hpts_request = 0; + } else { + /* Work off some more time */ + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request-= maxticks; + } + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1); hpts->p_inp = NULL; continue; } inp->inp_hpts_request = 0; + /* Fall through we will so do it now */ } /* - * We clear the hpts flag here after dealing with + * We clear the hpts flag here after dealing with * remaining slots. This way anyone looking with the * TCB lock will see its on the hpts until just * before we unlock. @@ -1495,23 +1543,20 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); hpts->p_inp = NULL; continue; } - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { -out_now: + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || + (inp->inp_flags2 & INP_FREED)) { + out_now: #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); hpts->p_inp = NULL; continue; } @@ -1539,16 +1584,14 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) */ tcp_set_hpts(inp); } - if (out_newts_every_tcb) { - struct timeval sv; - - if (out_ts_percision) - microuptime(&sv); - else - getmicrouptime(&sv); - cts = tcp_tv_to_usectick(&sv); - } +#ifdef VIMAGE CURVNET_SET(inp->inp_vnet); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif + /* Lets do any 
logging that we might want to */ + if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { + tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i); + } /* * There is a hole here, we get the refcnt on the * inp so it will still be preserved but to make @@ -1560,19 +1603,23 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx before tcp-output:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - inp->inp_hpts_calls = 1; - if (tp->t_fb->tfb_tcp_output_wtime != NULL) { - error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); - } else { - error = tp->t_fb->tfb_tcp_output(tp); + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); + if (error) { + /* The input killed the connection */ + goto skip_pacing; + } } + inp->inp_hpts_calls = 1; + error = tp->t_fb->tfb_tcp_output(tp); + inp->inp_hpts_calls = 0; if (ninp && ninp->inp_ppcb) { /* * If we have a nxt inp, see if we can @@ -1609,74 +1656,112 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) prefetch_tp = 1; } INP_WUNLOCK(inp); - INP_UNLOCK_ASSERT(inp); + skip_pacing: +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); +#endif + INP_UNLOCK_ASSERT(inp); #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); hpts->p_inp = NULL; } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; - hpts->p_cur_slot++; - if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { - hpts->p_cur_slot = 0; + hpts->p_runningtick++; + if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) { + hpts->p_runningtick = 0; } } +#ifndef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); +#endif no_one: 
HPTS_MTX_ASSERT(hpts); - hpts->p_prevtick = hpts->p_curtick; hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). */ - /* Re-run any input that may be there */ - (void)tcp_gethptstick(&tv); - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); - } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts, hpts->p_on_inqueue_cnt); } #endif - tick_now = tcp_gethptstick(&tv); - if (SEQ_GT(tick_now, hpts->p_prevtick)) { - struct timeval res; - - /* Did we really spend a full tick or more in here? */ - timersub(&tv, ctick, &res); - if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { + hpts->p_prev_slot = hpts->p_cur_slot; + hpts->p_lasttick = hpts->p_curtick; + if (loop_cnt > max_pacer_loops) { + /* + * Something is serious slow we have + * looped through processing the wheel + * and by the time we cleared the + * needs to run max_pacer_loops time + * we still needed to run. That means + * the system is hopelessly behind and + * can never catch up :( + * + * We will just lie to this thread + * and let it thing p_curtick is + * correct. When it next awakens + * it will find itself further behind. + */ + counter_u64_add(hpts_hopelessly_behind, 1); + goto no_run; + } + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if ((wrap_loop_cnt < 2) && + (hpts->p_lasttick != hpts->p_curtick)) { + counter_u64_add(hpts_loops, 1); + loop_cnt++; + goto again; + } +no_run: + /* + * Set flag to tell that we are done for + * any slot input that happens during + * input. + */ + hpts->p_wheel_complete = 1; + /* + * Run any input that may be there not covered + * in running data. + */ + if (!TAILQ_EMPTY(&hpts->p_input)) { + tcp_input_data(hpts, &tv); + /* + * Now did we spend too long running + * input and need to run more ticks? 
+ */ + KASSERT(hpts->p_prev_slot == hpts->p_cur_slot, + ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, + hpts->p_prev_slot, hpts->p_cur_slot)); + KASSERT(hpts->p_lasttick == hpts->p_curtick, + ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, + hpts->p_lasttick, hpts->p_curtick)); + hpts->p_curtick = tcp_gethptstick(&tv); + if (hpts->p_lasttick != hpts->p_curtick) { counter_u64_add(hpts_loops, 1); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); - } - *ctick = res; - hpts->p_curtick = tick_now; + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } } -no_run: { uint32_t t = 0, i, fnd = 0; - if (hpts->p_on_queue_cnt) { - - + if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { /* * Find next slot that is occupied and use that to * be the sleep time. */ - for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { + for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { fnd = 1; break; @@ -1684,26 +1769,22 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) t = (t + 1) % NUM_OF_HPTSI_SLOTS; } if (fnd) { - hpts->p_hpts_sleep_time = i; + hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); } else { - counter_u64_add(back_tosleep, 1); #ifdef INVARIANTS - panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt); + panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt); #endif + counter_u64_add(back_tosleep, 1); hpts->p_on_queue_cnt = 0; goto non_found; } - t++; + } else if (wrap_loop_cnt >= 2) { + /* Special case handling */ + hpts->p_hpts_sleep_time = tcp_min_hptsi_time; } else { - /* No one on the wheel sleep for all but 2 slots */ -non_found: - if (hpts_sleep_max == 0) - hpts_sleep_max = 1; - hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); - t = 0; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * 
HPTS_TICKS_PER_USEC)); + /* No one on the wheel sleep for all but 400 slots or sleep max */ + non_found: + hpts->p_hpts_sleep_time = hpts_sleep_max; } } } @@ -1746,33 +1827,29 @@ tcp_hpts_thread(void *ctx) mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input */ - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); callout_stop(&hpts->co); } else { /* Timed out */ if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); mtx_unlock(&hpts->p_mtx); return; } callout_deactivate(&hpts->co); - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); } + hpts->p_hpts_wake_scheduled = 0; hpts->p_hpts_active = 1; - (void)tcp_gethptstick(&tv); - tcp_hptsi(hpts, &tv); + tcp_hptsi(hpts); HPTS_MTX_ASSERT(hpts); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) { + hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = tcp_min_hptsi_time; hpts->p_on_min_sleep = 1; } else { /* Clear the min sleep flag */ + hpts->overidden_sleep = 0; hpts->p_on_min_sleep = 0; } hpts->p_hpts_active = 0; @@ -1809,9 +1886,11 @@ tcp_init_hptsi(void *st) tcp_pace.rp_proc = NULL; tcp_pace.rp_num_hptss = ncpus; + hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); - + combined_wheel_wrap = counter_u64_alloc(M_WAITOK); + wheel_wrap = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; @@ -1850,7 +1929,7 @@ tcp_init_hptsi(void *st) OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_ADD_U16(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, 
&hpts->p_hpts_active, 0, @@ -1859,29 +1938,23 @@ tcp_init_hptsi(void *st) SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, - "What the current slot is if active"); + "What the current running pacers goal"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "runtick", CTLFLAG_RD, + &hpts->p_runningtick, 0, + "What the running pacers current slot is"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curtick", CTLFLAG_RD, &hpts->p_curtick, 0, - "What the current tick on if active"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "logsize", CTLFLAG_RD, - &hpts->p_logsize, 0, - "Hpts logging buffer size"); - hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; + "What the running pacers last tick mapped to the wheel was"); + hpts->p_hpts_sleep_time = hpts_sleep_max; hpts->p_num = i; - hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_prevtick -= 1; - hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); hpts->p_cpu = 0xffff; - hpts->p_nxt_slot = 1; - hpts->p_logsize = tcp_hpts_logging_size; - if (hpts->p_logsize) { - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); - } + hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1); callout_init(&hpts->co, 1); } diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index ea15bc1ab320..293daa2cae3d 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -45,112 +45,80 @@ TAILQ_HEAD(hptsh, inpcb); /* Number of useconds in a hpts tick */ #define HPTS_TICKS_PER_USEC 10 -#define HPTS_MS_TO_SLOTS(x) (x * 100) +#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 -#define 
DEFAULT_HPTS_LOG 3072 - -/* - * Log flags consist of - * 7f 7f 1 1 bits - * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE - * - * So for example cpu 10, number 10 would with - * input active would show up as: - * p_flags = 0001010 0001010 1 0 - * - * p_flags = 0x142a - */ -#define HPTS_HPTS_ACTIVE 0x01 -#define HPTS_INPUT_ACTIVE 0x02 - -#define HPTSLOG_IMMEDIATE 1 -#define HPTSLOG_INSERT_NORMAL 2 -#define HPTSLOG_INSERT_SLEEPER 3 -#define HPTSLOG_SLEEP_AFTER 4 -#define HPTSLOG_SLEEP_BEFORE 5 -#define HPTSLOG_INSERTED 6 -#define HPTSLOG_WAKEUP_HPTS 7 -#define HPTSLOG_SETTORUN 8 -#define HPTSLOG_HPTSI 9 -#define HPTSLOG_TOLONG 10 -#define HPTSLOG_AWAKENS 11 -#define HPTSLOG_TIMESOUT 12 -#define HPTSLOG_SLEEPSET 13 -#define HPTSLOG_WAKEUP_INPUT 14 -#define HPTSLOG_RESCHEDULE 15 -#define HPTSLOG_AWAKE 16 -#define HPTSLOG_INP_DONE 17 - -struct hpts_log { - struct inpcb *inp; - int32_t event; - uint32_t cts; - int32_t line; - uint32_t ticknow; - uint32_t t_paceslot; - uint32_t t_hptsreq; - uint32_t p_curtick; - uint32_t p_prevtick; - uint32_t slot_req; - uint32_t p_on_queue_cnt; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t p_hpts_sleep_time; - uint16_t p_flags; - uint8_t p_onhpts; - uint8_t p_oninput; - uint8_t is_notempty; -}; struct hpts_diag { - uint32_t p_hpts_active; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t slot_req; - uint32_t inp_hptsslot; - uint32_t slot_now; - uint32_t have_slept; - uint32_t hpts_sleep_time; - uint32_t yet_to_sleep; - uint32_t need_new_to; - int32_t co_ret; - uint8_t p_on_min_sleep; + uint32_t p_hpts_active; /* bbr->flex7 x */ + uint32_t p_nxt_slot; /* bbr->flex1 x */ + uint32_t p_cur_slot; /* bbr->flex2 x */ + uint32_t p_prev_slot; /* bbr->delivered */ + uint32_t p_runningtick; /* bbr->inflight */ + uint32_t slot_req; /* bbr->flex3 x */ + uint32_t inp_hptsslot; /* bbr->flex4 x */ + uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t have_slept; /* bbr->epoch x */ + uint32_t hpts_sleep_time; /* bbr->applimited x */ + 
uint32_t yet_to_sleep; /* bbr->lt_epoch x */ + uint32_t need_new_to; /* bbr->flex6 x */ + uint32_t wheel_tick; /* bbr->bw_inuse x */ + uint32_t maxticks; /* bbr->delRate x */ + uint32_t wheel_cts; /* bbr->rttProp x */ + int32_t co_ret; /* bbr->pkts_out x */ + uint32_t p_curtick; /* upper bbr->cur_del_rate */ + uint32_t p_lasttick; /* lower bbr->cur_del_rate */ + uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; +/* Magic flags to tell whats cooking on the pacing wheel */ +#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */ +#define PACE_TMR_RACK 0x02 /* RACK timer running */ +#define PACE_TMR_TLP 0x04 /* TLP timer running */ +#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ +#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ +#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ +#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ +#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ - uint32_t p_hpts_active; /* Flag that says hpts is awake */ - uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ - uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ + uint16_t p_hpts_active; /* Flag that says hpts is awake */ + uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */ + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ + uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ + uint32_t p_runningtick; /* Current tick we are at if we are running */ + uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on. 
*/ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t enobuf_cnt; - uint16_t p_log_at; + uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ - p_log_wrapped :1, /* boolean */ - p_on_min_sleep:1; /* boolean */ - uint8_t p_fill; + p_on_min_sleep:1, /* boolean */ + p_avail:6; + uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; struct hptsh p_input; /* For the tcp-input runner */ /* Hptsi wheel */ struct hptsh *p_hptss; - struct hpts_log *p_log; - uint32_t p_logsize; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ uint32_t hit_no_enobuf; uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ + uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ + uint32_t saved_lasttick; /* for logging */ + uint32_t saved_curtick; /* for logging */ + uint32_t saved_curslot; /* for logging */ + uint32_t saved_prev_slot; /* for logging */ uint32_t p_delayed_by; /* How much were we delayed by */ /* Cache line 0x80 */ struct sysctl_ctx_list hpts_ctx; @@ -236,13 +204,9 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line); -#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) +__tcp_queue_to_input(struct inpcb *inp, int32_t line); +#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); diff --git a/sys/netinet/tcp_log_buf.h 
b/sys/netinet/tcp_log_buf.h index 0662d1b5d4ad..e0575a43e3dc 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -175,7 +175,7 @@ enum tcp_log_events { TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER,/* Detected reorder 7 */ - TCP_LOG_PACER, /* Pacer sending a packet 8 */ + TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ @@ -194,31 +194,38 @@ enum tcp_log_events { BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ - BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ - BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ + BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ - UNUSED_32, /* Unused 32 */ - UNUSED_33, /* Unused 33 */ + BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ + BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ - BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ + BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ - BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ + BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W 
reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ + BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ - TCP_LOG_END /* End (keep at end) 51 */ + TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */ + BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ + BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ + TCP_LOG_CONNEND, /* End of connection 54 */ + TCP_LOG_LRO, /* LRO entry 55 */ + TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */ + TCP_SAD_DETECTION, /* Sack Attack Detection 57 */ + TCP_LOG_END /* End (keep at end) 58 */ }; enum tcp_log_states { @@ -275,8 +282,8 @@ struct tcp_log_dev_log_queue { #ifdef _KERNEL -#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 -#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 +#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000 +#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index d95a0edea790..e69f0d5fb5c5 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2016-2019 Netflix, Inc. + * Copyright (c) 2016 + * Netflix Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -44,12 +45,16 @@ __FBSDID("$FreeBSD$"); #include #include #include /* for proc0 declaration */ +#ifdef NETFLIX_STATS +#include +#endif #include #include #include #include +#include #ifdef NETFLIX_STATS -#include +#include /* Must come after qmath.h and tree.h */ #endif #include #include @@ -74,8 +79,8 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #define TCPOUTFLAGS +#include #include #include #include @@ -84,9 +89,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#ifdef NETFLIX_CWV -#include -#endif #include #ifdef TCPDEBUG #include @@ -126,6 +128,10 @@ uma_zone_t rack_pcb_zone; struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; +#ifndef TCPHPTS +fatal error missing option TCPHSTS in the build; +#endif + #define CUM_ACKED 1 #define SACKED 2 @@ -178,6 +184,9 @@ static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? 
*/ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; +static int32_t rack_map_entries_limit = 1024; +static int32_t rack_map_split_limit = 256; + /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -202,7 +211,6 @@ static int32_t rack_always_send_oldest = 0; static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; -static uint32_t rack_map_split_limit = 0; /* unlimited by default */ /* Rack specific counters */ counter_u64_t rack_badfr; @@ -228,6 +236,7 @@ counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; +counter_u64_t rack_to_alloc_limited; counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; @@ -248,12 +257,21 @@ counter_u64_t rack_progress_drops; counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; +/* + * This was originally defined in tcp_timer.c, but is now reproduced here given + * the unification of the SYN and non-SYN retransmit timer exponents combined + * with wanting to retain previous behaviour for previously deployed stack + * versions. 
+ */ +int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; + static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, struct tcpopt *to, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int rack_process_data(struct mbuf *m, struct tcphdr *th, @@ -351,14 +369,13 @@ static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp); +static void rack_do_drop(struct mbuf *m, struct tcpcb *tp); static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t rstreason, int32_t tlen); + struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -449,6 +466,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); + counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); @@ -468,6 +486,18 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) static void rack_init_sysctls() { + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "map_limit", CTLFLAG_RW, + &rack_map_entries_limit , 1024, + "Is there a limit on how big the sendmap can grow? 
"); + + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "map_splitlimit", CTLFLAG_RW, + &rack_map_split_limit , 256, + "Is there a limit on how much splitting a peer can do?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, @@ -628,11 +658,6 @@ rack_init_sysctls() OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "split_limit", CTLFLAG_RW, - &rack_map_split_limit, 0, - "Is there a limit on the number of map split entries (0=unlimited)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, @@ -769,6 +794,12 @@ rack_init_sysctls() OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total allocations done from emergency cache"); + rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "alloc_limited", CTLFLAG_RD, + &rack_to_alloc_limited, + "Total allocations dropped due to limit"); rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -859,6 +890,7 @@ rack_init_sysctls() static inline int32_t rack_progress_timeout_check(struct tcpcb *tp) { +#ifdef NETFLIX_PROGRESS if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { /* @@ -869,13 +901,12 @@ rack_progress_timeout_check(struct tcpcb *tp) struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; counter_u64_add(rack_progress_drops, 1); -#ifdef NETFLIX_STATS TCPSTAT_INC(tcps_progdrops); -#endif rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); return (1); } } +#endif return (0); } @@ -962,6 +993,7 @@ rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) union 
tcp_log_stackspecific log; struct timeval tv; + memset(&log, 0, sizeof(log)); /* Convert our ms to a microsecond */ log.u_bbr.flex1 = rtt * 1000; log.u_bbr.timeStamp = tcp_get_usecs(&tv); @@ -1021,6 +1053,8 @@ rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; @@ -1127,6 +1161,8 @@ rack_counter_destroy() counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_to_alloc); + counter_u64_free(rack_to_alloc_limited); + counter_u64_free(rack_split_limited); counter_u64_free(rack_find_high); counter_u64_free(rack_runt_sacks); counter_u64_free(rack_enter_tlp_calc); @@ -1146,9 +1182,8 @@ rack_alloc(struct tcp_rack *rack) rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { -alloc_done: - counter_u64_add(rack_to_alloc, 1); rack->r_ctl.rc_num_maps_alloced++; + counter_u64_add(rack_to_alloc, 1); return (rsm); } if (rack->rc_free_cnt) { @@ -1156,11 +1191,26 @@ rack_alloc(struct tcp_rack *rack) rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; - goto alloc_done; + return (rsm); } return (NULL); } +static struct rack_sendmap * +rack_alloc_full_limit(struct tcp_rack *rack) +{ + if ((rack_map_entries_limit > 0) && + (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { + counter_u64_add(rack_to_alloc_limited, 1); + if (!rack->alloc_limit_reported) { + rack->alloc_limit_reported = 1; + counter_u64_add(rack_alloc_limited_conns, 1); + } + return (NULL); + } + return (rack_alloc(rack)); +} + /* wrapper to allocate a sendmap entry, subject to a specific limit */ static struct rack_sendmap * rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) @@ -1196,7 +1246,6 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) /* currently 
there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } - rack->r_ctl.rc_num_maps_alloced--; if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_next == rsm) @@ -1206,9 +1255,11 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 0, sizeof(struct rack_sendmap)); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); + rsm->r_limit_type = 0; rack->rc_free_cnt++; return; } + rack->r_ctl.rc_num_maps_alloced--; uma_zfree(rack_zone, rsm); } @@ -1222,11 +1273,9 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui #ifdef NETFLIX_STATS int32_t gput; #endif -#ifdef NETFLIX_CWV - u_long old_cwnd = tp->snd_cwnd; -#endif INP_WLOCK_ASSERT(tp->t_inpcb); + tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { @@ -1264,7 +1313,6 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; -#ifdef NETFLIX_CWV if (tp->t_maxpeakrate) { /* * We update t_peakrate_thr. 
This gives us roughly @@ -1272,7 +1320,6 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui */ tcp_update_peakrate_thr(tp); } -#endif } #endif if (tp->snd_cwnd > tp->snd_ssthresh) { @@ -1298,39 +1345,10 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; } -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - /* - * Per RFC 7661: The behaviour in the non-validated phase is - * specified as: o A sender determines whether to increase - * the cwnd based upon whether it is cwnd-limited (see - * Section 4.5.3): * A sender that is cwnd-limited MAY use - * the standard TCP method to increase cwnd (i.e., the - * standard method permits a TCP sender that fully utilises - * the cwnd to increase the cwnd each time it receives an - * ACK). * A sender that is not cwnd-limited MUST NOT - * increase the cwnd when ACK packets are received in this - * phase (i.e., needs to avoid growing the cwnd when it has - * not recently sent using the current size of cwnd). - */ - if ((tp->snd_cwnd > old_cwnd) && - (tp->cwv_cwnd_valid == 0) && - (!(tp->ccv->flags & CCF_CWND_LIMITED))) { - tp->snd_cwnd = old_cwnd; - } - /* Try to update pipeAck and NCWV state */ - if (TCPS_HAVEESTABLISHED(tp->t_state) && - !IN_RECOVERY(tp->t_flags)) { - uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); - - tcp_newcwv_update_pipeack(tp, data); - } - } /* we enforce max peak rate if it is set. 
*/ if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { tp->snd_cwnd = tp->t_peakrate_thr; } -#endif } static void @@ -1379,16 +1397,8 @@ rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; } + tp->snd_recover = tp->snd_una; EXIT_RECOVERY(tp->t_flags); - - -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - (tp->snd_cwv.in_recovery)) - tcp_newcwv_end_recovery(tp); - } -#endif } static void @@ -1450,16 +1460,6 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { - tcp_newcwv_enter_recovery(tp); - } - if (type == CC_RTO) { - tcp_newcwv_reset(tp); - } - } -#endif } @@ -1479,11 +1479,21 @@ rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); - if (tp->snd_cwnd == 1) - i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ - else - i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); - + if (V_tcp_initcwnd_segments) + i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), + max(2 * tp->t_maxseg, 14600)); + else if (V_tcp_do_rfc3390) + i_cwnd = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); + else { + /* Per RFC5681 Section 3.1 */ + if (tp->t_maxseg > 2190) + i_cwnd = 2 * tp->t_maxseg; + else if (tp->t_maxseg > 1095) + i_cwnd = 3 * tp->t_maxseg; + else + i_cwnd = 4 * tp->t_maxseg; + } if (reduce_largest) { /* * Do we reduce the largest cwnd to make @@ -1549,8 +1559,7 @@ rack_do_drop(struct mbuf *m, struct tcpcb *tp) } static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, - int32_t rstreason, int32_t tlen) +rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp != NULL) { tcp_dropwithreset(m, th, tp, 
tlen, rstreason); @@ -1736,7 +1745,7 @@ rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, * TCB is still valid and locked. */ static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; @@ -1778,17 +1787,6 @@ rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tc TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } - /* - * DSACK - add SACK block for dropped range - */ - if (tp->t_flags & TF_SACK_PERMIT) { - tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); - /* - * ACK now, as the next in-sequence segment - * will clear the DSACK block again - */ - tp->t_flags |= TF_ACKNOW; - } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; @@ -2124,8 +2122,6 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } - if (tp->t_state < TCPS_ESTABLISHED) - goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing on the send map */ @@ -2184,6 +2180,12 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) */ goto activate_rxt; } + if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) { + /* + * Peer collapsed rwnd, don't do TLP. + */ + goto activate_rxt; + } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. 
*/ @@ -2288,7 +2290,9 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int /* A previous call is already set up */ return; } - if (tp->t_state == TCPS_CLOSED) { + + if ((tp->t_state == TCPS_CLOSED) || + (tp->t_state == TCPS_LISTEN)) { return; } stopped = rack->rc_tmr_stopped; @@ -2307,8 +2311,8 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int * We are still left on the hpts when the to goes * it will be for output. */ - if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) - slot = cts - rack->r_ctl.rc_last_output_to; + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) + slot = rack->r_ctl.rc_last_output_to - cts; else slot = 1; } @@ -2330,7 +2334,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int } hpts_timeout = rack_timer_start(tp, rack, cts); if (tp->t_flags & TF_DELACK) { - delayed_ack = TICKS_2_MSEC(tcp_delacktime); + delayed_ack = tcp_delacktime; rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || @@ -2487,6 +2491,43 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) return (0); } +static struct rack_sendmap * +rack_merge_rsm(struct tcp_rack *rack, + struct rack_sendmap *l_rsm, + struct rack_sendmap *r_rsm) +{ + /* + * We are merging two ack'd RSM's, + * the l_rsm is on the left (lower seq + * values) and the r_rsm is on the right + * (higher seq value). The simplest way + * to merge these is to move the right + * one into the left. I don't think there + * is any reason we need to try to find + * the oldest (or last oldest retransmitted). 
+ */ + l_rsm->r_end = r_rsm->r_end; + if (r_rsm->r_rtr_bytes) + l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; + if (r_rsm->r_in_tmap) { + /* This really should not happen */ + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); + } + /* Now the flags */ + if (r_rsm->r_flags & RACK_HAS_FIN) + l_rsm->r_flags |= RACK_HAS_FIN; + if (r_rsm->r_flags & RACK_TLP) + l_rsm->r_flags |= RACK_TLP; + TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next); + if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { + /* Transfer the split limit to the map we free */ + r_rsm->r_limit_type = l_rsm->r_limit_type; + l_rsm->r_limit_type = 0; + } + rack_free(rack, r_rsm); + return(l_rsm); +} + /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal rack_output() will then @@ -2590,7 +2631,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) int32_t idx; struct rack_sendmap *nrsm; - nrsm = rack_alloc(rack); + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * No memory to split, we will just exit and punt @@ -2937,7 +2978,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) - rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); + rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, @@ -3281,7 +3322,7 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ - nrsm = rack_alloc(rack); + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * We can't get memory, so lets not proceed. 
@@ -3415,9 +3456,6 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, * Hmm out of memory and the tcb got destroyed while * we tried to wait. */ -#ifdef INVARIANTS - panic("Out of memory when we should not be rack:%p", rack); -#endif return; } if (th_flags & TH_FIN) { @@ -3428,15 +3466,8 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; - if (th_flags & TH_SYN) { - /* The data space is one beyond snd_una */ - rsm->r_start = seq_out + 1; - rsm->r_end = rsm->r_start + (len - 1); - } else { - /* Normal case */ - rsm->r_start = seq_out; - rsm->r_end = rsm->r_start + len; - } + rsm->r_start = seq_out; + rsm->r_end = rsm->r_start + len; rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); @@ -3486,11 +3517,8 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, * Ok we must split off the front and then let the * update do the rest */ - nrsm = rack_alloc(rack); + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { -#ifdef INVARIANTS - panic("Ran out of memory that was preallocated? rack:%p", rack); -#endif rack_update_rsm(tp, rack, rsm, ts); return; } @@ -3908,6 +3936,14 @@ rack_log_sack_passed(struct tcpcb *tp, if (nrsm->r_flags & RACK_ACKED) { /* Skip ack'd segments */ continue; + } + if (nrsm->r_flags & RACK_SACK_PASSED) { + /* + * We found one that is already marked + * passed, we have been here before and + * so all others below this are marked. + */ + break; } idx = nrsm->r_rtr_cnt - 1; if (ts == nrsm->r_tim_lastsent[idx]) { @@ -4114,6 +4150,26 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack rsm->r_in_tmap = 0; } out: + if (rsm && (rsm->r_flags & RACK_ACKED)) { + /* + * Now can we merge this newly acked + * block with either the previous or + * next block? 
+ */ + nrsm = TAILQ_NEXT(rsm, r_next); + if (nrsm && + (nrsm->r_flags & RACK_ACKED)) { + /* yep this and next can be merged */ + rsm = rack_merge_rsm(rack, rsm, nrsm); + } + /* Now what about the previous? */ + nrsm = TAILQ_PREV(rsm, rack_head, r_next); + if (nrsm && + (nrsm->r_flags & RACK_ACKED)) { + /* yep the previous and this can be merged */ + rsm = rack_merge_rsm(rack, nrsm, rsm); + } + } if (used_ref == 0) { counter_u64_add(rack_sack_proc_all, 1); } else { @@ -4353,16 +4409,13 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) } sack_blocks[num_sack_blks] = sack; num_sack_blks++; -#ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * Its a D-SACK block. */ - tcp_record_dsack(sack.start, sack.end); -#endif +/* tcp_record_dsack(sack.start, sack.end); */ } - } if (num_sack_blks == 0) goto out; @@ -4371,7 +4424,9 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) * just one pass. */ if (rack_use_sack_filter) { - num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); + num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, + num_sack_blks, th->th_ack); + ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); } if (num_sack_blks < 2) { goto do_sack_work; @@ -4620,8 +4675,9 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (0); } if (rack->r_ctl.rc_early_recovery) { - if (IN_FASTRECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover) && + (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); @@ -4648,8 +4704,9 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, sowwakeup_locked(so); m_freem(mfree); if (rack->r_ctl.rc_early_recovery == 0) { - if (IN_FASTRECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if 
(IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover) && + (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); @@ -4707,7 +4764,11 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, * send garbage on first SYN. */ int32_t nsegs; +#ifdef TCP_RFC7413 int32_t tfo_syn; +#else +#define tfo_syn (FALSE) +#endif struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -4816,8 +4877,10 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ +#ifdef TCP_RFC7413 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && - IS_FASTOPEN(tp->t_flags)); + (tp->t_flags & TF_FASTOPEN)); +#endif if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; @@ -5024,8 +5087,9 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, /* Clean receiver SACK report if present */ - if (tp->rcv_numsacks) - tcp_clean_sackreport(tp); +/* if (tp->rcv_numsacks) + tcp_clean_sackreport(tp); +*/ TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* @@ -5284,8 +5348,6 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { - int tfo_partial = 0; - TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC @@ -5298,20 +5360,11 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); - /* - * If not all the data that was sent in the TFO SYN - * has been acked, resend the remainder right away. - */ - if (IS_FASTOPEN(tp->t_flags) && - (tp->snd_una != tp->snd_max)) { - tp->snd_nxt = th->th_ack; - tfo_partial = 1; - } /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. 
*/ - if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { + if (DELAY_ACK(tp, tlen) && tlen != 0) { rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; @@ -5320,26 +5373,10 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->t_flags |= TF_ACKNOW; } - if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && - V_tcp_do_ecn) { + if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } - if (SEQ_GT(th->th_ack, tp->snd_una)) { - /* - * We advance snd_una for the - * fast open case. If th_ack is - * acknowledging data beyond - * snd_una we can't just call - * ack-processing since the - * data stream in our send-map - * will start at snd_una + 1 (one - * beyond the SYN). If its just - * equal we don't need to do that - * and there is no send_map. - */ - tp->snd_una++; - } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 @@ -5423,7 +5460,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* @@ -5447,13 +5484,13 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } - if (IS_FASTOPEN(tp->t_flags)) { +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { /* - * When a TFO connection is in SYN_RECEIVED, the - * only valid packets are the initial SYN, a - * retransmit/copy of the initial SYN (possibly with - * a subset of the original data), a valid ACK, a - * FIN, or a RST. + * When a TFO connection is in SYN_RECEIVED, the only valid + * packets are the initial SYN, a retransmit/copy of the + * initial SYN (possibly with a subset of the original + * data), a valid ACK, a FIN, or a RST. 
*/ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); @@ -5474,8 +5511,17 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, return (0); } } +#endif if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. @@ -5520,16 +5566,18 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } - tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { - if (IS_FASTOPEN(tp->t_flags)) { +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { + tp->snd_wnd = tiwin; cc_conn_init(tp); } +#endif return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } @@ -5539,22 +5587,13 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = tiwin; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; - if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { - tcp_fastopen_decrement_counter(tp->t_tfo_pending); - tp->t_tfo_pending = NULL; - - /* - * Account for the ACK of our SYN prior to - * regular ACK processing below. 
- */ - tp->snd_una++; - } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; @@ -5562,13 +5601,25 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); +#ifdef TCP_RFC7413 + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to regular + * ACK processing below. + */ + tp->snd_una++; + } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ - if (!IS_FASTOPEN(tp->t_flags)) + if (!(tp->t_flags & TF_FASTOPEN)) +#endif cc_conn_init(tp); } /* @@ -5576,7 +5627,7 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * not, do so now to pass queued data to user. 
*/ if (tlen == 0 && (thflags & TH_FIN) == 0) - (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, + (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { @@ -5836,7 +5887,7 @@ static int rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { - struct tcp_rack *rack; + struct tcp_rack *rack; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -6353,7 +6404,6 @@ rack_init(struct tcpcb *tp) rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; rack->r_ctl.rc_prr_inc_var = rack_inc_var; - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; @@ -6375,6 +6425,8 @@ rack_init(struct tcpcb *tp) TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } + rack_stop_all_timers(tp); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); return (0); } @@ -6431,6 +6483,8 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } + /* Make sure snd_nxt is correctly set */ + tp->snd_nxt = tp->snd_max; } static void @@ -6473,9 +6527,6 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) case TCPS_CLOSED: case TCPS_TIME_WAIT: default: -#ifdef INVARIANTS - panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); -#endif break; }; } @@ -6585,10 +6636,6 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * allow the tcbinfo to be in either locked or unlocked, as the * caller may have unnecessarily acquired a lock due to a race. 
*/ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tp->t_state != TCPS_ESTABLISHED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); @@ -6600,37 +6647,17 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, &log, true); } - if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { - way_out = 4; - goto done_with_input; - } - /* - * If a segment with the ACK-bit set arrives in the SYN-SENT state - * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. - */ - if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && - (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); - return; - } /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { -#ifdef NETFLIX_CWV - if ((tp->cwv_enabled) && - ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { - tcp_newcwv_nvp_closedown(tp); - } else -#endif - if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { + if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { counter_u64_add(rack_input_idle_reduces, 1); rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 
1 :0)); @@ -6639,14 +6666,6 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) - tcp_newcwv_nvp_closedown(tp); - } -#endif /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. @@ -6737,22 +6756,6 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; - if (IS_FASTOPEN(tp->t_flags)) { - if (to.to_flags & TOF_FASTOPEN) { - uint16_t mss; - - if (to.to_flags & TOF_MSS) - mss = to.to_mss; - else - if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) - mss = TCP6_MSS; - else - mss = TCP_MSS; - tcp_fastopen_update_cache(tp, mss, - to.to_tfo_len, to.to_tfo_cookie); - } else - tcp_fastopen_disable_path(tp); - } } /* * At this point we are at the initial call. Here we decide @@ -6769,7 +6772,6 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); - rack_stop_all_timers(tp); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } /* @@ -6801,24 +6803,6 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_rack_xmit_timer_commit(rack, tp); - if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && - (rack->rc_in_persist == 0)){ - /* - * The peer shrunk its window on us to the point - * where we have sent too much. The only thing - * we can do here is stop any timers and - * enter persist. We most likely lost the last - * bytes we sent but oh well, we will have to - * retransmit them after the peer is caught up. 
- */ - if (rack->rc_inp->inp_in_hpts) - tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - rack_timer_cancel(tp, rack, cts, __LINE__); - rack_enter_persist(tp, rack, cts); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); - way_out = 3; - goto done_with_input; - } if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { did_out = 1; @@ -6848,7 +6832,6 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } - done_with_input: rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; @@ -6871,7 +6854,7 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, #ifdef RSS struct tcp_function_block *tfb; struct tcp_rack *rack; - struct epoch_tracker et; + struct inpcb *inp; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_state == 0) { @@ -6879,11 +6862,9 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * Initial input (ACK to SYN-ACK etc)lets go ahead and get * it processed */ - INP_INFO_RLOCK_ET(&V_tcbinfo, et); tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return; } tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); @@ -6959,13 +6940,17 @@ rack_output(struct tcpcb *tp) #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif +#ifdef NETFLIX_TCP_O_UDP struct udphdr *udp = NULL; +#endif struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; - uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; - unsigned ipoptlen, optlen, hdrlen, ulen=0; + unsigned ipoptlen, optlen, hdrlen; +#ifdef NETFLIX_TCP_O_UDP + unsigned ulen; +#endif uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -7004,6 +6989,18 @@ rack_output(struct tcpcb *tp) if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif + +#ifdef TCP_RFC7413 + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * 
SYN|ACK and those sent by the retransmit timer. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ + (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + return (0); +#endif #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ @@ -7045,17 +7042,6 @@ rack_output(struct tcpcb *tp) } rack->r_wanted_output = 0; rack->r_timer_override = 0; - /* - * For TFO connections in SYN_SENT or SYN_RECEIVED, - * only allow the initial SYN or SYN|ACK and those sent - * by the retransmit timer. - */ - if (IS_FASTOPEN(tp->t_flags) && - ((tp->t_state == TCPS_SYN_RECEIVED) || - (tp->t_state == TCPS_SYN_SENT)) && - SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ - (tp->t_rxtshift == 0)) /* not a retransmit */ - return (0); /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls @@ -7063,14 +7049,6 @@ rack_output(struct tcpcb *tp) * further. 
*/ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) - tcp_newcwv_nvp_closedown(tp); - } else -#endif if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) rack_cc_after_idle(tp, @@ -7141,10 +7119,12 @@ rack_output(struct tcpcb *tp) tlen = rsm->r_end - rsm->r_start; if (tlen > tp->t_maxseg) tlen = tp->t_maxseg; - KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); +#ifdef INVARIANTS + if (SEQ_GT(tp->snd_una, rsm->r_start)) { + panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", + tp, rack, tp->snd_una, rsm, rsm->r_start); + } +#endif sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; @@ -7155,14 +7135,12 @@ rack_output(struct tcpcb *tp) len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; - KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (len >= tp->t_maxseg) { len = tp->t_maxseg; } + KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", + __func__, sb_offset)); } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { long tlen; @@ -7187,10 +7165,6 @@ rack_output(struct tcpcb *tp) } #endif tlen = rsm->r_end - rsm->r_start; - KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (tlen > rack->r_ctl.rc_prr_sndcnt) { len = rack->r_ctl.rc_prr_sndcnt; @@ -7212,6 +7186,8 @@ rack_output(struct tcpcb *tp) goto just_return_nolock; } } + 
KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", + __func__, sb_offset)); if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; @@ -7236,6 +7212,20 @@ rack_output(struct tcpcb *tp) /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif + /* + * Enforce a connection sendmap count limit if set + * as long as we are not retransmiting. + */ + if ((rsm == NULL) && + (rack_map_entries_limit > 0) && + (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { + counter_u64_add(rack_to_alloc_limited, 1); + if (!rack->alloc_limit_reported) { + rack->alloc_limit_reported = 1; + counter_u64_add(rack_alloc_limited_conns, 1); + } + goto just_return_nolock; + } /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. @@ -7306,7 +7296,7 @@ rack_output(struct tcpcb *tp) uint32_t avail; avail = sbavail(sb); - if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) + if (SEQ_GT(tp->snd_nxt, tp->snd_una)) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; @@ -7347,9 +7337,18 @@ rack_output(struct tcpcb *tp) * data possible so far in the scoreboard. */ outstanding = tp->snd_max - tp->snd_una; - if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) - len = 0; - else if (avail > sb_offset) + if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { + if (tp->snd_wnd > outstanding) { + len = tp->snd_wnd - outstanding; + /* Check to see if we have the data */ + if (((sb_offset + len) > avail) && + (avail > sb_offset)) + len = avail - sb_offset; + else + len = 0; + } else + len = 0; + } else if (avail > sb_offset) len = avail - sb_offset; else len = 0; @@ -7398,18 +7397,22 @@ rack_output(struct tcpcb *tp) * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. 
*/ - if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && - ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { - if (tp->t_state != TCPS_SYN_RECEIVED) + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + if ((tp->t_state != TCPS_SYN_RECEIVED) && + (tp->t_state != TCPS_SYN_SENT)) flags &= ~TH_SYN; +#ifdef TCP_RFC7413 /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ - if (IS_FASTOPEN(tp->t_flags) && + if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; +#endif sb_offset--, len++; + if (sbavail(sb) == 0) + len = 0; } /* * Be careful not to send data and/or FIN on SYN segments. This @@ -7420,29 +7423,16 @@ rack_output(struct tcpcb *tp) len = 0; flags &= ~TH_FIN; } +#ifdef TCP_RFC7413 /* - * On TFO sockets, ensure no data is sent in the following cases: - * - * - When retransmitting SYN|ACK on a passively-created socket - * - * - When retransmitting SYN on an actively created socket - * - * - When sending a zero-length cookie (cookie request) on an - * actively created socket - * - * - When the socket is in the CLOSED state (RST is being sent) + * When retransmitting SYN|ACK on a passively-created TFO socket, + * don't include data, as the presence of data may have caused the + * original SYN|ACK to have been dropped by a middlebox. 
*/ - if (IS_FASTOPEN(tp->t_flags) && - (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || - ((tp->t_state == TCPS_SYN_SENT) && - (tp->t_tfo_client_cookie_len == 0)) || - (flags & TH_RST))) { - sack_rxmit = 0; - len = 0; - } - /* Without fast-open there should never be data sent on a SYN */ - if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) + if ((tp->t_flags & TF_FASTOPEN) && + ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) len = 0; +#endif if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been @@ -7519,7 +7509,9 @@ rack_output(struct tcpcb *tp) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && +#ifdef NETFLIX_TCP_O_UDP (tp->t_port == 0) && +#endif ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) @@ -7688,10 +7680,13 @@ rack_output(struct tcpcb *tp) * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ - if ((flags & TH_FIN) && - (tp->snd_nxt == tp->snd_una)) { - pass = 11; - goto send; + if (flags & TH_FIN) { + if ((tp->t_flags & TF_SENTFIN) || + (((tp->t_flags & TF_SENTFIN) == 0) && + (tp->snd_nxt == tp->snd_una))) { + pass = 11; + goto send; + } } /* * No reason to send a segment, just return. @@ -7750,44 +7745,27 @@ rack_output(struct tcpcb *tp) if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; - +#ifdef TCP_RFC7413 /* - * On SYN or SYN|ACK transmits on TFO connections, - * only include the TFO option if it is not a - * retransmit, as the presence of the TFO option may - * have caused the original SYN or SYN|ACK to have - * been dropped by a middlebox. 
+ * Only include the TFO option on the first + * transmission of the SYN|ACK on a + * passively-created TFO socket, as the presence of + * the TFO option may have caused the original + * SYN|ACK to have been dropped by a middlebox. */ - if (IS_FASTOPEN(tp->t_flags) && + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift == 0)) { - if (tp->t_state == TCPS_SYN_RECEIVED) { - to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; - to.to_tfo_cookie = - (u_int8_t *)&tp->t_tfo_cookie.server; - to.to_flags |= TOF_FASTOPEN; - wanted_cookie = 1; - } else if (tp->t_state == TCPS_SYN_SENT) { - to.to_tfo_len = - tp->t_tfo_client_cookie_len; - to.to_tfo_cookie = - tp->t_tfo_cookie.client; - to.to_flags |= TOF_FASTOPEN; - wanted_cookie = 1; - /* - * If we wind up having more data to - * send with the SYN than can fit in - * one segment, don't send any more - * until the SYN|ACK comes back from - * the other end. - */ - sendalot = 0; - } + to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; + to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; + to.to_flags |= TOF_FASTOPEN; } +#endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { @@ -7822,15 +7800,8 @@ rack_output(struct tcpcb *tp) /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); - /* - * If we wanted a TFO option to be added, but it was unable - * to fit, ensure no data is sent. - */ - if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && - !(to.to_flags & TOF_FASTOPEN)) - len = 0; } -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ @@ -7996,8 +7967,8 @@ rack_output(struct tcpcb *tp) msb = NULL; else msb = sb; - m->m_next = tcp_m_copym(mb, moff, &len, - if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); + m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len, + if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy @@ -8031,6 +8002,8 @@ rack_output(struct tcpcb *tp) * TLP should not count in retran count, but * in its own bin */ +/* tp->t_sndtlppack++;*/ +/* tp->t_sndtlpbyte += len;*/ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { @@ -8156,7 +8129,7 @@ rack_output(struct tcpcb *tp) #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -8164,10 +8137,10 @@ rack_output(struct tcpcb *tp) ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); - } else + } else #endif th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, ip6, th); + tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th); } else #endif /* INET6 */ { @@ -8175,7 +8148,7 @@ rack_output(struct tcpcb *tp) #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -8186,7 +8159,7 @@ rack_output(struct tcpcb *tp) } else #endif th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, ip, th); + tcpip_fillheaders(inp,/*tp->t_port, */ ip, th); } /* * Fill in fields, remembering maximum advertised window for use in @@ -8277,20 +8250,15 @@ rack_output(struct tcpcb *tp) /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. 
- * If a RST segment is sent, advertise a window of zero. */ - if (flags & TH_RST) { + if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && + recwin < (long)tp->t_maxseg) recwin = 0; - } else { - if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)tp->t_maxseg) - recwin = 0; - if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && - recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) - recwin = (long)(tp->rcv_adv - tp->rcv_nxt); - if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) - recwin = (long)TCP_MAXWIN << tp->rcv_scale; - } + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && + recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) + recwin = (long)(tp->rcv_adv - tp->rcv_nxt); + if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) + recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a or @@ -8357,18 +8325,23 @@ rack_output(struct tcpcb *tp) * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); } else { +#endif m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); +#ifdef NETFLIX_TCP_O_UDP } +#endif } #endif #if defined(INET6) && defined(INET) @@ -8376,19 +8349,24 @@ rack_output(struct tcpcb *tp) #endif #ifdef INET { +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); } else { +#endif m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, 
ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); +#ifdef NETFLIX_TCP_O_UDP } +#endif /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); @@ -8559,6 +8537,10 @@ rack_output(struct tcpcb *tp) * retransmit. In persist state, just set snd_max. */ if (error == 0) { +/* if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT) && + tp->rcv_numsacks > 0) + tcp_clean_dsack_blocks(tp);*/ if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { @@ -8574,15 +8556,19 @@ rack_output(struct tcpcb *tp) } } if (sub_from_prr && (error == 0)) { - rack->r_ctl.rc_prr_sndcnt -= len; + if (rack->r_ctl.rc_prr_sndcnt >= len) + rack->r_ctl.rc_prr_sndcnt -= len; + else + rack->r_ctl.rc_prr_sndcnt = 0; } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { +#ifdef NETFLIX_STATS tcp_seq startseq = tp->snd_nxt; - +#endif /* * Advance snd_nxt over sequence space of this segment. */ @@ -8613,17 +8599,6 @@ rack_output(struct tcpcb *tp) tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; - /* - * Time this transmission if not a retransmission and - * not currently timing anything. - * This is only relevant in case of switching back to - * the base stack. 
- */ - if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; - tp->t_rtseq = startseq; - TCPSTAT_INC(tcps_segstimed); - } #ifdef NETFLIX_STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; @@ -8996,9 +8971,7 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } -#ifdef NETFLIX_STATS - tcp_log_socket_option(tp, sopt->sopt_name, optval, error); -#endif +/* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/ INP_WUNLOCK(inp); return (error); } @@ -9131,7 +9104,6 @@ struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, .tfb_tcp_do_segment = rack_do_segment, - .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, @@ -9241,4 +9213,3 @@ static moduledata_t tcp_rack = { MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); -MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c new file mode 100644 index 000000000000..82e5d51310e3 --- /dev/null +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -0,0 +1,859 @@ +/*- + * Copyright (c) 2016-2018 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Author: Randall Stewart + * This work is based on the ACM Queue paper + * BBR - Congestion Based Congestion Control + * and also numerous discussions with Neal, Yuchung and Van. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +/*#include "opt_kern_tls.h"*/ +#include +#include +#include +#ifdef TCP_HHOOK +#include +#endif +#include +#include +#include +#include +#include +#ifdef KERN_TLS +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define TCPSTATES /* for logging */ + +#include +#include +#include +#include +#include /* required for icmp_var.h */ +#include /* for ICMP_BANDLIM */ +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif +#ifdef INET6 +#include +#endif +#include + +#include +#include +#include + +#if defined(IPSEC) || 
defined(IPSEC_SUPPORT) +#include +#include +#endif /* IPSEC */ + +#include +#include +#include + +#ifdef MAC +#include +#endif +#include "rack_bbr_common.h" + +/* + * Common TCP Functions - These are shared by borth + * rack and BBR. + */ + + +#ifdef KERN_TLS +uint32_t +ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) +{ + struct sbtls_info *tls; + uint32_t len; + +again: + tls = so->so_snd.sb_tls_info; + len = tls->sb_params.sb_maxlen; /* max tls payload */ + len += tls->sb_params.sb_tls_hlen; /* tls header len */ + len += tls->sb_params.sb_tls_tlen; /* tls trailer len */ + if ((len * 4) > rwnd) { + /* + * Stroke this will suck counter and what + * else should we do Drew? From the + * TCP perspective I am not sure + * what should be done... + */ + if (tls->sb_params.sb_maxlen > 4096) { + tls->sb_params.sb_maxlen -= 4096; + if (tls->sb_params.sb_maxlen < 4096) + tls->sb_params.sb_maxlen = 4096; + goto again; + } + } + return (len); +} +#endif + +int +ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt) +{ + /* + * We are passed a raw change of mbuf packets + * that arrived in LRO. They are linked via + * the m_nextpkt link in the pkt-headers. + * + * We process each one by: + * a) saving off the next + * b) stripping off the ether-header + * c) formulating the arguments for + * the tfb_tcp_hpts_do_segment + * d) calling each mbuf to tfb_tcp_hpts_do_segment + * after adjusting the time to match the arrival time. + * Note that the LRO code assures no IP options are present. + * + * The symantics for calling tfb_tcp_hpts_do_segment are the + * following: + * 1) It returns 0 if all went well and you (the caller) need + * to release the lock. + * 2) If nxt_pkt is set, then the function will surpress calls + * to tfb_tcp_output() since you are promising to call again + * with another packet. + * 3) If it returns 1, then you must free all the packets being + * shipped in, the tcb has been destroyed (or about to be destroyed). 
+ */ + struct mbuf *m_save; + struct ether_header *eh; + struct epoch_tracker et; + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip = NULL; /* Keep compiler happy. */ +#endif + struct ifnet *ifp; + struct timeval tv; + int32_t retval, nxt_pkt, tlen, off; + uint16_t etype; + uint16_t drop_hdrlen; + uint8_t iptos, no_vn=0, bpf_req=0; + + /* + * This is a bit deceptive, we get the + * "info epoch" which is really the network + * epoch. This covers us on both any INP + * type change but also if the ifp goes + * away it covers us as well. + */ + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + if (m && m->m_pkthdr.rcvif) + ifp = m->m_pkthdr.rcvif; + else + ifp = NULL; + if (ifp) { + bpf_req = bpf_peers_present(ifp->if_bpf); + } else { + /* + * We probably should not work around + * but kassert, since lro alwasy sets rcvif. + */ + no_vn = 1; + goto skip_vnet; + } + CURVNET_SET(ifp->if_vnet); +skip_vnet: + while (m) { + m_save = m->m_nextpkt; + m->m_nextpkt = NULL; + /* Now lets get the ether header */ + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + /* Let the BPF see the packet */ + if (bpf_req && ifp) + ETHER_BPF_MTAP(ifp, m); + m_adj(m, sizeof(*eh)); + /* Trim off the ethernet header */ + switch (etype) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { + m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); + if (m == NULL) { + TCPSTAT_INC(tcps_rcvshort); + m_freem(m); + goto skipped_pkt; + } + } + ip6 = (struct ip6_hdr *)(eh + 1); + th = (struct tcphdr *)(ip6 + 1); + tlen = ntohs(ip6->ip6_plen); + drop_hdrlen = sizeof(*ip6); + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in6_cksum_pseudo(ip6, tlen, + IPPROTO_TCP, m->m_pkthdr.csum_data); + th->th_sum ^= 0xffff; + } else + th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); + 
if (th->th_sum) { + TCPSTAT_INC(tcps_rcvbadsum); + m_freem(m); + goto skipped_pkt; + } + /* + * Be proactive about unspecified IPv6 address in source. + * As we use all-zero to indicate unbounded/unconnected pcb, + * unspecified IPv6 address can be used to confuse us. + * + * Note that packets with unspecified IPv6 destination is + * already dropped in ip6_input. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { + /* XXX stat */ + m_freem(m); + goto skipped_pkt; + } + iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) + == NULL) { + TCPSTAT_INC(tcps_rcvshort); + m_freem(m); + goto skipped_pkt; + } + } + ip = (struct ip *)(eh + 1); + th = (struct tcphdr *)(ip + 1); + drop_hdrlen = sizeof(*ip); + iptos = ip->ip_tos; + tlen = ntohs(ip->ip_len) - sizeof(struct ip); + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, + htonl(m->m_pkthdr.csum_data + tlen + + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + int len; + struct ipovly *ipov = (struct ipovly *)ip; + /* + * Checksum extended TCP header and data. + */ + len = drop_hdrlen + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = htons(tlen); + th->th_sum = in_cksum(m, len); + /* Reset length for SDT probes. */ + ip->ip_len = htons(len); + /* Reset TOS bits */ + ip->ip_tos = iptos; + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + } + if (th->th_sum) { + TCPSTAT_INC(tcps_rcvbadsum); + m_freem(m); + goto skipped_pkt; + } + break; + } +#endif + } + /* + * Convert TCP protocol specific fields to host format. 
+ */ + tcp_fields_to_host(th); + + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + TCPSTAT_INC(tcps_rcvbadoff); + m_freem(m); + goto skipped_pkt; + } + tlen -= off; + drop_hdrlen += off; + /* + * Now lets setup the timeval to be when we should + * have been called (if we can). + */ + m->m_pkthdr.lro_nsegs = 1; + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; + } else { + /* Should not be should we kassert instead? */ + tcp_get_usecs(&tv); + } + /* Now what about next packet? */ + if (m_save || has_pkt) + nxt_pkt = 1; + else + nxt_pkt = 0; + retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, + iptos, nxt_pkt, &tv); + if (retval) { + /* We lost the lock and tcb probably */ + m = m_save; + while (m) { + m_save = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + m = m_save; + } + if (no_vn == 0) + CURVNET_RESTORE(); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + return (retval); + } +skipped_pkt: + m = m_save; + } + if (no_vn == 0) + CURVNET_RESTORE(); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + return (retval); +} + +int +ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) +{ + struct mbuf *m; + + /* First lets see if we have old packets */ + if (tp->t_in_pkt) { + m = tp->t_in_pkt; + tp->t_in_pkt = NULL; + tp->t_tail_pkt = NULL; + if (ctf_process_inbound_raw(tp, so, m, have_pkt)) { + /* We lost the tcpcb (maybe a RST came in)? 
*/ + return (1); + } + } + return (0); +} + +uint32_t +ctf_outstanding(struct tcpcb *tp) +{ + return (tp->snd_max - tp->snd_una); +} + +uint32_t +ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) +{ + if (rc_sacked <= ctf_outstanding(tp)) + return (ctf_outstanding(tp) - rc_sacked); + else { + /* TSNH */ +#ifdef INVARIANTS + panic("tp:%p rc_sacked:%d > out:%d", + tp, rc_sacked, ctf_outstanding(tp)); +#endif + return (0); + } +} + +void +ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen) +{ + if (tp != NULL) { + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); + } else + tcp_dropwithreset(m, th, NULL, tlen, rstreason); +} + +/* + * ctf_drop_checks returns 1 for you should not proceed. It places + * in ret_val what should be returned 1/0 by the caller. The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +{ + int32_t todrop; + int32_t thflags; + int32_t tlen; + + thflags = *thf; + tlen = *tlenp; + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. 
+ */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, todrop); + } else { + TCPSTAT_INC(tcps_rcvpartduppack); + TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); + } + /* + * DSACK - add SACK block for dropped range + */ + if (tp->t_flags & TF_SACK_PERMIT) { + tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); + /* + * ACK now, as the next in-sequence segment + * will clear the DSACK block again + */ + tp->t_flags |= TF_ACKNOW; + } + *drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + /* + * If segment ends after window, drop trailing data (and PUSH and + * FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); + if (todrop > 0) { + TCPSTAT_INC(tcps_rcvpackafterwin); + if (todrop >= tlen) { + TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment and + * ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_rcvwinprobe); + } else { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + return (1); + } + } else + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH | TH_FIN); + } + *thf = thflags; + *tlenp = tlen; + return (0); +} + +/* + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. 
+ */ +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) +{ + /* + * Generate an ACK dropping incoming segment if it occupies sequence + * space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all paths to this + * code happen after packets containing RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the segment + * we received passes the SYN-RECEIVED ACK test. If it fails send a + * RST. This breaks the loop in the "LAND" DoS attack, and also + * prevents an ACK storm between two listening ports that have been + * sent forged SYN segments, each with the source address of the + * other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max))) { + *ret_val = 1; + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return; + } else + *ret_val = 0; + tp->t_flags |= TF_ACKNOW; + if (m) + m_freem(m); +} + +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp) +{ + + /* + * Drop space held by incoming segment and return. + */ + if (tp != NULL) + INP_WUNLOCK(tp->t_inpcb); + if (m) + m_freem(m); +} + +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) +{ + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in + * window, we send challenge ACK. + * + * Note: to take into account delayed ACKs, we should test against + * last_ack_sent instead of rcv_nxt. Note 2: we handle special case + * of closed window, not covered by the RFC. 
+ */ + int dropped = 0; + + if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + (tp->last_ack_sent == th->th_seq) || + (tp->rcv_nxt == th->th_seq) || + ((tp->last_ack_sent - 1) == th->th_seq)) { + TCPSTAT_INC(tcps_drops); + /* Drop the connection. */ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + dropped = 1; + ctf_do_drop(m, tp); + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + } + } else { + m_freem(m); + } + return (dropped); +} + +/* + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) +{ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + *ret_val = 1; + ctf_do_drop(m, tp); + } else { + /* Send challenge ACK. 
*/ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + *ret_val = 0; + ctf_do_drop(m, NULL); + } +} + +/* + * bbr_ts_check returns 1 for you should not proceed, the state + * machine should return. It places in ret_val what should + * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, + int32_t tlen, int32_t thflags, int32_t * ret_val) +{ + + if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates ts_recent, + * the age will be reset later and ts_recent will get a + * valid value. If it does not, setting ts_recent to zero + * will at least satisfy the requirement that zero be placed + * in the timestamp echo reply when ts_recent isn't valid. + * The age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be dropped + * when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, tlen); + TCPSTAT_INC(tcps_pawsdrop); + *ret_val = 0; + if (tlen) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + } else { + ctf_do_drop(m, NULL); + } + return (1); + } + return (0); +} + +void +ctf_calc_rwin(struct socket *so, struct tcpcb *tp) +{ + int32_t win; + + /* + * Calculate amount of space in receive window, and then do TCP + * input processing. Receive window is amount of space in rcv queue, + * but not less than advertised window. 
+ */ + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); +} + +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen) +{ + + if (tp->t_inpcb) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + } + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); +} + +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp) +{ + int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have a simplified code from tcp_addoptions(), + * without a proper loop, and having most of paddings hardcoded. + * We only consider fixed options that we would send every + * time I.e. SACK is not considered. + * + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + } else { + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); + return (tp->t_maxseg - optlen); +} + +void +ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex8 = num_sack_blks; + if (num_sack_blks > 0) { + log.u_bbr.flex1 = sack_blocks[0].start; + log.u_bbr.flex2 = sack_blocks[0].end; + } + if 
(num_sack_blks > 1) { + log.u_bbr.flex3 = sack_blocks[1].start; + log.u_bbr.flex4 = sack_blocks[1].end; + } + if (num_sack_blks > 2) { + log.u_bbr.flex5 = sack_blocks[2].start; + log.u_bbr.flex6 = sack_blocks[2].end; + } + if (num_sack_blks > 3) { + log.u_bbr.applimited = sack_blocks[3].start; + log.u_bbr.pkts_out = sack_blocks[3].end; + } + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_SACK_FILTER_RES, 0, + 0, &log, false, &tv); + } +} + +uint32_t +ctf_decay_count(uint32_t count, uint32_t decay) +{ + /* + * Given a count, decay it by a set percentage. The + * percentage is in thousands i.e. 100% = 1000, + * 19.3% = 193. + */ + uint64_t perc_count, decay_per; + uint32_t decayed_count; + if (decay > 1000) { + /* We don't raise it */ + return (count); + } + perc_count = count; + decay_per = decay; + perc_count *= decay_per; + perc_count /= 1000; + /* + * So now perc_count holds the + * count decay value. + */ + decayed_count = count - (uint32_t)perc_count; + return (decayed_count); +} diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h index c3d661cfd972..822208338d67 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.h +++ b/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -38,16 +38,7 @@ #define TCP_MSS_ACCT_SIZE 70 #define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF) - -/* Magic flags to tell whats cooking on the pacing wheel */ -#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */ -#define PACE_TMR_RACK 0x02 /* RACK timer running */ -#define PACE_TMR_TLP 0x04 /* TLP timer running */ -#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ -#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ -#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ -#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */ -#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) 
+#define DUP_ACK_THRESHOLD 3 /* Magic flags for tracing progress events */ #define PROGRESS_DROP 1 @@ -61,8 +52,66 @@ #define USE_RTT_LOW 1 #define USE_RTT_AVG 2 +#define PACE_MAX_IP_BYTES 65536 +#define USECS_IN_SECOND 1000000 +#define MSEC_IN_SECOND 1000 +#define MS_IN_USEC 1000 +#define USEC_TO_MSEC(x) (x / MS_IN_USEC) +#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */ + #ifdef _KERNEL /* We have only 7 bits in rack so assert its true */ CTASSERT((PACE_TMR_MASK & 0x80) == 0); +#ifdef KERN_TLS +uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd); +#endif +int +ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, + struct mbuf *m, int has_pkt); +int +ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt); +uint32_t ctf_outstanding(struct tcpcb *tp); +uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked); +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, + struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, + int32_t * drop_hdrlen, int32_t * ret_val); +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); +void +ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t rstreason, int32_t tlen); +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp); + +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp); + +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t * ret_val); + +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); + +void +ctf_calc_rwin(struct socket *so, struct tcpcb *tp); + +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen); + +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp); + +void +ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct 
sackblk *sack_blocks); + +uint32_t +ctf_decay_count(uint32_t count, uint32_t decay_percentage); + #endif #endif diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index b8f0838010d6..8d3ffa8b5880 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -102,7 +102,8 @@ struct tcpcb { t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ - bits_spare : 4; + t_fin_is_rst: 1, /* Are fin's treated as resets */ + bits_spare : 3; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -271,6 +272,11 @@ struct tcp_function_block { void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); + int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); + int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 431331e8d376..46710f614114 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -407,6 +407,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs); #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ +#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */