This commit updates rack to what is basically being used at Netflix (NF), and
also lays some of the groundwork for committing BBR. The hpts system is updated, as are some other utilities needed for the introduction of BBR. This is actually part 1 of 3 more needed commits, which will finally complete with BBRv1 being added as a new TCP stack. Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D20834
This commit is contained in:
parent
d8d6907c38
commit
b80b5fa389
@ -6,7 +6,7 @@
|
||||
|
||||
STACKNAME= rack
|
||||
KMOD= tcp_${STACKNAME}
|
||||
SRCS= rack.c sack_filter.c
|
||||
SRCS= rack.c sack_filter.c rack_bbr_common.c
|
||||
|
||||
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
|
||||
SRCS+= opt_tcpdebug.h
|
||||
|
@ -759,7 +759,9 @@ int inp_so_options(const struct inpcb *inp);
|
||||
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
|
||||
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
|
||||
#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
|
||||
|
||||
#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */
|
||||
#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */
|
||||
#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */
|
||||
/*
|
||||
* Flags passed to in_pcblookup*() functions.
|
||||
*/
|
||||
|
@ -201,9 +201,8 @@ struct tcphdr {
|
||||
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP threshold i.e. srtt+(srtt/N) */
|
||||
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
|
||||
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
|
||||
#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
|
||||
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
|
||||
#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
|
||||
#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */
|
||||
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
|
||||
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
|
||||
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
|
||||
@ -211,14 +210,18 @@ struct tcphdr {
|
||||
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
|
||||
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
|
||||
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
|
||||
#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
|
||||
#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
|
||||
#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
|
||||
#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */
|
||||
#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */
|
||||
#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */
|
||||
#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */
|
||||
#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */
|
||||
#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */
|
||||
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
|
||||
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
|
||||
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
|
||||
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override hpts settings 0/1/3 */
|
||||
#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
|
||||
#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */
|
||||
#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */
|
||||
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
|
||||
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
|
||||
#define TCP_BBR_PACE_PER_SEC 1086
|
||||
@ -227,17 +230,27 @@ struct tcphdr {
|
||||
#define TCP_BBR_PACE_SEG_MIN 1089
|
||||
#define TCP_BBR_PACE_CROSS 1090
|
||||
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
|
||||
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
|
||||
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
|
||||
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshold */
|
||||
#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */
|
||||
#define TCP_RACK_TLP_USE 1095
|
||||
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
|
||||
#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */
|
||||
#define TCP_BBR_EXTRA_GAIN 1097
|
||||
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
|
||||
#define TCP_BBR_RETRAN_WTSO 1099
|
||||
#define TCP_DATA_AFTER_CLOSE 1100
|
||||
#define TCP_BBR_PROBE_RTT_GAIN 1101
|
||||
#define TCP_BBR_PROBE_RTT_LEN 1102
|
||||
#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */
|
||||
#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */
|
||||
#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */
|
||||
#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */
|
||||
#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */
|
||||
#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */
|
||||
#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */
|
||||
#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */
|
||||
#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */
|
||||
|
||||
|
||||
/* Start of reserved space for third-party user-settable options. */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -45,112 +45,80 @@ TAILQ_HEAD(hptsh, inpcb);
|
||||
|
||||
/* Number of useconds in a hpts tick */
|
||||
#define HPTS_TICKS_PER_USEC 10
|
||||
#define HPTS_MS_TO_SLOTS(x) (x * 100)
|
||||
/* Convert milliseconds to hpts slots: 100 slots per ms (a slot is 10 usec), plus one extra slot. */
#define HPTS_MS_TO_SLOTS(x) (((x) * 100) + 1)
|
||||
/* Convert microseconds to hpts slots (10 usec each), rounding up. */
#define HPTS_USEC_TO_SLOTS(x) (((x) + 9) / 10)
|
||||
#define HPTS_USEC_IN_SEC 1000000
|
||||
#define HPTS_MSEC_IN_SEC 1000
|
||||
#define HPTS_USEC_IN_MSEC 1000
|
||||
|
||||
#define DEFAULT_HPTS_LOG 3072
|
||||
|
||||
/*
|
||||
* Log flags consist of
|
||||
* 7f 7f 1 1 bits
|
||||
* p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
|
||||
*
|
||||
* So for example cpu 10, number 10 would with
|
||||
* input active would show up as:
|
||||
* p_flags = 0001010 0001010 1 0
|
||||
* <or>
|
||||
* p_flags = 0x142a
|
||||
*/
|
||||
#define HPTS_HPTS_ACTIVE 0x01
|
||||
#define HPTS_INPUT_ACTIVE 0x02
|
||||
|
||||
#define HPTSLOG_IMMEDIATE 1
|
||||
#define HPTSLOG_INSERT_NORMAL 2
|
||||
#define HPTSLOG_INSERT_SLEEPER 3
|
||||
#define HPTSLOG_SLEEP_AFTER 4
|
||||
#define HPTSLOG_SLEEP_BEFORE 5
|
||||
#define HPTSLOG_INSERTED 6
|
||||
#define HPTSLOG_WAKEUP_HPTS 7
|
||||
#define HPTSLOG_SETTORUN 8
|
||||
#define HPTSLOG_HPTSI 9
|
||||
#define HPTSLOG_TOLONG 10
|
||||
#define HPTSLOG_AWAKENS 11
|
||||
#define HPTSLOG_TIMESOUT 12
|
||||
#define HPTSLOG_SLEEPSET 13
|
||||
#define HPTSLOG_WAKEUP_INPUT 14
|
||||
#define HPTSLOG_RESCHEDULE 15
|
||||
#define HPTSLOG_AWAKE 16
|
||||
#define HPTSLOG_INP_DONE 17
|
||||
|
||||
struct hpts_log {
|
||||
struct inpcb *inp;
|
||||
int32_t event;
|
||||
uint32_t cts;
|
||||
int32_t line;
|
||||
uint32_t ticknow;
|
||||
uint32_t t_paceslot;
|
||||
uint32_t t_hptsreq;
|
||||
uint32_t p_curtick;
|
||||
uint32_t p_prevtick;
|
||||
uint32_t slot_req;
|
||||
uint32_t p_on_queue_cnt;
|
||||
uint32_t p_nxt_slot;
|
||||
uint32_t p_cur_slot;
|
||||
uint32_t p_hpts_sleep_time;
|
||||
uint16_t p_flags;
|
||||
uint8_t p_onhpts;
|
||||
uint8_t p_oninput;
|
||||
uint8_t is_notempty;
|
||||
};
|
||||
|
||||
struct hpts_diag {
|
||||
uint32_t p_hpts_active;
|
||||
uint32_t p_nxt_slot;
|
||||
uint32_t p_cur_slot;
|
||||
uint32_t slot_req;
|
||||
uint32_t inp_hptsslot;
|
||||
uint32_t slot_now;
|
||||
uint32_t have_slept;
|
||||
uint32_t hpts_sleep_time;
|
||||
uint32_t yet_to_sleep;
|
||||
uint32_t need_new_to;
|
||||
int32_t co_ret;
|
||||
uint8_t p_on_min_sleep;
|
||||
uint32_t p_hpts_active; /* bbr->flex7 x */
|
||||
uint32_t p_nxt_slot; /* bbr->flex1 x */
|
||||
uint32_t p_cur_slot; /* bbr->flex2 x */
|
||||
uint32_t p_prev_slot; /* bbr->delivered */
|
||||
uint32_t p_runningtick; /* bbr->inflight */
|
||||
uint32_t slot_req; /* bbr->flex3 x */
|
||||
uint32_t inp_hptsslot; /* bbr->flex4 x */
|
||||
uint32_t slot_remaining; /* bbr->flex5 x */
|
||||
uint32_t have_slept; /* bbr->epoch x */
|
||||
uint32_t hpts_sleep_time; /* bbr->applimited x */
|
||||
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
|
||||
uint32_t need_new_to; /* bbr->flex6 x */
|
||||
uint32_t wheel_tick; /* bbr->bw_inuse x */
|
||||
uint32_t maxticks; /* bbr->delRate x */
|
||||
uint32_t wheel_cts; /* bbr->rttProp x */
|
||||
int32_t co_ret; /* bbr->pkts_out x */
|
||||
uint32_t p_curtick; /* upper bbr->cur_del_rate */
|
||||
uint32_t p_lasttick; /* lower bbr->cur_del_rate */
|
||||
uint8_t p_on_min_sleep; /* bbr->flex8 x */
|
||||
};
|
||||
|
||||
/* Magic flags to tell whats cooking on the pacing wheel */
|
||||
#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */
|
||||
#define PACE_TMR_RACK 0x02 /* RACK timer running */
|
||||
#define PACE_TMR_TLP 0x04 /* TLP timer running */
|
||||
#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
|
||||
#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
|
||||
#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
|
||||
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
|
||||
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
|
||||
|
||||
#ifdef _KERNEL
|
||||
/* Each hpts has its own p_mtx which is used for locking */
|
||||
struct tcp_hpts_entry {
|
||||
/* Cache line 0x00 */
|
||||
struct mtx p_mtx; /* Mutex for hpts */
|
||||
uint32_t p_hpts_active; /* Flag that says hpts is awake */
|
||||
uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
|
||||
uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
|
||||
uint16_t p_hpts_active; /* Flag that says hpts is awake */
|
||||
uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
|
||||
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
|
||||
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
|
||||
uint32_t p_runningtick; /* Current tick we are at if we are running */
|
||||
uint32_t p_prev_slot; /* Previous slot we were on */
|
||||
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
|
||||
uint32_t p_nxt_slot; /* The next slot outside the current range of
|
||||
* slots that the hpts is running on. */
|
||||
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
|
||||
uint32_t enobuf_cnt;
|
||||
uint16_t p_log_at;
|
||||
uint32_t p_lasttick; /* Last tick before the current one */
|
||||
uint8_t p_direct_wake :1, /* boolean */
|
||||
p_log_wrapped :1, /* boolean */
|
||||
p_on_min_sleep:1; /* boolean */
|
||||
uint8_t p_fill;
|
||||
p_on_min_sleep:1, /* boolean */
|
||||
p_avail:6;
|
||||
uint8_t p_fill[3]; /* Fill to 32 bits */
|
||||
/* Cache line 0x40 */
|
||||
void *p_inp;
|
||||
struct hptsh p_input; /* For the tcp-input runner */
|
||||
/* Hptsi wheel */
|
||||
struct hptsh *p_hptss;
|
||||
struct hpts_log *p_log;
|
||||
uint32_t p_logsize;
|
||||
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
|
||||
uint32_t hit_no_enobuf;
|
||||
uint32_t p_dyn_adjust;
|
||||
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
|
||||
* of 255ms */
|
||||
uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
|
||||
uint32_t saved_lasttick; /* for logging */
|
||||
uint32_t saved_curtick; /* for logging */
|
||||
uint32_t saved_curslot; /* for logging */
|
||||
uint32_t saved_prev_slot; /* for logging */
|
||||
uint32_t p_delayed_by; /* How much were we delayed by */
|
||||
/* Cache line 0x80 */
|
||||
struct sysctl_ctx_list hpts_ctx;
|
||||
@ -236,13 +204,9 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
|
||||
int
|
||||
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
|
||||
/*
 * Convenience wrapper that passes the caller's source line along.
 * Deliberately has no trailing semicolon so that it expands to a plain
 * expression and is safe inside unbraced if/else bodies.
 */
#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__)
|
||||
void
|
||||
tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
|
||||
int32_t tlen, int32_t drop_hdrlen, uint8_t iptos);
|
||||
int
|
||||
__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
|
||||
int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line);
|
||||
#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
|
||||
__tcp_queue_to_input(struct inpcb *inp, int32_t line);
|
||||
#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
|
||||
|
||||
uint16_t tcp_hpts_delayedby(struct inpcb *inp);
|
||||
|
||||
|
@ -175,7 +175,7 @@ enum tcp_log_events {
|
||||
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
|
||||
TCP_LOG_PRR, /* Doing PRR 6 */
|
||||
TCP_LOG_REORDER,/* Detected reorder 7 */
|
||||
TCP_LOG_PACER, /* Pacer sending a packet 8 */
|
||||
TCP_LOG_HPTS, /* Hpts sending a packet 8 */
|
||||
BBR_LOG_BBRUPD, /* We updated BBR info 9 */
|
||||
BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
|
||||
BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
|
||||
@ -194,31 +194,38 @@ enum tcp_log_events {
|
||||
BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */
|
||||
TCP_LOG_FLOWEND, /* End of a flow 25 */
|
||||
BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
|
||||
BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
|
||||
BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */
|
||||
BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
|
||||
BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
|
||||
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
|
||||
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
|
||||
TCP_LOG_USERSEND, /* User level sends data 31 */
|
||||
UNUSED_32, /* Unused 32 */
|
||||
UNUSED_33, /* Unused 33 */
|
||||
BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
|
||||
BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
|
||||
BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */
|
||||
BBR_LOG_TO_PROCESS, /* A to was processed 35 */
|
||||
BBR_LOG_BBRTSO, /* TSO update 36 */
|
||||
BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
|
||||
BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */
|
||||
BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
|
||||
BBR_LOG_PROGRESS, /* Progress timer event 39 */
|
||||
TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
|
||||
BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
|
||||
BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */
|
||||
BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
|
||||
BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */
|
||||
BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
|
||||
BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
|
||||
BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
|
||||
TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
|
||||
BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
|
||||
BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */
|
||||
BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
|
||||
TCP_LOG_REASS, /* Reassembly buffer logging 50 */
|
||||
TCP_LOG_END /* End (keep at end) 51 */
|
||||
TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */
|
||||
BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */
|
||||
BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */
|
||||
TCP_LOG_CONNEND, /* End of connection 54 */
|
||||
TCP_LOG_LRO, /* LRO entry 55 */
|
||||
TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */
|
||||
TCP_SAD_DETECTION, /* Sack Attack Detection 57 */
|
||||
TCP_LOG_END /* End (keep at end) 58 */
|
||||
};
|
||||
|
||||
enum tcp_log_states {
|
||||
@ -275,8 +282,8 @@ struct tcp_log_dev_log_queue {
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
||||
#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
|
||||
#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
|
||||
#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000
|
||||
#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000
|
||||
|
||||
/*
|
||||
* TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
|
||||
|
File diff suppressed because it is too large
Load Diff
859
sys/netinet/tcp_stacks/rack_bbr_common.c
Normal file
859
sys/netinet/tcp_stacks/rack_bbr_common.c
Normal file
@ -0,0 +1,859 @@
|
||||
/*-
|
||||
* Copyright (c) 2016-2018
|
||||
* Netflix Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
/*
|
||||
* Author: Randall Stewart <rrs@netflix.com>
|
||||
* This work is based on the ACM Queue paper
|
||||
* BBR - Congestion Based Congestion Control
|
||||
* and also numerous discussions with Neal, Yuchung and Van.
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
#include "opt_inet.h"
|
||||
#include "opt_inet6.h"
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_tcpdebug.h"
|
||||
#include "opt_ratelimit.h"
|
||||
/*#include "opt_kern_tls.h"*/
|
||||
#include <sys/param.h>
|
||||
#include <sys/module.h>
|
||||
#include <sys/kernel.h>
|
||||
#ifdef TCP_HHOOK
|
||||
#include <sys/hhook.h>
|
||||
#endif
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/mbuf.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/socketvar.h>
|
||||
#ifdef KERN_TLS
|
||||
#include <sys/sockbuf_tls.h>
|
||||
#endif
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/tree.h>
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/queue.h>
|
||||
#include <sys/smp.h>
|
||||
#include <sys/kthread.h>
|
||||
#include <sys/lock.h>
|
||||
#include <sys/mutex.h>
|
||||
#include <sys/time.h>
|
||||
#include <vm/uma.h>
|
||||
#include <sys/kern_prefetch.h>
|
||||
|
||||
#include <net/route.h>
|
||||
#include <net/vnet.h>
|
||||
#include <net/ethernet.h>
|
||||
#include <net/bpf.h>
|
||||
|
||||
#define TCPSTATES /* for logging */
|
||||
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/in_kdtrace.h>
|
||||
#include <netinet/in_pcb.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
|
||||
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
|
||||
#include <netinet/ip_var.h>
|
||||
#include <netinet/ip6.h>
|
||||
#include <netinet6/in6_pcb.h>
|
||||
#include <netinet6/ip6_var.h>
|
||||
#define TCPOUTFLAGS
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/tcp_fsm.h>
|
||||
#include <netinet/tcp_seq.h>
|
||||
#include <netinet/tcp_timer.h>
|
||||
#include <netinet/tcp_var.h>
|
||||
#include <netinet/tcpip.h>
|
||||
#include <netinet/tcp_hpts.h>
|
||||
#include <netinet/cc/cc.h>
|
||||
#include <netinet/tcp_log_buf.h>
|
||||
#ifdef TCPDEBUG
|
||||
#include <netinet/tcp_debug.h>
|
||||
#endif /* TCPDEBUG */
|
||||
#ifdef TCP_OFFLOAD
|
||||
#include <netinet/tcp_offload.h>
|
||||
#endif
|
||||
#ifdef INET6
|
||||
#include <netinet6/tcp6_var.h>
|
||||
#endif
|
||||
#include <netinet/tcp_fastopen.h>
|
||||
|
||||
#include <netipsec/ipsec_support.h>
|
||||
#include <net/if.h>
|
||||
#include <net/if_var.h>
|
||||
|
||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
|
||||
#include <netipsec/ipsec.h>
|
||||
#include <netipsec/ipsec6.h>
|
||||
#endif /* IPSEC */
|
||||
|
||||
#include <netinet/udp.h>
|
||||
#include <netinet/udp_var.h>
|
||||
#include <machine/in_cksum.h>
|
||||
|
||||
#ifdef MAC
|
||||
#include <security/mac/mac_framework.h>
|
||||
#endif
|
||||
#include "rack_bbr_common.h"
|
||||
|
||||
/*
|
||||
* Common TCP Functions - These are shared by both
|
||||
* rack and BBR.
|
||||
*/
|
||||
|
||||
|
||||
#ifdef KERN_TLS
|
||||
uint32_t
|
||||
ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
|
||||
{
|
||||
struct sbtls_info *tls;
|
||||
uint32_t len;
|
||||
|
||||
again:
|
||||
tls = so->so_snd.sb_tls_info;
|
||||
len = tls->sb_params.sb_maxlen; /* max tls payload */
|
||||
len += tls->sb_params.sb_tls_hlen; /* tls header len */
|
||||
len += tls->sb_params.sb_tls_tlen; /* tls trailer len */
|
||||
if ((len * 4) > rwnd) {
|
||||
/*
|
||||
* Stroke this will suck counter and what
|
||||
* else should we do Drew? From the
|
||||
* TCP perspective I am not sure
|
||||
* what should be done...
|
||||
*/
|
||||
if (tls->sb_params.sb_maxlen > 4096) {
|
||||
tls->sb_params.sb_maxlen -= 4096;
|
||||
if (tls->sb_params.sb_maxlen < 4096)
|
||||
tls->sb_params.sb_maxlen = 4096;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
return (len);
|
||||
}
|
||||
#endif
|
||||
|
||||
int
|
||||
ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
|
||||
{
|
||||
/*
|
||||
* We are passed a raw change of mbuf packets
|
||||
* that arrived in LRO. They are linked via
|
||||
* the m_nextpkt link in the pkt-headers.
|
||||
*
|
||||
* We process each one by:
|
||||
* a) saving off the next
|
||||
* b) stripping off the ether-header
|
||||
* c) formulating the arguments for
|
||||
* the tfb_tcp_hpts_do_segment
|
||||
* d) calling each mbuf to tfb_tcp_hpts_do_segment
|
||||
* after adjusting the time to match the arrival time.
|
||||
* Note that the LRO code assures no IP options are present.
|
||||
*
|
||||
* The symantics for calling tfb_tcp_hpts_do_segment are the
|
||||
* following:
|
||||
* 1) It returns 0 if all went well and you (the caller) need
|
||||
* to release the lock.
|
||||
* 2) If nxt_pkt is set, then the function will surpress calls
|
||||
* to tfb_tcp_output() since you are promising to call again
|
||||
* with another packet.
|
||||
* 3) If it returns 1, then you must free all the packets being
|
||||
* shipped in, the tcb has been destroyed (or about to be destroyed).
|
||||
*/
|
||||
struct mbuf *m_save;
|
||||
struct ether_header *eh;
|
||||
struct epoch_tracker et;
|
||||
struct tcphdr *th;
|
||||
#ifdef INET6
|
||||
struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
|
||||
#endif
|
||||
#ifdef INET
|
||||
struct ip *ip = NULL; /* Keep compiler happy. */
|
||||
#endif
|
||||
struct ifnet *ifp;
|
||||
struct timeval tv;
|
||||
int32_t retval, nxt_pkt, tlen, off;
|
||||
uint16_t etype;
|
||||
uint16_t drop_hdrlen;
|
||||
uint8_t iptos, no_vn=0, bpf_req=0;
|
||||
|
||||
/*
|
||||
* This is a bit deceptive, we get the
|
||||
* "info epoch" which is really the network
|
||||
* epoch. This covers us on both any INP
|
||||
* type change but also if the ifp goes
|
||||
* away it covers us as well.
|
||||
*/
|
||||
INP_INFO_RLOCK_ET(&V_tcbinfo, et);
|
||||
if (m && m->m_pkthdr.rcvif)
|
||||
ifp = m->m_pkthdr.rcvif;
|
||||
else
|
||||
ifp = NULL;
|
||||
if (ifp) {
|
||||
bpf_req = bpf_peers_present(ifp->if_bpf);
|
||||
} else {
|
||||
/*
|
||||
* We probably should not work around
|
||||
* but kassert, since lro alwasy sets rcvif.
|
||||
*/
|
||||
no_vn = 1;
|
||||
goto skip_vnet;
|
||||
}
|
||||
CURVNET_SET(ifp->if_vnet);
|
||||
skip_vnet:
|
||||
while (m) {
|
||||
m_save = m->m_nextpkt;
|
||||
m->m_nextpkt = NULL;
|
||||
/* Now lets get the ether header */
|
||||
eh = mtod(m, struct ether_header *);
|
||||
etype = ntohs(eh->ether_type);
|
||||
/* Let the BPF see the packet */
|
||||
if (bpf_req && ifp)
|
||||
ETHER_BPF_MTAP(ifp, m);
|
||||
m_adj(m, sizeof(*eh));
|
||||
/* Trim off the ethernet header */
|
||||
switch (etype) {
|
||||
#ifdef INET6
|
||||
case ETHERTYPE_IPV6:
|
||||
{
|
||||
if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
|
||||
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
|
||||
if (m == NULL) {
|
||||
TCPSTAT_INC(tcps_rcvshort);
|
||||
m_freem(m);
|
||||
goto skipped_pkt;
|
||||
}
|
||||
}
|
||||
ip6 = (struct ip6_hdr *)(eh + 1);
|
||||
th = (struct tcphdr *)(ip6 + 1);
|
||||
tlen = ntohs(ip6->ip6_plen);
|
||||
drop_hdrlen = sizeof(*ip6);
|
||||
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
|
||||
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
|
||||
th->th_sum = m->m_pkthdr.csum_data;
|
||||
else
|
||||
th->th_sum = in6_cksum_pseudo(ip6, tlen,
|
||||
IPPROTO_TCP, m->m_pkthdr.csum_data);
|
||||
th->th_sum ^= 0xffff;
|
||||
} else
|
||||
th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
|
||||
if (th->th_sum) {
|
||||
TCPSTAT_INC(tcps_rcvbadsum);
|
||||
m_freem(m);
|
||||
goto skipped_pkt;
|
||||
}
|
||||
/*
|
||||
* Be proactive about unspecified IPv6 address in source.
|
||||
* As we use all-zero to indicate unbounded/unconnected pcb,
|
||||
* unspecified IPv6 address can be used to confuse us.
|
||||
*
|
||||
* Note that packets with unspecified IPv6 destination is
|
||||
* already dropped in ip6_input.
|
||||
*/
|
||||
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
|
||||
/* XXX stat */
|
||||
m_freem(m);
|
||||
goto skipped_pkt;
|
||||
}
|
||||
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#ifdef INET
|
||||
case ETHERTYPE_IP:
|
||||
{
|
||||
if (m->m_len < sizeof (struct tcpiphdr)) {
|
||||
if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
|
||||
== NULL) {
|
||||
TCPSTAT_INC(tcps_rcvshort);
|
||||
m_freem(m);
|
||||
goto skipped_pkt;
|
||||
}
|
||||
}
|
||||
ip = (struct ip *)(eh + 1);
|
||||
th = (struct tcphdr *)(ip + 1);
|
||||
drop_hdrlen = sizeof(*ip);
|
||||
iptos = ip->ip_tos;
|
||||
tlen = ntohs(ip->ip_len) - sizeof(struct ip);
|
||||
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
|
||||
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
|
||||
th->th_sum = m->m_pkthdr.csum_data;
|
||||
else
|
||||
th->th_sum = in_pseudo(ip->ip_src.s_addr,
|
||||
ip->ip_dst.s_addr,
|
||||
htonl(m->m_pkthdr.csum_data + tlen +
|
||||
IPPROTO_TCP));
|
||||
th->th_sum ^= 0xffff;
|
||||
} else {
|
||||
int len;
|
||||
struct ipovly *ipov = (struct ipovly *)ip;
|
||||
/*
|
||||
* Checksum extended TCP header and data.
|
||||
*/
|
||||
len = drop_hdrlen + tlen;
|
||||
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
|
||||
ipov->ih_len = htons(tlen);
|
||||
th->th_sum = in_cksum(m, len);
|
||||
/* Reset length for SDT probes. */
|
||||
ip->ip_len = htons(len);
|
||||
/* Reset TOS bits */
|
||||
ip->ip_tos = iptos;
|
||||
/* Re-initialization for later version check */
|
||||
ip->ip_v = IPVERSION;
|
||||
ip->ip_hl = sizeof(*ip) >> 2;
|
||||
}
|
||||
if (th->th_sum) {
|
||||
TCPSTAT_INC(tcps_rcvbadsum);
|
||||
m_freem(m);
|
||||
goto skipped_pkt;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
/*
|
||||
* Convert TCP protocol specific fields to host format.
|
||||
*/
|
||||
tcp_fields_to_host(th);
|
||||
|
||||
off = th->th_off << 2;
|
||||
if (off < sizeof (struct tcphdr) || off > tlen) {
|
||||
TCPSTAT_INC(tcps_rcvbadoff);
|
||||
m_freem(m);
|
||||
goto skipped_pkt;
|
||||
}
|
||||
tlen -= off;
|
||||
drop_hdrlen += off;
|
||||
/*
|
||||
* Now lets setup the timeval to be when we should
|
||||
* have been called (if we can).
|
||||
*/
|
||||
m->m_pkthdr.lro_nsegs = 1;
|
||||
if (m->m_flags & M_TSTMP_LRO) {
|
||||
tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
|
||||
tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
|
||||
} else {
|
||||
/* Should not be should we kassert instead? */
|
||||
tcp_get_usecs(&tv);
|
||||
}
|
||||
/* Now what about next packet? */
|
||||
if (m_save || has_pkt)
|
||||
nxt_pkt = 1;
|
||||
else
|
||||
nxt_pkt = 0;
|
||||
retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
|
||||
iptos, nxt_pkt, &tv);
|
||||
if (retval) {
|
||||
/* We lost the lock and tcb probably */
|
||||
m = m_save;
|
||||
while (m) {
|
||||
m_save = m->m_nextpkt;
|
||||
m->m_nextpkt = NULL;
|
||||
m_freem(m);
|
||||
m = m_save;
|
||||
}
|
||||
if (no_vn == 0)
|
||||
CURVNET_RESTORE();
|
||||
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
|
||||
return (retval);
|
||||
}
|
||||
skipped_pkt:
|
||||
m = m_save;
|
||||
}
|
||||
if (no_vn == 0)
|
||||
CURVNET_RESTORE();
|
||||
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
|
||||
return (retval);
|
||||
}
|
||||
|
||||
int
|
||||
ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
|
||||
{
|
||||
struct mbuf *m;
|
||||
|
||||
/* First lets see if we have old packets */
|
||||
if (tp->t_in_pkt) {
|
||||
m = tp->t_in_pkt;
|
||||
tp->t_in_pkt = NULL;
|
||||
tp->t_tail_pkt = NULL;
|
||||
if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
|
||||
/* We lost the tcpcb (maybe a RST came in)? */
|
||||
return (1);
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ctf_outstanding(struct tcpcb *tp)
|
||||
{
|
||||
return (tp->snd_max - tp->snd_una);
|
||||
}
|
||||
|
||||
/*
 * Return the flight size: what is outstanding (ctf_outstanding())
 * minus the rc_sacked amount supplied by the caller.
 *
 * rc_sacked should never exceed what is outstanding; if it does we
 * panic under INVARIANTS and otherwise report a flight size of 0.
 */
uint32_t
ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
{
	uint32_t outstanding;

	/* Look the value up once and reuse it for the check and the result */
	outstanding = ctf_outstanding(tp);
	if (rc_sacked <= outstanding)
		return (outstanding - rc_sacked);
	/* TSNH */
#ifdef INVARIANTS
	/* Both values are unsigned 32-bit quantities, so print them as such */
	panic("tp:%p rc_sacked:%u > out:%u",
	    tp, rc_sacked, outstanding);
#endif
	return (0);
}
|
||||
|
||||
void
|
||||
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
|
||||
int32_t rstreason, int32_t tlen)
|
||||
{
|
||||
if (tp != NULL) {
|
||||
tcp_dropwithreset(m, th, tp, tlen, rstreason);
|
||||
INP_WUNLOCK(tp->t_inpcb);
|
||||
} else
|
||||
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
|
||||
}
|
||||
|
||||
/*
|
||||
* ctf_drop_checks returns 1 for you should not proceed. It places
|
||||
* in ret_val what should be returned 1/0 by the caller. The 1 indicates
|
||||
* that the TCB is unlocked and probably dropped. The 0 indicates the
|
||||
* TCB is still valid and locked.
|
||||
*/
|
||||
int
|
||||
ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
|
||||
{
|
||||
int32_t todrop;
|
||||
int32_t thflags;
|
||||
int32_t tlen;
|
||||
|
||||
thflags = *thf;
|
||||
tlen = *tlenp;
|
||||
todrop = tp->rcv_nxt - th->th_seq;
|
||||
if (todrop > 0) {
|
||||
if (thflags & TH_SYN) {
|
||||
thflags &= ~TH_SYN;
|
||||
th->th_seq++;
|
||||
if (th->th_urp > 1)
|
||||
th->th_urp--;
|
||||
else
|
||||
thflags &= ~TH_URG;
|
||||
todrop--;
|
||||
}
|
||||
/*
|
||||
* Following if statement from Stevens, vol. 2, p. 960.
|
||||
*/
|
||||
if (todrop > tlen
|
||||
|| (todrop == tlen && (thflags & TH_FIN) == 0)) {
|
||||
/*
|
||||
* Any valid FIN must be to the left of the window.
|
||||
* At this point the FIN must be a duplicate or out
|
||||
* of sequence; drop it.
|
||||
*/
|
||||
thflags &= ~TH_FIN;
|
||||
/*
|
||||
* Send an ACK to resynchronize and drop any data.
|
||||
* But keep on processing for RST or ACK.
|
||||
*/
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
todrop = tlen;
|
||||
TCPSTAT_INC(tcps_rcvduppack);
|
||||
TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
|
||||
} else {
|
||||
TCPSTAT_INC(tcps_rcvpartduppack);
|
||||
TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
|
||||
}
|
||||
/*
|
||||
* DSACK - add SACK block for dropped range
|
||||
*/
|
||||
if (tp->t_flags & TF_SACK_PERMIT) {
|
||||
tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
|
||||
/*
|
||||
* ACK now, as the next in-sequence segment
|
||||
* will clear the DSACK block again
|
||||
*/
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
}
|
||||
*drop_hdrlen += todrop; /* drop from the top afterwards */
|
||||
th->th_seq += todrop;
|
||||
tlen -= todrop;
|
||||
if (th->th_urp > todrop)
|
||||
th->th_urp -= todrop;
|
||||
else {
|
||||
thflags &= ~TH_URG;
|
||||
th->th_urp = 0;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* If segment ends after window, drop trailing data (and PUSH and
|
||||
* FIN); if nothing left, just ACK.
|
||||
*/
|
||||
todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
|
||||
if (todrop > 0) {
|
||||
TCPSTAT_INC(tcps_rcvpackafterwin);
|
||||
if (todrop >= tlen) {
|
||||
TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
|
||||
/*
|
||||
* If window is closed can only take segments at
|
||||
* window edge, and have to drop data and PUSH from
|
||||
* incoming segments. Continue processing, but
|
||||
* remember to ack. Otherwise, drop segment and
|
||||
* ack.
|
||||
*/
|
||||
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
TCPSTAT_INC(tcps_rcvwinprobe);
|
||||
} else {
|
||||
ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
|
||||
return (1);
|
||||
}
|
||||
} else
|
||||
TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
|
||||
m_adj(m, -todrop);
|
||||
tlen -= todrop;
|
||||
thflags &= ~(TH_PUSH | TH_FIN);
|
||||
}
|
||||
*thf = thflags;
|
||||
*tlenp = tlen;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* The value in ret_val informs the caller
|
||||
* if we dropped the tcb (and lock) or not.
|
||||
* 1 = we dropped it, 0 = the TCB is still locked
|
||||
* and valid.
|
||||
*/
|
||||
void
|
||||
ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
|
||||
{
|
||||
/*
|
||||
* Generate an ACK dropping incoming segment if it occupies sequence
|
||||
* space, where the ACK reflects our state.
|
||||
*
|
||||
* We can now skip the test for the RST flag since all paths to this
|
||||
* code happen after packets containing RST have been dropped.
|
||||
*
|
||||
* In the SYN-RECEIVED state, don't send an ACK unless the segment
|
||||
* we received passes the SYN-RECEIVED ACK test. If it fails send a
|
||||
* RST. This breaks the loop in the "LAND" DoS attack, and also
|
||||
* prevents an ACK storm between two listening ports that have been
|
||||
* sent forged SYN segments, each with the source address of the
|
||||
* other.
|
||||
*/
|
||||
if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
|
||||
(SEQ_GT(tp->snd_una, th->th_ack) ||
|
||||
SEQ_GT(th->th_ack, tp->snd_max))) {
|
||||
*ret_val = 1;
|
||||
ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
|
||||
return;
|
||||
} else
|
||||
*ret_val = 0;
|
||||
tp->t_flags |= TF_ACKNOW;
|
||||
if (m)
|
||||
m_freem(m);
|
||||
}
|
||||
|
||||
void
|
||||
ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
|
||||
{
|
||||
|
||||
/*
|
||||
* Drop space held by incoming segment and return.
|
||||
*/
|
||||
if (tp != NULL)
|
||||
INP_WUNLOCK(tp->t_inpcb);
|
||||
if (m)
|
||||
m_freem(m);
|
||||
}
|
||||
|
||||
/*
 * Handle a segment carrying RST, following RFC5961 Section 3.2:
 *
 * - RST drops connection only if SEG.SEQ == RCV.NXT.
 * - If RST is in window, we send challenge ACK.
 *
 * Note: to take into account delayed ACKs, we should test against
 * last_ack_sent instead of rcv_nxt.
 * Note 2: we handle special case of closed window, not covered by the
 * RFC.
 *
 * Returns 1 when the connection was dropped (the mbuf and the tcpcb
 * were passed to ctf_do_drop()), 0 otherwise.  The mbuf is freed or
 * handed to tcp_respond() on every path.
 */
int
ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
{
	int in_window, acceptable;

	in_window = (SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq);
	if (!in_window) {
		/* Outside the window: silently discard. */
		m_freem(m);
		return (0);
	}

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	KASSERT(tp->t_state != TCPS_SYN_SENT,
	    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
	    __func__, th, tp));

	acceptable = V_tcp_insecure_rst ||
	    (tp->last_ack_sent == th->th_seq) ||
	    (tp->rcv_nxt == th->th_seq) ||
	    ((tp->last_ack_sent - 1) == th->th_seq);
	if (!acceptable) {
		TCPSTAT_INC(tcps_badrst);
		/* Send challenge ACK. */
		tcp_respond(tp, mtod(m, void *), th, m,
		    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
		tp->last_ack_sent = tp->rcv_nxt;
		return (0);
	}

	TCPSTAT_INC(tcps_drops);
	/*
	 * Drop the connection.  States past SYN_SENT record a socket
	 * error and move to CLOSED first; any other state goes straight
	 * to tcp_close().
	 */
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		so->so_error = ECONNREFUSED;
		tcp_state_change(tp, TCPS_CLOSED);
		break;
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
		so->so_error = ECONNRESET;
		tcp_state_change(tp, TCPS_CLOSED);
		break;
	default:
		break;
	}
	tp = tcp_close(tp);
	ctf_do_drop(m, tp);
	return (1);
}
|
||||
|
||||
/*
|
||||
* The value in ret_val informs the caller
|
||||
* if we dropped the tcb (and lock) or not.
|
||||
* 1 = we dropped it, 0 = the TCB is still locked
|
||||
* and valid.
|
||||
*/
|
||||
void
|
||||
ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
|
||||
{
|
||||
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
|
||||
|
||||
TCPSTAT_INC(tcps_badsyn);
|
||||
if (V_tcp_insecure_syn &&
|
||||
SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
|
||||
SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
|
||||
tp = tcp_drop(tp, ECONNRESET);
|
||||
*ret_val = 1;
|
||||
ctf_do_drop(m, tp);
|
||||
} else {
|
||||
/* Send challenge ACK. */
|
||||
tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
|
||||
tp->snd_nxt, TH_ACK);
|
||||
tp->last_ack_sent = tp->rcv_nxt;
|
||||
m = NULL;
|
||||
*ret_val = 0;
|
||||
ctf_do_drop(m, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* bbr_ts_check returns 1 for you should not proceed, the state
|
||||
* machine should return. It places in ret_val what should
|
||||
* be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
|
||||
* that the TCB is unlocked and probably dropped. The 0 indicates the
|
||||
* TCB is still valid and locked.
|
||||
*/
|
||||
int
|
||||
ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
|
||||
int32_t tlen, int32_t thflags, int32_t * ret_val)
|
||||
{
|
||||
|
||||
if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
|
||||
/*
|
||||
* Invalidate ts_recent. If this segment updates ts_recent,
|
||||
* the age will be reset later and ts_recent will get a
|
||||
* valid value. If it does not, setting ts_recent to zero
|
||||
* will at least satisfy the requirement that zero be placed
|
||||
* in the timestamp echo reply when ts_recent isn't valid.
|
||||
* The age isn't reset until we get a valid ts_recent
|
||||
* because we don't want out-of-order segments to be dropped
|
||||
* when ts_recent is old.
|
||||
*/
|
||||
tp->ts_recent = 0;
|
||||
} else {
|
||||
TCPSTAT_INC(tcps_rcvduppack);
|
||||
TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
|
||||
TCPSTAT_INC(tcps_pawsdrop);
|
||||
*ret_val = 0;
|
||||
if (tlen) {
|
||||
ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
|
||||
} else {
|
||||
ctf_do_drop(m, NULL);
|
||||
}
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
|
||||
{
|
||||
int32_t win;
|
||||
|
||||
/*
|
||||
* Calculate amount of space in receive window, and then do TCP
|
||||
* input processing. Receive window is amount of space in rcv queue,
|
||||
* but not less than advertised window.
|
||||
*/
|
||||
win = sbspace(&so->so_rcv);
|
||||
if (win < 0)
|
||||
win = 0;
|
||||
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
|
||||
}
|
||||
|
||||
void
|
||||
ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
|
||||
int32_t rstreason, int32_t tlen)
|
||||
{
|
||||
|
||||
if (tp->t_inpcb) {
|
||||
tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
|
||||
}
|
||||
tcp_dropwithreset(m, th, tp, tlen, rstreason);
|
||||
INP_WUNLOCK(tp->t_inpcb);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ctf_fixed_maxseg(struct tcpcb *tp)
|
||||
{
|
||||
int optlen;
|
||||
|
||||
if (tp->t_flags & TF_NOOPT)
|
||||
return (tp->t_maxseg);
|
||||
|
||||
/*
|
||||
* Here we have a simplified code from tcp_addoptions(),
|
||||
* without a proper loop, and having most of paddings hardcoded.
|
||||
* We only consider fixed options that we would send every
|
||||
* time I.e. SACK is not considered.
|
||||
*
|
||||
*/
|
||||
#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
|
||||
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
|
||||
if (tp->t_flags & TF_RCVD_TSTMP)
|
||||
optlen = TCPOLEN_TSTAMP_APPA;
|
||||
else
|
||||
optlen = 0;
|
||||
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
|
||||
if (tp->t_flags & TF_SIGNATURE)
|
||||
optlen += PAD(TCPOLEN_SIGNATURE);
|
||||
#endif
|
||||
} else {
|
||||
if (tp->t_flags & TF_REQ_TSTMP)
|
||||
optlen = TCPOLEN_TSTAMP_APPA;
|
||||
else
|
||||
optlen = PAD(TCPOLEN_MAXSEG);
|
||||
if (tp->t_flags & TF_REQ_SCALE)
|
||||
optlen += PAD(TCPOLEN_WINDOW);
|
||||
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
|
||||
if (tp->t_flags & TF_SIGNATURE)
|
||||
optlen += PAD(TCPOLEN_SIGNATURE);
|
||||
#endif
|
||||
if (tp->t_flags & TF_SACK_PERMIT)
|
||||
optlen += PAD(TCPOLEN_SACK_PERMITTED);
|
||||
}
|
||||
#undef PAD
|
||||
optlen = min(optlen, TCP_MAXOLEN);
|
||||
return (tp->t_maxseg - optlen);
|
||||
}
|
||||
|
||||
void
|
||||
ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
|
||||
{
|
||||
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
|
||||
union tcp_log_stackspecific log;
|
||||
struct timeval tv;
|
||||
|
||||
memset(&log, 0, sizeof(log));
|
||||
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
|
||||
log.u_bbr.flex8 = num_sack_blks;
|
||||
if (num_sack_blks > 0) {
|
||||
log.u_bbr.flex1 = sack_blocks[0].start;
|
||||
log.u_bbr.flex2 = sack_blocks[0].end;
|
||||
}
|
||||
if (num_sack_blks > 1) {
|
||||
log.u_bbr.flex3 = sack_blocks[1].start;
|
||||
log.u_bbr.flex4 = sack_blocks[1].end;
|
||||
}
|
||||
if (num_sack_blks > 2) {
|
||||
log.u_bbr.flex5 = sack_blocks[2].start;
|
||||
log.u_bbr.flex6 = sack_blocks[2].end;
|
||||
}
|
||||
if (num_sack_blks > 3) {
|
||||
log.u_bbr.applimited = sack_blocks[3].start;
|
||||
log.u_bbr.pkts_out = sack_blocks[3].end;
|
||||
}
|
||||
TCP_LOG_EVENTP(tp, NULL,
|
||||
&tp->t_inpcb->inp_socket->so_rcv,
|
||||
&tp->t_inpcb->inp_socket->so_snd,
|
||||
TCP_SACK_FILTER_RES, 0,
|
||||
0, &log, false, &tv);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Decay a count by a set percentage and return the reduced count.
 *
 * The percentage is expressed in thousandths, i.e. 100% = 1000 and
 * 19.3% = 193.  A decay above 1000 is ignored and the count is
 * returned unchanged (we never raise it).
 */
uint32_t
ctf_decay_count(uint32_t count, uint32_t decay)
{
	uint64_t reduction;

	if (decay > 1000) {
		/* We don't raise it */
		return (count);
	}
	/* Use 64 bits so count * decay cannot overflow. */
	reduction = ((uint64_t)count * (uint64_t)decay) / 1000;
	return (count - (uint32_t)reduction);
}
|
@ -38,16 +38,7 @@
|
||||
#define TCP_MSS_ACCT_SIZE 70
|
||||
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
|
||||
|
||||
|
||||
/* Magic flags to tell whats cooking on the pacing wheel */
|
||||
#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
|
||||
#define PACE_TMR_RACK 0x02 /* RACK timer running */
|
||||
#define PACE_TMR_TLP 0x04 /* TLP timer running */
|
||||
#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
|
||||
#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
|
||||
#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
|
||||
#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
|
||||
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
|
||||
#define DUP_ACK_THRESHOLD 3
|
||||
|
||||
/* Magic flags for tracing progress events */
|
||||
#define PROGRESS_DROP 1
|
||||
@ -61,8 +52,66 @@
|
||||
#define USE_RTT_LOW 1
|
||||
#define USE_RTT_AVG 2
|
||||
|
||||
#define PACE_MAX_IP_BYTES 65536
|
||||
#define USECS_IN_SECOND 1000000
|
||||
#define MSEC_IN_SECOND 1000
|
||||
#define MS_IN_USEC 1000
|
||||
#define USEC_TO_MSEC(x) (x / MS_IN_USEC)
|
||||
#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */
|
||||
|
||||
#ifdef _KERNEL
|
||||
/* We have only 7 bits in rack, so assert that the timer mask fits */
|
||||
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
|
||||
#ifdef KERN_TLS
|
||||
uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
|
||||
#endif
|
||||
int
|
||||
ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
|
||||
struct mbuf *m, int has_pkt);
|
||||
int
|
||||
ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
|
||||
uint32_t ctf_outstanding(struct tcpcb *tp);
|
||||
uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
|
||||
int
|
||||
ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
|
||||
struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
|
||||
int32_t * drop_hdrlen, int32_t * ret_val);
|
||||
void
|
||||
ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
|
||||
struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
|
||||
void
|
||||
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
|
||||
struct tcphdr *th, int32_t rstreason, int32_t tlen);
|
||||
void
|
||||
ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
|
||||
|
||||
int
|
||||
ctf_process_rst(struct mbuf *m, struct tcphdr *th,
|
||||
struct socket *so, struct tcpcb *tp);
|
||||
|
||||
void
|
||||
ctf_challenge_ack(struct mbuf *m, struct tcphdr *th,
|
||||
struct tcpcb *tp, int32_t * ret_val);
|
||||
|
||||
int
|
||||
ctf_ts_check(struct mbuf *m, struct tcphdr *th,
|
||||
struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
|
||||
|
||||
void
|
||||
ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
|
||||
|
||||
void
|
||||
ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
|
||||
int32_t rstreason, int32_t tlen);
|
||||
|
||||
uint32_t
|
||||
ctf_fixed_maxseg(struct tcpcb *tp);
|
||||
|
||||
void
|
||||
ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks);
|
||||
|
||||
uint32_t
|
||||
ctf_decay_count(uint32_t count, uint32_t decay_percentage);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@ -102,7 +102,8 @@ struct tcpcb {
|
||||
t_state:4, /* state of this connection */
|
||||
t_idle_reduce : 1,
|
||||
t_delayed_ack: 7, /* Delayed ack variable */
|
||||
bits_spare : 4;
|
||||
t_fin_is_rst: 1, /* Are fin's treated as resets */
|
||||
bits_spare : 3;
|
||||
u_int t_flags;
|
||||
tcp_seq snd_una; /* sent but unacknowledged */
|
||||
tcp_seq snd_max; /* highest sequence number sent;
|
||||
@ -271,6 +272,11 @@ struct tcp_function_block {
|
||||
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
|
||||
struct socket *, struct tcpcb *,
|
||||
int, int, uint8_t);
|
||||
int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
|
||||
int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
|
||||
struct socket *, struct tcpcb *,
|
||||
int, int, uint8_t,
|
||||
int, struct timeval *);
|
||||
void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
|
||||
struct socket *, struct tcpcb *,
|
||||
int, int, uint8_t,
|
||||
|
@ -407,6 +407,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
|
||||
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
|
||||
hw-stamped on port (useful for IEEE 1588
|
||||
and 802.1AS) */
|
||||
#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
|
||||
|
||||
#define M_PROTO1 0x00001000 /* protocol-specific */
|
||||
#define M_PROTO2 0x00002000 /* protocol-specific */
|
||||
|
Loading…
Reference in New Issue
Block a user