This commit brings in the TCP high precision timer system (tcp_hpts).

It is the foundational work for bringing in both Rack and BBR, which
use hpts to pace out packets. The feature is optional and requires the
TCPHPTS kernel option to be enabled before it becomes active. TCP
modules that use it must ensure that the base component is compiled
into the kernel in which they are loaded.

MFC after:	Never
Sponsored by:	Netflix Inc.
Differential Revision:	https://reviews.freebsd.org/D15020
Author: Randall Stewart
Date:   2018-04-19 13:37:59 +00:00
Commit: 3ee9c3c4eb (parent ba85da072b)
Notes (svn2git, 2020-12-20 02:59:44 +00:00): svn path=/head/; revision=332770
13 changed files with 2735 additions and 35 deletions
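
The feature is compile-time optional: the new tcp_hpts.c source is built only when the TCPHPTS option is set, as the kernel file and option list hunks below show. As a quick, hypothetical illustration (the kernel ident is invented; only the options line comes from this commit), a custom kernel configuration enabling it would contain:

include GENERIC
ident   HPTS_TEST       # hypothetical kernel config name
options TCPHPTS         # compile in the TCP high precision timer system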


@ -4355,6 +4355,7 @@ netinet/tcp_log_buf.c optional tcp_blackbox inet | tcp_blackbox inet6
netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6


@ -218,6 +218,7 @@ SYSVMSG opt_sysvipc.h
SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
UMTX_CHAINS opt_global.h


@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
@ -87,6 +88,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#ifdef TCPHPTS
#include <netinet/tcp_hpts.h>
#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#endif
@ -1224,9 +1228,28 @@ in_pcbrele_rlocked(struct inpcb *inp)
}
return (0);
}
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
#ifdef TCPHPTS
if (inp->inp_in_hpts || inp->inp_in_input) {
struct tcp_hpts_entry *hpts;
/*
* We should not be on the hpts at
* this point in any form. We must
* get the lock to be sure.
*/
hpts = tcp_hpts_lock(inp);
if (inp->inp_in_hpts)
panic("Hpts:%p inp:%p at free still on hpts",
hpts, inp);
mtx_unlock(&hpts->p_mtx);
hpts = tcp_input_lock(inp);
if (inp->inp_in_input)
panic("Hpts:%p inp:%p at free still on input hpts",
hpts, inp);
mtx_unlock(&hpts->p_mtx);
}
#endif
INP_RUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
@ -1255,7 +1278,26 @@ in_pcbrele_wlocked(struct inpcb *inp)
}
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
#ifdef TCPHPTS
if (inp->inp_in_hpts || inp->inp_in_input) {
struct tcp_hpts_entry *hpts;
/*
* We should not be on the hpts at
* this point in any form. We must
* get the lock to be sure.
*/
hpts = tcp_hpts_lock(inp);
if (inp->inp_in_hpts)
panic("Hpts:%p inp:%p at free still on hpts",
hpts, inp);
mtx_unlock(&hpts->p_mtx);
hpts = tcp_input_lock(inp);
if (inp->inp_in_input)
panic("Hpts:%p inp:%p at free still on input hpts",
hpts, inp);
mtx_unlock(&hpts->p_mtx);
}
#endif
INP_WUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);


@ -156,6 +156,7 @@ struct in_conninfo {
* from the global list.
*
* Key:
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
@ -164,6 +165,51 @@ struct in_conninfo {
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
*
* Notes on the tcp_hpts:
*
* First Hpts lock order is
* 1) INP_WLOCK()
* 2) HPTS_LOCK() i.e. hpts->pmtx
*
* To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
* You may check the inp->inp_in_hpts flag without the hpts lock;
* the hpts is the only one that will clear this flag, and it does
* so holding only the hpts lock. This means that when your
* tcp_output() routine tests the inp_in_hpts flag and sees 1,
* the flag may be transitioning to 0 (by the hpts).
* That's ok, since it just means an extra call to tcp_output
* that will most likely find that the call you executed
* (when the mismatch occurred) has put the TCB back
* on the hpts, and it will return. If your
* call did not add the inp back to the hpts then you will either
* over-send or the cwnd will block you from sending more.
*
* Note you should also be holding the INP_WLOCK() when you
* call the remove from the hpts. Usually
* you are doing this either from a timer, where you need and have
* the INP_WLOCK(), or from destroying your TCB, where again
* you should already have the INP_WLOCK().
*
* The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
* inp_input_cpu_set fields are controlled completely by
* the hpts. Do not ever set these. The inp_hpts_cpu_set
* and inp_input_cpu_set fields indicate whether the hpts has
* set up the respective cpu field. If the _set field is 0,
* it is advised to enqueue the packet with the appropriate
* hpts_immediate() call. If the _set field is 1, then
* you may compare the inp_*_cpu field to curcpu and
* may want to insert onto the hpts again if the fields
* are not equal (i.e. you are not on the expected CPU).
*
* A note on inp_hpts_calls and inp_input_calls: these
* flags are set when the hpts calls either the output
* or do_segment routine, respectively. If the routine
* being called wants to use this, it needs to
* clear the flag before returning; the hpts will not
* clear the flag. The flags can be used to tell whether
* the hpts is the caller of the respective
* routine.
*
* A few other notes:
*
@ -190,14 +236,45 @@ struct inpcb {
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_refcount
#define inp_start_zero inp_hpts
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */
uint32_t inp_hpts_request; /* Current hpts request, zero if
* fits in the pacing window (i&b). */
/*
* Note the next fields are protected by a
* different lock (hpts-lock). This means that
* they must correspond in size to the smallest
* protectable bit field (uint8_t on x86, and
* other platforms potentially uint32_t?). Also
* since CPU switches can occur at different times the two
* fields can *not* be collapsed into a single bit field.
*/
#if defined(__amd64__) || defined(__i386__)
volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
volatile uint8_t inp_in_input; /* on input hpts (lock b) */
#else
volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
volatile uint16_t inp_input_cpu; /* Lock (i) */
volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
inp_input_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_input_calls :1, /* (i) from input hpts */
inp_spare_bits2 : 4;
uint8_t inp_spare_byte; /* Compiler hole */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */
uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
@ -638,6 +715,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
/*
* Flags passed to in_pcblookup*() functions.
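
A hypothetical sketch, not part of this commit, of how a pacing stack's output path might honor the rules documented above; the function name and the 1 ms re-insert interval are invented, while the flag, lock assertion and insert macro come from this change (assumed headers: netinet/in_pcb.h, netinet/tcp_var.h, netinet/tcp_hpts.h):

static int
example_stack_output(struct tcpcb *tp)
{
    struct inpcb *inp = tp->t_inpcb;

    INP_WLOCK_ASSERT(inp);      /* inserts and removals require the INP_WLOCK() */
    if (inp->inp_in_hpts) {
        /*
         * Already queued on the wheel.  Even if the hpts is clearing
         * the flag concurrently, the worst case is one extra call
         * into this routine later.
         */
        return (0);
    }
    /* ... emit whatever the pacing window currently allows ... */

    /* Re-insert 1 ms (100 slots) in the future for the next burst. */
    (void)tcp_hpts_insert(inp, HPTS_MS_TO_SLOTS(1));
    return (0);
}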

sys/netinet/tcp_hpts.c (new file, 1964 lines)

File diff suppressed because it is too large.

sys/netinet/tcp_hpts.h (new file, 304 lines)

@ -0,0 +1,304 @@
#ifndef __tcp_hpts_h__
#define __tcp_hpts_h__
/*-
* Copyright (c) 2016-8
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* __FBSDID("$FreeBSD$")
*/
/*
* The hpts uses a 102400-slot wheel. The wheel
* represents time in 10 usec increments (102400 x 10 usec).
* This gives a range of 10 usec - 1024 ms in which to place
* an entry. If the user requests more than
* 1.024 seconds, a remainder is attached and the hpts,
* when it sees the remainder, will re-insert the
* inpcb forward in time from where it is until
* the remainder is zero.
*/
#define NUM_OF_HPTSI_SLOTS 102400
TAILQ_HEAD(hptsh, inpcb);
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
#define HPTS_MS_TO_SLOTS(x) (x * 100)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
#define DEFAULT_HPTS_LOG 3072
/*
* Log flags consist of
* 7f 7f 1 1 bits
* p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
*
* So for example cpu 10, number 10, with
* input active, would show up as:
* p_flags = 0001010 0001010 1 0
* <or>
* p_flags = 0x142a
*/
#define HPTS_HPTS_ACTIVE 0x01
#define HPTS_INPUT_ACTIVE 0x02
#define HPTSLOG_IMMEDIATE 1
#define HPTSLOG_INSERT_NORMAL 2
#define HPTSLOG_INSERT_SLEEPER 3
#define HPTSLOG_SLEEP_AFTER 4
#define HPTSLOG_SLEEP_BEFORE 5
#define HPTSLOG_INSERTED 6
#define HPTSLOG_WAKEUP_HPTS 7
#define HPTSLOG_SETTORUN 8
#define HPTSLOG_HPTSI 9
#define HPTSLOG_TOLONG 10
#define HPTSLOG_AWAKENS 11
#define HPTSLOG_TIMESOUT 12
#define HPTSLOG_SLEEPSET 13
#define HPTSLOG_WAKEUP_INPUT 14
#define HPTSLOG_RESCHEDULE 15
#define HPTSLOG_AWAKE 16
#define HPTSLOG_INP_DONE 17
struct hpts_log {
struct inpcb *inp;
int32_t event;
uint32_t cts;
int32_t line;
uint32_t ticknow;
uint32_t t_paceslot;
uint32_t t_hptsreq;
uint32_t p_curtick;
uint32_t p_prevtick;
uint32_t slot_req;
uint32_t p_on_queue_cnt;
uint32_t p_nxt_slot;
uint32_t p_cur_slot;
uint32_t p_hpts_sleep_time;
uint16_t p_flags;
uint8_t p_onhpts;
uint8_t p_oninput;
uint8_t is_notempty;
};
struct hpts_diag {
uint32_t p_hpts_active;
uint32_t p_nxt_slot;
uint32_t p_cur_slot;
uint32_t slot_req;
uint32_t inp_hptsslot;
uint32_t slot_now;
uint32_t have_slept;
uint32_t hpts_sleep_time;
uint32_t yet_to_sleep;
uint32_t need_new_to;
int32_t co_ret;
uint8_t p_on_min_sleep;
};
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
uint32_t p_hpts_active; /* Flag that says hpts is awake */
uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
* slots that the hpts is running on. */
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
uint32_t enobuf_cnt;
uint16_t p_log_at;
uint8_t p_direct_wake :1, /* boolean */
p_log_wrapped :1, /* boolean */
p_on_min_sleep:1; /* boolean */
uint8_t p_fill;
/* Cache line 0x40 */
void *p_inp;
struct hptsh p_input; /* For the tcp-input runner */
/* Hptsi wheel */
struct hptsh *p_hptss;
struct hpts_log *p_log;
uint32_t p_logsize;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
uint32_t p_delayed_by; /* How much were we delayed by */
/* Cache line 0x80 */
struct sysctl_ctx_list hpts_ctx;
struct sysctl_oid *hpts_root;
struct intr_event *ie;
void *ie_cookie;
uint16_t p_num; /* The hpts number one per cpu */
uint16_t p_cpu; /* The hpts CPU */
/* There is extra space in here */
/* Cache line 0x100 */
struct callout co __aligned(CACHE_LINE_SIZE);
} __aligned(CACHE_LINE_SIZE);
struct tcp_hptsi {
struct proc *rp_proc; /* Process structure for hpts */
struct tcp_hpts_entry **rp_ent; /* Array of hptss */
uint32_t rp_num_hptss; /* Number of hpts threads */
};
#endif
#define HPTS_REMOVE_INPUT 0x01
#define HPTS_REMOVE_OUTPUT 0x02
#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
/*
* When using the hpts, a TCP stack must make sure
* that once the INP_DROPPED flag is applied to an INP
* it does not expect tcp_output() to ever be
* called by the hpts. The hpts will *not* call
* any output (or input) functions on a TCB that
* is in the DROPPED state.
*
* This implies that final ACKs and RSTs that might
* be sent while a TCB is still around must be
* sent from a routine like tcp_respond().
*/
#define DEFAULT_MIN_SLEEP 250 /* Default number of usecs the hpts sleeps;
* this determines the min granularity of the
* hpts. If 0, granularity is 10 usec at
* the cost of more CPU (context switching). */
#ifdef _KERNEL
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
#define tcp_queue_to_hpts_immediate(a)__tcp_queue_to_hpts_immediate(a, __LINE__)
struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
/*
 * To insert a TCB on the hpts you *must* be holding the
 * INP_WLOCK(). The hpts insert code will then acquire
 * the hpts's lock and insert the TCB on the requested
 * slot, possibly waking up the hpts if you are requesting
 * a time earlier than what the hpts is sleeping to (if
 * the hpts is sleeping). You may check the inp->inp_in_hpts
 * flag without the hpts lock; the hpts is the only one
 * that clears this flag, and it does so holding only the hpts
 * lock. This means that when your tcp_output() routine tests
 * the flag and sees 1 (so you won't call output), it may be
 * transitioning to 0 (by the hpts). That is fine, since it just
 * means an extra call to tcp_output that will most likely find
 * that the call you executed (when the mismatch occurred) has
 * put the TCB back on the hpts, and it will return. If your
 * call did not add it back to the hpts then you will either
 * over-send or the cwnd will block you from sending more.
 *
 * Note you should also be holding the INP_WLOCK() when you
 * call the remove from the hpts. Usually
 * you are doing this either from a timer, where you need
 * that INP_WLOCK(), or from destroying your TCB, where again
 * you should already have the INP_WLOCK().
 */
uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
#define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
uint32_t
tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
void
tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked);
int
__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line);
#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
uint16_t tcp_hpts_delayedby(struct inpcb *inp);
void __tcp_set_hpts(struct inpcb *inp, int32_t line);
#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
extern int32_t tcp_min_hptsi_time;
static __inline uint32_t
tcp_tv_to_hptstick(struct timeval *sv)
{
return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
}
static __inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
struct timeval tv;
if (sv == NULL)
sv = &tv;
microuptime(sv);
return (tcp_tv_to_hptstick(sv));
}
static __inline uint32_t
tcp_tv_to_usectick(struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
tcp_tv_to_mssectick(struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
static __inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
struct timeval tvd;
if (tv == NULL)
tv = &tvd;
microuptime(tv);
return (tcp_tv_to_usectick(tv));
}
#endif
#endif
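
To make the wheel arithmetic above concrete, here is a small hypothetical helper, not part of this commit, built only from the macros in this header:

/*
 * Slots are 10 usec wide, so HPTS_USEC_TO_SLOTS(250) = 25 slots,
 * HPTS_MS_TO_SLOTS(40) = 4000 slots, and the full wheel of
 * NUM_OF_HPTSI_SLOTS (102400) slots covers 1.024 seconds.  A longer
 * request does not fit in one pass; the hpts keeps the excess in
 * inp_hpts_request and re-inserts the inpcb until it drains to zero.
 */
static __inline uint32_t
example_usec_to_slots(uint32_t usecs)
{
    return (HPTS_USEC_TO_SLOTS(usecs));     /* (usecs + 9) / 10, rounds up */
}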


@ -2404,7 +2404,7 @@ tcp_addfastpaths(module_t mod, int type, void *data)
err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
if (err) {
printf("Failed to register fastslow module -- err:%d\n", err);
deregister_tcp_functions(&__tcp_fastack);
deregister_tcp_functions(&__tcp_fastack, false, true);
return(err);
}
break;
@ -2412,12 +2412,12 @@ tcp_addfastpaths(module_t mod, int type, void *data)
if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
return(EBUSY);
}
err = deregister_tcp_functions(&__tcp_fastack, true, false);
err = deregister_tcp_functions(&__tcp_fastslow, true, false);
break;
case MOD_UNLOAD:
err = deregister_tcp_functions(&__tcp_fastack);
if (err == EBUSY)
break;
err = deregister_tcp_functions(&__tcp_fastslow);
err = deregister_tcp_functions(&__tcp_fastack, false, true);
err = deregister_tcp_functions(&__tcp_fastslow, false, true);
if (err == EBUSY)
break;
err = 0;


@ -232,6 +232,9 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone);
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
#endif
static int tcp_default_fb_init(struct tcpcb *tp);
static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static void tcp_mtudisc(struct inpcb *, int);
@ -240,18 +243,13 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
static struct tcp_function_block tcp_def_funcblk = {
"default",
tcp_output,
tcp_do_segment,
tcp_default_ctloutput,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
0,
0
.tfb_tcp_block_name = "freebsd",
.tfb_tcp_output = tcp_output,
.tfb_tcp_do_segment = tcp_do_segment,
.tfb_tcp_ctloutput = tcp_default_ctloutput,
.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
.tfb_tcp_fb_init = tcp_default_fb_init,
.tfb_tcp_fb_fini = tcp_default_fb_fini,
};
int t_functions_inited = 0;
@ -328,6 +326,88 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk)
return(rblk);
}
static struct tcp_function_block *
find_and_ref_tcp_default_fb(void)
{
struct tcp_function_block *rblk;
rw_rlock(&tcp_function_lock);
rblk = tcp_func_set_ptr;
refcount_acquire(&rblk->tfb_refcnt);
rw_runlock(&tcp_function_lock);
return (rblk);
}
void
tcp_switch_back_to_default(struct tcpcb *tp)
{
struct tcp_function_block *tfb;
KASSERT(tp->t_fb != &tcp_def_funcblk,
("%s: called by the built-in default stack", __func__));
/*
* Release the old stack. This function will either find a new one
* or panic.
*/
if (tp->t_fb->tfb_tcp_fb_fini != NULL)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
refcount_release(&tp->t_fb->tfb_refcnt);
/*
* Now, we'll find a new function block to use.
* Start by trying the current user-selected
* default, unless this stack is the user-selected
* default.
*/
tfb = find_and_ref_tcp_default_fb();
if (tfb == tp->t_fb) {
refcount_release(&tfb->tfb_refcnt);
tfb = NULL;
}
/* Does the stack accept this connection? */
if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
(*tfb->tfb_tcp_handoff_ok)(tp)) {
refcount_release(&tfb->tfb_refcnt);
tfb = NULL;
}
/* Try to use that stack. */
if (tfb != NULL) {
/* Initialize the new stack. If it succeeds, we are done. */
tp->t_fb = tfb;
if (tp->t_fb->tfb_tcp_fb_init == NULL ||
(*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
return;
/*
* Initialization failed. Release the reference count on
* the stack.
*/
refcount_release(&tfb->tfb_refcnt);
}
/*
* If that wasn't feasible, use the built-in default
* stack which is not allowed to reject anyone.
*/
tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
if (tfb == NULL) {
/* there always should be a default */
panic("Can't refer to tcp_def_funcblk");
}
if (tfb->tfb_tcp_handoff_ok != NULL) {
if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
/* The default stack cannot say no */
panic("Default stack rejects a new session?");
}
}
tp->t_fb = tfb;
if (tp->t_fb->tfb_tcp_fb_init != NULL &&
(*tp->t_fb->tfb_tcp_fb_init)(tp)) {
/* The default stack cannot fail */
panic("Default stack initialization failed");
}
}
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
@ -506,6 +586,89 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
"List TCP function block name-to-ID mappings");
/*
* tfb_tcp_handoff_ok() function for the default stack.
* Note that we'll basically try to take all comers.
*/
static int
tcp_default_handoff_ok(struct tcpcb *tp)
{
return (0);
}
/*
* tfb_tcp_fb_init() function for the default stack.
*
* This handles making sure we have appropriate timers set if you are
* transitioning a socket that has some amount of setup done.
*
* The init() function for the default stack can *never* return non-zero, i.e.
* it is required to always succeed since it is the stack of last resort!
*/
static int
tcp_default_fb_init(struct tcpcb *tp)
{
struct socket *so;
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
("%s: connection %p in unexpected state %d", __func__, tp,
tp->t_state));
/*
* Nothing to do for ESTABLISHED or LISTEN states. And, we don't
* know what to do for unexpected states (which includes TIME_WAIT).
*/
if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
return (0);
/*
* Make sure some kind of transmission timer is set if there is
* outstanding data.
*/
so = tp->t_inpcb->inp_socket;
if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
tcp_timer_active(tp, TT_PERSIST))) {
/*
* If the session has been established and it looks like it should
* be in the persist state, set the persist timer. Otherwise,
* set the retransmit timer.
*/
if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
(int32_t)(tp->snd_nxt - tp->snd_una) <
(int32_t)sbavail(&so->so_snd))
tcp_setpersist(tp);
else
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
}
/* All non-embryonic sessions get a keepalive timer. */
if (!tcp_timer_active(tp, TT_KEEP))
tcp_timer_activate(tp, TT_KEEP,
TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
TP_KEEPINIT(tp));
return (0);
}
/*
* tfb_tcp_fb_fini() function for the default stack.
*
* This changes state as necessary (or prudent) to prepare for another stack
* to assume responsibility for the connection.
*/
static void
tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
return;
}
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@ -732,11 +895,28 @@ register_tcp_functions(struct tcp_function_block *blk, int wait)
return (register_tcp_functions_as_name(blk, NULL, wait));
}
/*
* Deregister all names associated with a function block. This
* functionally removes the function block from use within the system.
*
* When called with a true quiesce argument, mark the function block
* as being removed so no more stacks will use it and determine
* whether the removal would succeed.
*
* When called with a false quiesce argument, actually attempt the
* removal.
*
* When called with a true force argument, attempt to switch all TCBs to
* use the default stack instead of returning EBUSY.
*
* Returns 0 on success (or if the removal would succeed), or an error
* code on failure.
*/
int
deregister_tcp_functions(struct tcp_function_block *blk)
deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
bool force)
{
struct tcp_function *f;
int error=ENOENT;
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
@ -748,22 +928,63 @@ deregister_tcp_functions(struct tcp_function_block *blk)
rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
/* Mark the block so no more stacks can use it. */
blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
/*
* If TCBs are still attached to the stack, attempt to switch them
* to the default stack.
*/
if (force && blk->tfb_refcnt) {
struct inpcb *inp;
struct tcpcb *tp;
VNET_ITERATOR_DECL(vnet_iter);
rw_wunlock(&tcp_function_lock);
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
if (inp->inp_flags & INP_TIMEWAIT) {
INP_WUNLOCK(inp);
continue;
}
tp = intotcpcb(inp);
if (tp == NULL || tp->t_fb != blk) {
INP_WUNLOCK(inp);
continue;
}
tcp_switch_back_to_default(tp);
INP_WUNLOCK(inp);
}
INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
rw_wlock(&tcp_function_lock);
}
if (blk->tfb_refcnt) {
/* Still tcb attached, mark it. */
blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
rw_wunlock(&tcp_function_lock);
/* TCBs still attached. */
rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
if (quiesce) {
/* Skip removal. */
rw_wunlock(&tcp_function_lock);
return (0);
}
/* Remove any function names that map to this function block. */
while (find_tcp_fb_locked(blk, &f) != NULL) {
/* Found */
TAILQ_REMOVE(&t_functions, f, tf_next);
tcp_fb_cnt--;
f->tf_fb = NULL;
free(f, M_TCPFUNCTIONS);
error = 0;
}
rw_wunlock(&tcp_function_lock);
return (error);
return (0);
}
void
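
The quiesce/force contract documented above is what the fastpath module hunk earlier in this diff exercises. A minimal hypothetical sketch of the same pattern for some other loadable stack follows; the handler and function block names are invented (assumed headers: sys/param.h, sys/module.h, sys/malloc.h, netinet/tcp_var.h):

static struct tcp_function_block example_stack_funcblk; /* filled in elsewhere */

static int
example_stack_modevent(module_t mod, int type, void *data)
{
    int err = 0;

    switch (type) {
    case MOD_LOAD:
        err = register_tcp_functions(&example_stack_funcblk, M_WAITOK);
        break;
    case MOD_QUIESCE:
        /* Mark the block unusable and report whether removal would succeed. */
        err = deregister_tcp_functions(&example_stack_funcblk, true, false);
        break;
    case MOD_UNLOAD:
        /* Force any remaining TCBs back to the default stack, then remove. */
        err = deregister_tcp_functions(&example_stack_funcblk, false, true);
        break;
    default:
        err = EOPNOTSUPP;
        break;
    }
    return (err);
}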


@ -852,6 +852,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = rblk;
/*
* XXXrrs this is quite dangerous, it is possible
* for the new function to fail to init. We also
* are not asking if the handoff_is_ok, though at
* the very start that's probably ok.
*/
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}


@ -1521,17 +1521,34 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
*/
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
}
#ifdef TCPHPTS
/* Ensure that we are not on any hpts */
tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
#endif
if (blk->tfb_tcp_fb_init) {
error = (*blk->tfb_tcp_fb_init)(tp);
if (error) {
refcount_release(&blk->tfb_refcnt);
if (tp->t_fb->tfb_tcp_fb_init) {
if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
/* Fall back failed, drop the connection */
INP_WUNLOCK(inp);
soabort(so);
return(error);
}
}
goto err_out;
}
}
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = blk;
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
sopt->sopt_name);
}
#endif
err_out:
INP_WUNLOCK(inp);
return (error);
} else if ((sopt->sopt_dir == SOPT_GET) &&


@ -90,6 +90,8 @@ struct tcpcb {
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
struct mbuf *t_in_pkt; /* head of the input packet queue for the tcp_hpts system */
struct mbuf *t_tail_pkt; /* tail of the input packet queue for the tcp_hpts system */
struct tcp_timer *t_timers; /* All the TCP timers in one struct */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
@ -257,14 +259,19 @@ struct tcptemp {
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
int);
void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
int, int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
/* Optional memory allocation/free routine */
void (*tfb_tcp_fb_init)(struct tcpcb *);
int (*tfb_tcp_fb_init)(struct tcpcb *);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
@ -274,6 +281,7 @@ struct tcp_function_block {
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
void (*tfb_tcp_mtu_chg)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@ -851,9 +859,12 @@ int register_tcp_functions_as_names(struct tcp_function_block *blk,
int wait, const char *names[], int *num_names);
int register_tcp_functions_as_name(struct tcp_function_block *blk,
const char *name, int wait);
int deregister_tcp_functions(struct tcp_function_block *blk);
int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
bool force);
struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
void tcp_switch_back_to_default(struct tcpcb *tp);
struct tcp_function_block *
find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);

sys/sys/kern_prefetch.h (new file, 50 lines)

@ -0,0 +1,50 @@
#ifndef __kern_prefetch_h__
/*-
* Copyright (c) 2016-8
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* __FBSDID("$FreeBSD$")
*/
#define __kern_prefetch_h__
#ifdef _KERNEL
#if defined(__amd64__)
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/pmap.h>
#endif
static __inline void
kern_prefetch(const volatile void *addr, void* before)
{
#if defined(__amd64__)
__asm __volatile("prefetcht1 (%1)":"=rm"(*((int32_t *)before)):"r"(addr):);
#else
__builtin_prefetch(addr);
#endif
}
#endif
#endif
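
A tiny hypothetical usage sketch for kern_prefetch(), not part of this commit. The second argument supplies the inline asm an output operand, so a scratch local is passed here:

static __inline void
example_prefetch(const void *next_obj)
{
    int32_t scratch = 0;

    kern_prefetch(next_obj, &scratch);  /* hint: next_obj will be read soon */
    (void)scratch;
    /* ... keep working on the current object; next_obj should now be warm ... */
}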


@ -196,6 +196,11 @@ struct pkthdr {
#define lro_nsegs tso_segsz
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
#define pace_thoff PH_loc.sixteen[0]
#define pace_tlen PH_loc.sixteen[1]
#define pace_drphdrlen PH_loc.sixteen[2]
#define pace_tos PH_loc.eight[6]
#define pace_lock PH_loc.eight[7]
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is