tcp: HPTS performance enhancements

HPTS drives both rack and bbr, and yet there have been many complaints
about performance. This bit of work restructures hpts to help reduce CPU
overhead. It does this by no longer relying solely on the timer/callout
to drive it; instead, the user return from a system call as well as lro
flushes now drive hpts. The timer becomes a backstop that dynamically
adjusts based on how "late" we are.

Reviewed by: tuexen, glebius
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D31083
Randall Stewart 2021-07-06 15:23:22 -04:00
parent 747a6b7ace
commit d7955cc0ff
8 changed files with 940 additions and 566 deletions
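
As a rough illustration of the new driving model (a userspace sketch with
hypothetical names, not part of this commit; the real entry point is
tcp_run_hpts() and the in-kernel logic differs in detail), the idea is that
any "free" event, a return to userspace or an lro flush, advances the wheel,
and the timer only has to cover the gaps:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only -- not part of this commit. */
static uint64_t last_ran_us;		/* when the wheel last advanced */
static uint64_t backstop_deadline_us;	/* when the timer would fire */

static void
wheel_advance(uint64_t now_us, const char *who)
{
	printf("%4llu us: wheel run by %s (%llu us since last run)\n",
	    (unsigned long long)now_us, who,
	    (unsigned long long)(now_us - last_ran_us));
	last_ran_us = now_us;
	/* Push the backstop out; it fires only if nothing else drives us. */
	backstop_deadline_us = now_us + 250;	/* cf. DEFAULT_MIN_SLEEP */
}

int
main(void)
{
	uint64_t now;

	wheel_advance(0, "init");
	wheel_advance(40, "syscall return");	/* "free" rides ... */
	wheel_advance(90, "lro flush");
	now = 340;				/* quiet period follows */
	if (now >= backstop_deadline_us)
		wheel_advance(now, "backstop timer");
	return (0);
}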

View File

@ -140,6 +140,16 @@ userret(struct thread *td, struct trapframe *frame)
#ifdef HWPMC_HOOKS
if (PMC_THREAD_HAS_SAMPLES(td))
PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL);
#endif
#ifdef TCPHPTS
/*
* @gallatin is adamant that this needs to go here, I
* am not so sure. Running hpts is a lot like
* a lro_flush() that happens while a user process
* is running. But he may know best so I will go
* with his view of accounting. :-)
*/
tcp_run_hpts();
#endif
/*
* Let the scheduler adjust our priority etc.

View File

@ -258,6 +258,7 @@ struct inpcb {
volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
volatile uint16_t inp_irq_cpu; /* Set by LRO on behalf of, or by, the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
@ -266,7 +267,8 @@ struct inpcb {
inp_input_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_input_calls :1, /* (i) from input hpts */
inp_spare_bits2 : 4;
inp_irq_cpu_set :1, /* (i) from LRO/Driver */
inp_spare_bits2 : 3;
uint8_t inp_numa_domain; /* numa domain */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */

File diff suppressed because it is too large

View File

@ -44,7 +44,7 @@
TAILQ_HEAD(hptsh, inpcb);
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
#define HPTS_TICKS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
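
To make the slot granularity above concrete, a standalone snippet (local
copies of the macros, for illustration only, not part of the diff) works out
to:

#include <stdio.h>

/* Local copies of the conversions above, for illustration only. */
#define HPTS_TICKS_PER_SLOT	10		/* 10 usec per wheel slot */
#define HPTS_MS_TO_SLOTS(x)	((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x)	((x+9) /10)

int
main(void)
{
	/* 1 ms is 100 slots of 10 usec, plus the extra slot the macro adds. */
	printf("1 ms   -> %d slots\n", HPTS_MS_TO_SLOTS(1));	/* 101 */
	/* usec values round up to whole slots. */
	printf("250 us -> %d slots\n", HPTS_USEC_TO_SLOTS(250));/* 25 */
	printf("5 us   -> %d slots\n", HPTS_USEC_TO_SLOTS(5));	/* 1 */
	return (0);
}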
@ -56,7 +56,7 @@ struct hpts_diag {
uint32_t p_nxt_slot; /* bbr->flex1 x */
uint32_t p_cur_slot; /* bbr->flex2 x */
uint32_t p_prev_slot; /* bbr->delivered */
uint32_t p_runningtick; /* bbr->inflight */
uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
uint32_t slot_remaining; /* bbr->flex5 x */
@ -64,8 +64,8 @@ struct hpts_diag {
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
uint32_t need_new_to; /* bbr->flex6 x */
uint32_t wheel_tick; /* bbr->bw_inuse x */
uint32_t maxticks; /* bbr->delRate x */
uint32_t wheel_slot; /* bbr->bw_inuse x */
uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
uint32_t p_curtick; /* upper bbr->cur_del_rate */
@ -83,16 +83,20 @@ struct hpts_diag {
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
#define DEFAULT_CONNECTION_THESHOLD 100
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
struct timeval p_mysleep; /* Our min sleep time */
uint64_t syscall_cnt;
uint64_t sleeping; /* What the actual sleep was (if sleeping) */
uint16_t p_hpts_active; /* Flag that says hpts is awake */
uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
uint32_t p_runningtick; /* Current tick we are at if we are running */
uint32_t p_runningslot; /* Current slot we are at if we are running */
uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
@ -101,7 +105,8 @@ struct tcp_hpts_entry {
uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
p_on_min_sleep:1, /* boolean */
p_avail:6;
p_hpts_wake_scheduled:1, /* boolean */
p_avail:5;
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
@ -109,8 +114,6 @@ struct tcp_hpts_entry {
/* Hptsi wheel */
struct hptsh *p_hptss;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
uint32_t overidden_sleep; /* what was overridden by min-sleep for logging */
@ -134,6 +137,7 @@ struct tcp_hpts_entry {
struct tcp_hptsi {
struct proc *rp_proc; /* Process structure for hpts */
struct tcp_hpts_entry **rp_ent; /* Array of hptss */
uint32_t *cts_last_ran;
uint32_t rp_num_hptss; /* Number of hpts threads */
};
@ -155,10 +159,37 @@ struct tcp_hptsi {
* be sent when a TCB is still around must be
* sent from a routine like tcp_respond().
*/
#define LOWEST_SLEEP_ALLOWED 50
#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
* this determines min granularity of the
* hpts. If 0, granularity is 10useconds at
* the cost of more CPU (context switching). */
* hpts. If 1, granularity is 10useconds at
* the cost of more CPU (context switching).
* Note: do not set this to 0.
*/
#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */
/* No. of connections when we start aligning to the cpu from syscalls */
#define OLDEST_THRESHOLD 1200
/* Thresholds for raising/lowering sleep */
#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
/**
*
* Dynamic adjustment of sleeping times is done in "new" mode
* where we are depending on syscall returns and lro returns
* to push hpts forward mainly and the timer is only a backstop.
*
* When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
* then we do a dynamic adjustment on the time we sleep.
* Our threshold is the lateness of the first client served (in ticks):
* if it is greater than or equal to ticks_indicate_less_sleep
* (1000 ticks or 10ms), the actual sleep time is adjusted down
* by 50%. If the ticks_ran is less than ticks_indicate_more_sleep
* (100 ticks or 1000 usecs), the sleep time is adjusted up.
*
*/
#ifdef _KERNEL
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
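
The halve/double policy described in the comment above can be sketched in
userspace roughly as follows (illustrative only; the constants mirror the
defines above, but the real adjustment is done in tcp_hpts.c and may differ
in detail):

#include <stdint.h>
#include <stdio.h>

#define DYN_MIN_SLEEP		250	/* usec, cf. DYNAMIC_MIN_SLEEP */
#define DYN_MAX_SLEEP		100000	/* usec, cf. DYNAMIC_MAX_SLEEP */
#define MORE_SLEEP_THRESH	100	/* ticks, cf. TICKS_INDICATE_MORE_SLEEP */
#define LESS_SLEEP_THRESH	1000	/* ticks, cf. TICKS_INDICATE_LESS_SLEEP */

/* Adjust the backstop sleep after a wheel run that covered ticks_ran ticks. */
static uint32_t
adjust_sleep(uint32_t sleep_usec, uint32_t ticks_ran)
{
	if (ticks_ran >= LESS_SLEEP_THRESH) {
		/* We were late / had lots of work: sleep less. */
		sleep_usec /= 2;
		if (sleep_usec < DYN_MIN_SLEEP)
			sleep_usec = DYN_MIN_SLEEP;
	} else if (ticks_ran < MORE_SLEEP_THRESH) {
		/* Hardly anything to do: the backstop can sleep longer. */
		sleep_usec *= 2;
		if (sleep_usec > DYN_MAX_SLEEP)
			sleep_usec = DYN_MAX_SLEEP;
	}
	return (sleep_usec);
}

int
main(void)
{
	printf("%u\n", adjust_sleep(1000, 2000));	/* busy -> 500 */
	printf("%u\n", adjust_sleep(1000, 50));		/* idle -> 2000 */
	printf("%u\n", adjust_sleep(1000, 500));	/* in between -> 1000 */
	return (0);
}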
@ -215,12 +246,48 @@ void __tcp_set_hpts(struct inpcb *inp, int32_t line);
void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
void tcp_run_hpts(void);
uint16_t hpts_random_cpu(struct inpcb *inp);
extern int32_t tcp_min_hptsi_time;
#endif /* _KERNEL */
/*
* The following functions should also be available
* to userspace as well.
*/
static __inline uint32_t
tcp_tv_to_hptstick(struct timeval *sv)
tcp_tv_to_hptstick(const struct timeval *sv)
{
return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
static __inline uint32_t
tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
static __inline uint64_t
tcp_tv_to_lusectick(const struct timeval *sv)
{
return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
#ifdef _KERNEL
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
static __inline uint32_t
@ -234,24 +301,6 @@ tcp_gethptstick(struct timeval *sv)
return (tcp_tv_to_hptstick(sv));
}
static __inline uint32_t
tcp_tv_to_usectick(struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
tcp_tv_to_mssectick(struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
static __inline uint32_t
tcp_get_usecs(struct timeval *tv)
{

View File

@ -107,6 +107,11 @@ SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
"default number of LRO entries");
static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
"Number of interrups in a row on the same CPU that will make us declare an 'affinity' cpu?");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
&tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
@ -631,12 +636,13 @@ tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
}
log.u_bbr.inflight = th_seq;
log.u_bbr.delivered = th_ack;
log.u_bbr.timeStamp = cts;
log.u_bbr.epoch = le->next_seq;
log.u_bbr.delivered = th_ack;
log.u_bbr.lt_epoch = le->ack_seq;
log.u_bbr.pacing_gain = th_win;
log.u_bbr.cwnd_gain = le->window;
log.u_bbr.lost = curcpu;
log.u_bbr.cur_del_rate = (uintptr_t)m;
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
bintime2timeval(&lc->lro_last_queue_time, &btv);
@ -1273,7 +1279,10 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
inp->inp_irq_cpu = lc->lro_last_cpu;
inp->inp_irq_cpu_set = 1;
}
/* Check if the transport doesn't support the needed optimizations. */
if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
INP_WUNLOCK(inp);
@ -1445,7 +1454,17 @@ tcp_lro_flush_all(struct lro_ctrl *lc)
/* check if no mbufs to flush */
if (lc->lro_mbuf_count == 0)
goto done;
if (lc->lro_cpu_is_set == 0) {
if (lc->lro_last_cpu == curcpu) {
lc->lro_cnt_of_same_cpu++;
/* Have we reached the threshold to declare a cpu? */
if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
lc->lro_cpu_is_set = 1;
} else {
lc->lro_last_cpu = curcpu;
lc->lro_cnt_of_same_cpu = 0;
}
}
CURVNET_SET(lc->ifp->if_vnet);
/* get current time */
@ -1486,6 +1505,9 @@ tcp_lro_flush_all(struct lro_ctrl *lc)
/* flush active streams */
tcp_lro_rx_done(lc);
#ifdef TCPHPTS
tcp_run_hpts();
#endif
lc->lro_mbuf_count = 0;
}

View File

@ -56,6 +56,11 @@
#define TSTMP_LRO 0x0100
#define TSTMP_HDWR 0x0200
#define HAS_TSTMP 0x0400
/*
* Default number of interrupts on the same cpu in a row
* that will cause us to declare an "affinity cpu".
*/
#define TCP_LRO_CPU_DECLARATION_THRESH 50
struct inpcb;
@ -162,12 +167,15 @@ struct lro_ctrl {
unsigned lro_mbuf_count;
unsigned lro_mbuf_max;
unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */
unsigned short lro_cpu; /* Guess at the cpu we have affinity to */
unsigned lro_length_lim; /* max len of aggregated data */
u_long lro_hashsz;
uint32_t lro_last_cpu;
uint32_t lro_cnt_of_same_cpu;
struct lro_head *lro_hash;
struct lro_head lro_active;
struct lro_head lro_free;
uint8_t lro_cpu_is_set; /* Flag to say it's ok to set the CPU on the inp */
};
struct tcp_ackent {

View File

@ -2429,10 +2429,10 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
log.u_bbr.inflight = diag->p_runningtick;
log.u_bbr.bw_inuse = diag->wheel_tick;
log.u_bbr.inflight = diag->p_runningslot;
log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.delRate = diag->maxticks;
log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;

View File

@ -5634,11 +5634,11 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
log.u_bbr.inflight = diag->p_runningtick;
log.u_bbr.bw_inuse = diag->wheel_tick;
log.u_bbr.inflight = diag->p_runningslot;
log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
log.u_bbr.delRate = diag->maxticks;
log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
@ -5732,22 +5732,22 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* on the clock. We always have a min
* 10 slots (10 x 10 i.e. 100 usecs).
*/
if (slot <= HPTS_TICKS_PER_USEC) {
if (slot <= HPTS_TICKS_PER_SLOT) {
/* We gain delay */
rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
slot = HPTS_TICKS_PER_USEC;
rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
slot = HPTS_TICKS_PER_SLOT;
} else {
/* We take off some */
rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
slot = HPTS_TICKS_PER_USEC;
rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
slot = HPTS_TICKS_PER_SLOT;
}
} else {
slot -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
if (slot < HPTS_TICKS_PER_USEC) {
rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
slot = HPTS_TICKS_PER_USEC;
if (slot < HPTS_TICKS_PER_SLOT) {
rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
slot = HPTS_TICKS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;