tcp: HPTS performance enhancements

HPTS drives both rack and bbr, and yet there have been many complaints
about performance. This bit of work restructures hpts to help reduce CPU
overhead. It does this by no longer relying solely on the timer/callout
to drive it; instead, the user return from a system call as well as lro
flushes now drive hpts. The timer becomes a backstop that dynamically
adjusts based on how "late" we are.

Reviewed by: tuexen, glebius
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D31083
Randall Stewart 2021-07-06 15:23:22 -04:00
parent 747a6b7ace
commit d7955cc0ff
8 changed files with 940 additions and 566 deletions
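
As a rough illustration of the new driving model (a userspace sketch with
hypothetical names, not part of this commit; the real entry point is
tcp_run_hpts() and the in-kernel logic differs in detail), the idea is that
any "free" event, a return to userspace or an lro flush, advances the wheel,
and the timer only has to cover the gaps:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only -- not part of this commit. */
static uint64_t last_ran_us;		/* when the wheel last advanced */
static uint64_t backstop_deadline_us;	/* when the timer would fire */

static void
wheel_advance(uint64_t now_us, const char *who)
{
	printf("%4llu us: wheel run by %s (%llu us since last run)\n",
	    (unsigned long long)now_us, who,
	    (unsigned long long)(now_us - last_ran_us));
	last_ran_us = now_us;
	/* Push the backstop out; it fires only if nothing else drives us. */
	backstop_deadline_us = now_us + 250;	/* cf. DEFAULT_MIN_SLEEP */
}

int
main(void)
{
	uint64_t now;

	wheel_advance(0, "init");
	wheel_advance(40, "syscall return");	/* "free" rides ... */
	wheel_advance(90, "lro flush");
	now = 340;				/* quiet period follows */
	if (now >= backstop_deadline_us)
		wheel_advance(now, "backstop timer");
	return (0);
}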

View File

@ -140,6 +140,16 @@ userret(struct thread *td, struct trapframe *frame)
#ifdef HWPMC_HOOKS
if (PMC_THREAD_HAS_SAMPLES(td))
PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL);
#endif
#ifdef TCPHPTS
/*
* @gallatin is adamant that this needs to go here, I
* am not so sure. Running hpts is a lot like
* a lro_flush() that happens while a user process
* is running. But he may know best so I will go
* with his view of accounting. :-)
*/
tcp_run_hpts();
#endif
/*
* Let the scheduler adjust our priority etc.

View File

@ -258,6 +258,7 @@ struct inpcb {
volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
volatile uint16_t inp_irq_cpu; /* Set by LRO on behalf of, or by, the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
@ -266,7 +267,8 @@ struct inpcb {
inp_input_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_input_calls :1, /* (i) from input hpts */
inp_spare_bits2 : 4;
inp_irq_cpu_set :1, /* (i) from LRO/Driver */
inp_spare_bits2 : 3;
uint8_t inp_numa_domain; /* numa domain */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */

File diff suppressed because it is too large

View File

@ -44,7 +44,7 @@
TAILQ_HEAD(hptsh, inpcb);
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
#define HPTS_TICKS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
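
To make the slot granularity above concrete, a standalone snippet (local
copies of the macros, for illustration only, not part of the diff) works out
to:

#include <stdio.h>

/* Local copies of the conversions above, for illustration only. */
#define HPTS_TICKS_PER_SLOT	10		/* 10 usec per wheel slot */
#define HPTS_MS_TO_SLOTS(x)	((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x)	((x+9) /10)

int
main(void)
{
	/* 1 ms is 100 slots of 10 usec, plus the extra slot the macro adds. */
	printf("1 ms   -> %d slots\n", HPTS_MS_TO_SLOTS(1));	/* 101 */
	/* usec values round up to whole slots. */
	printf("250 us -> %d slots\n", HPTS_USEC_TO_SLOTS(250));/* 25 */
	printf("5 us   -> %d slots\n", HPTS_USEC_TO_SLOTS(5));	/* 1 */
	return (0);
}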
@ -56,7 +56,7 @@ struct hpts_diag {
uint32_t p_nxt_slot; /* bbr->flex1 x */
uint32_t p_cur_slot; /* bbr->flex2 x */
uint32_t p_prev_slot; /* bbr->delivered */
uint32_t p_runningtick; /* bbr->inflight */
uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
uint32_t slot_remaining; /* bbr->flex5 x */
@ -64,8 +64,8 @@ struct hpts_diag {
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
uint32_t need_new_to; /* bbr->flex6 x */
uint32_t wheel_tick; /* bbr->bw_inuse x */
uint32_t maxticks; /* bbr->delRate x */
uint32_t wheel_slot; /* bbr->bw_inuse x */
uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
uint32_t p_curtick; /* upper bbr->cur_del_rate */
@ -83,16 +83,20 @@ struct hpts_diag {
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
#define DEFAULT_CONNECTION_THESHOLD 100
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
struct timeval p_mysleep; /* Our min sleep time */
uint64_t syscall_cnt;
uint64_t sleeping; /* What the actual sleep was (if sleeping) */
uint16_t p_hpts_active; /* Flag that says hpts is awake */
uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
uint32_t p_runningtick; /* Current tick we are at if we are running */
uint32_t p_runningslot; /* Current slot we are at if we are running */
uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
@ -101,7 +105,8 @@ struct tcp_hpts_entry {
uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
p_on_min_sleep:1, /* boolean */
p_avail:6;
p_hpts_wake_scheduled:1, /* boolean */
p_avail:5;
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
@ -109,8 +114,6 @@ struct tcp_hpts_entry {
/* Hptsi wheel */
struct hptsh *p_hptss;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
uint32_t overidden_sleep; /* what was overridden by min-sleep for logging */
@ -134,6 +137,7 @@ struct tcp_hpts_entry {
struct tcp_hptsi {
struct proc *rp_proc; /* Process structure for hpts */
struct tcp_hpts_entry **rp_ent; /* Array of hptss */
uint32_t *cts_last_ran;
uint32_t rp_num_hptss; /* Number of hpts threads */
};
@ -155,10 +159,37 @@ struct tcp_hptsi {
* be sent when a TCB is still around must be
* sent from a routine like tcp_respond().
*/
#define LOWEST_SLEEP_ALLOWED 50
#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
* this determines min granularity of the
* hpts. If 0, granularity is 10useconds at
* the cost of more CPU (context switching). */
* hpts. If 1, granularity is 10useconds at
* the cost of more CPU (context switching).
* Note: do not set this to 0.
*/
#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */
/* No. of connections when we start aligning to the cpu from syscalls */
#define OLDEST_THRESHOLD 1200
/* Thresholds for raising/lowering sleep */
#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
/**
*
* Dynamic adjustment of sleeping times is done in "new" mode
* where we are depending on syscall returns and lro returns
* to push hpts forward mainly and the timer is only a backstop.
*
* When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
* then we do a dynamic adjustment on the time we sleep.
* Our threshold is the lateness of the first client served (in ticks):
* if it is greater than or equal to ticks_indicate_less_sleep
* (1000 ticks or 10ms), the actual sleep time is adjusted down
* by 50%. If the ticks_ran is less than ticks_indicate_more_sleep
* (100 ticks or 1000 usecs), the sleep time is adjusted up.
*
*/
#ifdef _KERNEL
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
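
The halve/double policy described in the comment above can be sketched in
userspace roughly as follows (illustrative only; the constants mirror the
defines above, but the real adjustment is done in tcp_hpts.c and may differ
in detail):

#include <stdint.h>
#include <stdio.h>

#define DYN_MIN_SLEEP		250	/* usec, cf. DYNAMIC_MIN_SLEEP */
#define DYN_MAX_SLEEP		100000	/* usec, cf. DYNAMIC_MAX_SLEEP */
#define MORE_SLEEP_THRESH	100	/* ticks, cf. TICKS_INDICATE_MORE_SLEEP */
#define LESS_SLEEP_THRESH	1000	/* ticks, cf. TICKS_INDICATE_LESS_SLEEP */

/* Adjust the backstop sleep after a wheel run that covered ticks_ran ticks. */
static uint32_t
adjust_sleep(uint32_t sleep_usec, uint32_t ticks_ran)
{
	if (ticks_ran >= LESS_SLEEP_THRESH) {
		/* We were late / had lots of work: sleep less. */
		sleep_usec /= 2;
		if (sleep_usec < DYN_MIN_SLEEP)
			sleep_usec = DYN_MIN_SLEEP;
	} else if (ticks_ran < MORE_SLEEP_THRESH) {
		/* Hardly anything to do: the backstop can sleep longer. */
		sleep_usec *= 2;
		if (sleep_usec > DYN_MAX_SLEEP)
			sleep_usec = DYN_MAX_SLEEP;
	}
	return (sleep_usec);
}

int
main(void)
{
	printf("%u\n", adjust_sleep(1000, 2000));	/* busy -> 500 */
	printf("%u\n", adjust_sleep(1000, 50));		/* idle -> 2000 */
	printf("%u\n", adjust_sleep(1000, 500));	/* in between -> 1000 */
	return (0);
}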
@ -215,12 +246,48 @@ void __tcp_set_hpts(struct inpcb *inp, int32_t line);
void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
void tcp_run_hpts(void);
uint16_t hpts_random_cpu(struct inpcb *inp);
extern int32_t tcp_min_hptsi_time;
#endif /* _KERNEL */
/*
* The following functions should also be available
* to userspace as well.
*/
static __inline uint32_t
tcp_tv_to_hptstick(struct timeval *sv)
tcp_tv_to_hptstick(const struct timeval *sv)
{
return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
static __inline uint32_t
tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
static __inline uint64_t
tcp_tv_to_lusectick(const struct timeval *sv)
{
return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
#ifdef _KERNEL
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
static __inline uint32_t
@ -234,24 +301,6 @@ tcp_gethptstick(struct timeval *sv)
return (tcp_tv_to_hptstick(sv));
}
static __inline uint32_t
tcp_tv_to_usectick(struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
tcp_tv_to_mssectick(struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
static __inline uint32_t
tcp_get_usecs(struct timeval *tv)
{

View File

@ -107,6 +107,11 @@ SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
"default number of LRO entries");
static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
"Number of interrups in a row on the same CPU that will make us declare an 'affinity' cpu?");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
&tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
@ -631,12 +636,13 @@ tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
}
log.u_bbr.inflight = th_seq;
log.u_bbr.delivered = th_ack;
log.u_bbr.timeStamp = cts;
log.u_bbr.epoch = le->next_seq;
log.u_bbr.delivered = th_ack;
log.u_bbr.lt_epoch = le->ack_seq;
log.u_bbr.pacing_gain = th_win;
log.u_bbr.cwnd_gain = le->window;
log.u_bbr.lost = curcpu;
log.u_bbr.cur_del_rate = (uintptr_t)m;
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
bintime2timeval(&lc->lro_last_queue_time, &btv);
@ -1273,7 +1279,10 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
inp->inp_irq_cpu = lc->lro_last_cpu;
inp->inp_irq_cpu_set = 1;
}
/* Check if the transport doesn't support the needed optimizations. */
if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
INP_WUNLOCK(inp);
@ -1445,7 +1454,17 @@ tcp_lro_flush_all(struct lro_ctrl *lc)
/* check if no mbufs to flush */
if (lc->lro_mbuf_count == 0)
goto done;
if (lc->lro_cpu_is_set == 0) {
if (lc->lro_last_cpu == curcpu) {
lc->lro_cnt_of_same_cpu++;
/* Have we reached the threshold to declare a cpu? */
if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
lc->lro_cpu_is_set = 1;
} else {
lc->lro_last_cpu = curcpu;
lc->lro_cnt_of_same_cpu = 0;
}
}
CURVNET_SET(lc->ifp->if_vnet);
/* get current time */
@ -1486,6 +1505,9 @@ tcp_lro_flush_all(struct lro_ctrl *lc)
/* flush active streams */
tcp_lro_rx_done(lc);
#ifdef TCPHPTS
tcp_run_hpts();
#endif
lc->lro_mbuf_count = 0;
}

View File

@ -56,6 +56,11 @@
#define TSTMP_LRO 0x0100
#define TSTMP_HDWR 0x0200
#define HAS_TSTMP 0x0400
/*
* Default number of interrupts on the same cpu in a row
* that will cause us to declare an "affinity cpu".
*/
#define TCP_LRO_CPU_DECLARATION_THRESH 50
struct inpcb;
@ -162,12 +167,15 @@ struct lro_ctrl {
unsigned lro_mbuf_count;
unsigned lro_mbuf_max;
unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */
unsigned short lro_cpu; /* Guess at the cpu we have affinity to */
unsigned lro_length_lim; /* max len of aggregated data */
u_long lro_hashsz;
uint32_t lro_last_cpu;
uint32_t lro_cnt_of_same_cpu;
struct lro_head *lro_hash;
struct lro_head lro_active;
struct lro_head lro_free;
uint8_t lro_cpu_is_set; /* Flag to say it's ok to set the CPU on the inp */
};
struct tcp_ackent {

View File

@ -2429,10 +2429,10 @@ bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
log.u_bbr.inflight = diag->p_runningtick;
log.u_bbr.bw_inuse = diag->wheel_tick;
log.u_bbr.inflight = diag->p_runningslot;
log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.delRate = diag->maxticks;
log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;

View File

@ -5634,11 +5634,11 @@ rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
log.u_bbr.inflight = diag->p_runningtick;
log.u_bbr.bw_inuse = diag->wheel_tick;
log.u_bbr.inflight = diag->p_runningslot;
log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
log.u_bbr.delRate = diag->maxticks;
log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
@ -5732,22 +5732,22 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* on the clock. We always have a min
* 10 slots (10 x 10 i.e. 100 usecs).
*/
if (slot <= HPTS_TICKS_PER_USEC) {
if (slot <= HPTS_TICKS_PER_SLOT) {
/* We gain delay */
rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
slot = HPTS_TICKS_PER_USEC;
rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
slot = HPTS_TICKS_PER_SLOT;
} else {
/* We take off some */
rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
slot = HPTS_TICKS_PER_USEC;
rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
slot = HPTS_TICKS_PER_SLOT;
}
} else {
slot -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
if (slot < HPTS_TICKS_PER_USEC) {
rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
slot = HPTS_TICKS_PER_USEC;
if (slot < HPTS_TICKS_PER_SLOT) {
rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
slot = HPTS_TICKS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;