This adds the final tweaks to LRO that will now allow me
to add BBR. These changes make it possible to get an
array of timestamped mbufs instead of a single compressed
ack/data segment; BBR uses this to aid its delivery estimates.
We also now (via Drew's suggestions) avoid the expense of
the tcb lookup when no stack has registered to want this feature.
If HPTS is not present, the feature is not present either and you
just get the compressed behavior.

Sponsored by:	Netflix Inc
Differential Revision: https://reviews.freebsd.org/D21127
Randall Stewart 2019-09-06 14:25:41 +00:00
parent 725ee594b4
commit e57b2d0e51
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=351934
4 changed files with 776 additions and 249 deletions
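
For reference, when a driver or LRO supplies an arrival time, it travels in m->m_pkthdr.rcv_tstmp as nanoseconds and is flagged valid by M_TSTMP_LRO (the rack_bbr_common.c hunk below previously open-coded the conversion). A minimal sketch of that nanosecond-to-timeval conversion; the helper name is hypothetical:

static void
lro_tstmp_to_timeval(uint64_t rcv_tstmp, struct timeval *tv)
{
	/* rcv_tstmp carries the arrival time in nanoseconds. */
	tv->tv_sec = rcv_tstmp / 1000000000;            /* whole seconds */
	tv->tv_usec = (rcv_tstmp % 1000000000) / 1000;  /* remainder, ns -> us */
}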

File diff suppressed because it is too large.


@@ -45,6 +45,8 @@ struct lro_entry {
 LIST_ENTRY(lro_entry) hash_next;
 struct mbuf *m_head;
 struct mbuf *m_tail;
+struct mbuf *m_last_mbuf;
+struct mbuf *m_prev_last;
 union {
 	struct ip *ip4;
 	struct ip6_hdr *ip6;
@@ -67,10 +69,22 @@ struct lro_entry {
 uint32_t ack_seq; /* tcp_seq */
 uint32_t tsval;
 uint32_t tsecr;
+uint32_t tcp_tot_p_len; /* TCP payload length of chain */
 uint16_t window;
 uint16_t timestamp; /* flag, not a TCP hdr field. */
+uint16_t need_wakeup;
+uint16_t mbuf_cnt; /* Count of mbufs collected; see note below */
+uint16_t mbuf_appended;
 struct timeval mtime;
 };
+/*
+ * Note: The mbuf_cnt field tracks the number of mbufs added to the m_next
+ * list. Each mbuf counted can carry data, and by definition any inbound
+ * TCP segment will carry an ack value as well. We use this count to tell
+ * us how many ACKs are present for our ack-count threshold. If we exceed
+ * that or the data threshold, we will wake up the endpoint.
+ */
 LIST_HEAD(lro_head, lro_entry);

 #define le_ip4 leip.ip4
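
The note above says each queued mbuf carries an ACK, so mbuf_cnt doubles as an ACK counter compared against an ack-count threshold, while tcp_tot_p_len is compared against a data threshold. A minimal sketch of that wakeup decision, assuming hypothetical limit parameters (the defaults cited in the rack_bbr_common.c comment below are 64k ACKs and 64k bytes of new data):

/*
 * Sketch only: the lro_entry fields are the ones added above;
 * the two limit arguments are hypothetical names.
 */
static int
lro_should_wakeup(const struct lro_entry *le,
    uint16_t ack_limit, uint32_t data_limit)
{
	if (le->mbuf_cnt >= ack_limit)		/* every queued mbuf has an ACK */
		return (1);
	if (le->tcp_tot_p_len >= data_limit)	/* payload bytes in the chain */
		return (1);
	return (0);
}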
@@ -115,6 +129,8 @@ void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
 void tcp_lro_flush_all(struct lro_ctrl *);
 int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
 void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *);
+void tcp_lro_reg_mbufq(void);
+void tcp_lro_dereg_mbufq(void);

 #define TCP_LRO_NO_ENTRIES -2
 #define TCP_LRO_CANNOT -1
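
tcp_lro_reg_mbufq() and tcp_lro_dereg_mbufq() are what let LRO skip the tcb lookup entirely until some stack has registered for queued mbufs, per the commit message; their implementation is in the suppressed tcp_lro.c diff. A sketch of the idea, with a hypothetical counter name:

#include <machine/atomic.h>	/* assumption: FreeBSD kernel atomics */

static volatile u_int tcp_lro_mbufq_users;	/* hypothetical name */

void
tcp_lro_reg_mbufq(void)
{
	atomic_add_int(&tcp_lro_mbufq_users, 1);
}

void
tcp_lro_dereg_mbufq(void)
{
	atomic_subtract_int(&tcp_lro_mbufq_users, 1);
}

/* LRO fast path: only pay for the tcb lookup if a stack wants mbuf queues. */
static __inline int
tcp_lro_mbufq_wanted(void)
{
	return (tcp_lro_mbufq_users != 0);
}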


@@ -159,6 +159,65 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
 }
 #endif
+/*
+ * The function ctf_process_inbound_raw() is used by
+ * transport developers to do the steps needed to
+ * support MBUF Queuing, i.e. the flags in
+ * inp->inp_flags2:
+ *
+ * - INP_SUPPORTS_MBUFQ
+ * - INP_MBUF_QUEUE_READY
+ * - INP_DONT_SACK_QUEUE
+ *
+ * These flags help control how LRO will deliver
+ * packets to the transport. You first set
+ * INP_SUPPORTS_MBUFQ in inp_flags2 to tell the LRO code that you
+ * will gladly take a queue of packets instead of a compressed
+ * single packet. You also set, in your t_fb pointer,
+ * tfb_do_queued_segments to point to ctf_process_inbound_raw.
+ *
+ * This then gets you lists of inbound ACKs/data instead
+ * of a condensed, compressed ACK/data packet. Why would you
+ * want that? It gives you access to the arrival times of the
+ * actual ACKs/data, as recorded by at least LRO and possibly
+ * by the hardware (if the interface card supports that).
+ * In some transport designs this is important, since knowing
+ * the actual time we got the packet is useful information.
+ *
+ * Now there are some interesting caveats that the transport
+ * designer needs to take into account when using this feature.
+ *
+ * 1) It is used with HPTS and pacing; when the pacing timer
+ *    for output fires, it will first call the input path.
+ * 2) Setting INP_MBUF_QUEUE_READY tells LRO to queue
+ *    normal packets: "I am busy pacing out data and
+ *    will process the queued packets before my tfb_tcp_output
+ *    call from pacing." If a non-normal packet arrives (e.g. a SACK),
+ *    you will be awoken immediately.
+ * 3) Finally, you can add INP_DONT_SACK_QUEUE to not even
+ *    be awoken if a SACK arrives. You would do this when
+ *    you are not only running a pacing timer for output
+ *    but a Rack timer as well, i.e. you know you are in recovery
+ *    and are in the process (via the timers) of dealing with
+ *    the loss.
+ *
+ * Now a critical thing you must be aware of here is that the
+ * use of these flags has a far greater scope than just your
+ * typical LRO. Why? Because in the normal compressed
+ * LRO case, at the end of a driver interrupt, all packets are
+ * going to get presented to the transport no matter if there
+ * is one or 100. With the MBUF_QUEUE model this is not true.
+ * You will only be awoken to process the queue of packets when:
+ * a) The flags discussed above allow it.
+ *    <or>
+ * b) You exceed an ack or data limit (by default the
+ *    ack limit is effectively infinite (64k acks) and the data
+ *    limit is 64k of new TCP data).
+ *    <or>
+ * c) The push bit has been set by the peer.
+ */
 int
 ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
 {
@@ -355,13 +414,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int
 	 * have been called (if we can).
 	 */
 	m->m_pkthdr.lro_nsegs = 1;
-	if (m->m_flags & M_TSTMP_LRO) {
-		tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
-		tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
-	} else {
-		/* Should not be should we kassert instead? */
-		tcp_get_usecs(&tv);
-	}
+	tcp_get_usecs(&tv);
 	/* Now what about next packet? */
 	if (m_save || has_pkt)
 		nxt_pkt = 1;
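
Putting the ctf_process_inbound_raw() usage comment above into practice, a sketch of how a transport stack opts in. The flag names and the tfb_do_queued_segments hook come straight from that comment; the surrounding function and function-block skeleton are illustrative assumptions, not the actual BBR/Rack code:

/*
 * Sketch: route queued segments to ctf_process_inbound_raw() and
 * tell LRO this connection prefers mbuf queues over compression.
 */
struct tcp_function_block my_stack_fb = {
	/* ... the stack's other handlers ... */
	.tfb_do_queued_segments = ctf_process_inbound_raw,
};

static void
my_stack_init_mbufq(struct inpcb *inp)
{
	/* One-time: tell LRO we will gladly take a queue of packets. */
	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
}

static void
my_stack_arm_pacing_timer(struct inpcb *inp, int rack_timer_running)
{
	/* While pacing, let normal packets queue without a wakeup... */
	inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
	/* ...and during timed loss recovery, don't wake for SACKs either. */
	if (rack_timer_running)
		inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
}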


@@ -199,6 +199,8 @@ struct pkthdr {
 #define lro_nsegs tso_segsz
 #define csum_phsum PH_per.sixteen[2]
 #define csum_data PH_per.thirtytwo[1]
+#define lro_len PH_per.sixteen[0] /* inbound during LRO */
+#define lro_csum PH_per.sixteen[1] /* inbound during LRO */
 #define pace_thoff PH_loc.sixteen[0]
 #define pace_tlen PH_loc.sixteen[1]
 #define pace_drphdrlen PH_loc.sixteen[2]
@@ -392,7 +394,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
- * to M_PROTO[1-12] and cleared at layer handoff boundaries.
+ * to M_PROTO[1-11] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 #define M_EXT 0x00000001 /* has associated external storage */
@@ -411,18 +413,17 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
 	   and 802.1AS) */
+#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
-#define M_PROTO1 0x00001000 /* protocol-specific */
-#define M_PROTO2 0x00002000 /* protocol-specific */
-#define M_PROTO3 0x00004000 /* protocol-specific */
-#define M_PROTO4 0x00008000 /* protocol-specific */
-#define M_PROTO5 0x00010000 /* protocol-specific */
-#define M_PROTO6 0x00020000 /* protocol-specific */
-#define M_PROTO7 0x00040000 /* protocol-specific */
-#define M_PROTO8 0x00080000 /* protocol-specific */
-#define M_PROTO9 0x00100000 /* protocol-specific */
-#define M_PROTO10 0x00200000 /* protocol-specific */
-#define M_PROTO11 0x00400000 /* protocol-specific */
-#define M_PROTO12 0x00800000 /* protocol-specific */
+#define M_PROTO1 0x00002000 /* protocol-specific */
+#define M_PROTO2 0x00004000 /* protocol-specific */
+#define M_PROTO3 0x00008000 /* protocol-specific */
+#define M_PROTO4 0x00010000 /* protocol-specific */
+#define M_PROTO5 0x00020000 /* protocol-specific */
+#define M_PROTO6 0x00040000 /* protocol-specific */
+#define M_PROTO7 0x00080000 /* protocol-specific */
+#define M_PROTO8 0x00100000 /* protocol-specific */
+#define M_PROTO9 0x00200000 /* protocol-specific */
+#define M_PROTO10 0x00400000 /* protocol-specific */
+#define M_PROTO11 0x00800000 /* protocol-specific */

 #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */
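
As the comment earlier in this hunk says, M_PROTO bits are layer-private and must be cleared at layer handoff boundaries; with M_PROTO12 gone, M_PROTOFLAGS (updated just below) now masks M_PROTO1 through M_PROTO11. A one-step illustration of the handoff rule:

static __inline void
layer_handoff_clear(struct mbuf *m)
{
	/* Drop layer-private flag meaning before the next layer sees the mbuf. */
	m->m_flags &= ~M_PROTOFLAGS;
}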
@@ -431,7 +432,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
 */
 #define M_PROTOFLAGS \
 (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
-M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12)
+M_PROTO9|M_PROTO10|M_PROTO11)

 /*
  * Flags preserved when copying m_pkthdr.
@@ -449,7 +450,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
 #define M_FLAG_PROTOBITS \
 "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
 "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
-"\27M_PROTO11\30M_PROTO12"
+"\27M_PROTO11"
 #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)

 /*