This adds the final tweaks to LRO that will now allow me
to add BBR. These changes make it possible to get an array of timestamps instead of a compressed ack/data segment. BBR uses this to aid with its delivery estimates. Also (per Drew's suggestion), we now skip the expense of the tcb lookup if no stack has registered for this feature. If HPTS is not present, the feature is not present either and you just get the compressed behavior. Sponsored by: Netflix Inc Differential Revision: https://reviews.freebsd.org/D21127
This commit is contained in:
parent
725ee594b4
commit
e57b2d0e51
File diff suppressed because it is too large
Load Diff
@ -45,6 +45,8 @@ struct lro_entry {
|
||||
LIST_ENTRY(lro_entry) hash_next;
|
||||
struct mbuf *m_head;
|
||||
struct mbuf *m_tail;
|
||||
struct mbuf *m_last_mbuf;
|
||||
struct mbuf *m_prev_last;
|
||||
union {
|
||||
struct ip *ip4;
|
||||
struct ip6_hdr *ip6;
|
||||
@ -67,10 +69,22 @@ struct lro_entry {
|
||||
uint32_t ack_seq; /* tcp_seq */
|
||||
uint32_t tsval;
|
||||
uint32_t tsecr;
|
||||
uint32_t tcp_tot_p_len; /* TCP payload length of chain */
|
||||
uint16_t window;
|
||||
uint16_t timestamp; /* flag, not a TCP hdr field. */
|
||||
uint16_t need_wakeup;
|
||||
uint16_t mbuf_cnt; /* Count of mbufs collected see note */
|
||||
uint16_t mbuf_appended;
|
||||
struct timeval mtime;
|
||||
};
|
||||
/*
|
||||
* Note: The mbuf_cnt field tracks our number of mbufs added to the m_next
|
||||
* list. Each mbuf counted can have data and of course it will
|
||||
* have an ack as well (by definition any inbound tcp segment will
|
||||
* have an ack value). We use this count to tell us how many ACK's
|
||||
* are present for our ack-count threshold. If we exceed that or
|
||||
* the data threshold we will wake up the endpoint.
|
||||
*/
|
||||
LIST_HEAD(lro_head, lro_entry);
|
||||
|
||||
#define le_ip4 leip.ip4
|
||||
@ -115,6 +129,8 @@ void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
|
||||
void tcp_lro_flush_all(struct lro_ctrl *);
|
||||
int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
|
||||
void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *);
|
||||
void tcp_lro_reg_mbufq(void);
|
||||
void tcp_lro_dereg_mbufq(void);
|
||||
|
||||
#define TCP_LRO_NO_ENTRIES -2
|
||||
#define TCP_LRO_CANNOT -1
|
||||
|
@ -159,6 +159,65 @@ again:
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* The function ctf_process_inbound_raw() is used by
|
||||
* transport developers to do the steps needed to
|
||||
* support MBUF Queuing i.e. the flags in
|
||||
* inp->inp_flags2:
|
||||
*
|
||||
* - INP_SUPPORTS_MBUFQ
|
||||
* - INP_MBUF_QUEUE_READY
|
||||
* - INP_DONT_SACK_QUEUE
|
||||
*
|
||||
* These flags help control how LRO will deliver
|
||||
* packets to the transport. You first set in inp_flags2
|
||||
* the INP_SUPPORTS_MBUFQ to tell the LRO code that you
|
||||
* will gladly take a queue of packets instead of a compressed
|
||||
* single packet. You also set in your t_fb pointer the
|
||||
* tfb_do_queued_segments to point to ctf_process_inbound_raw.
|
||||
*
|
||||
* This then gets you lists of inbound ACK's/Data instead
|
||||
* of a condensed compressed ACK/DATA packet. Why would you
|
||||
* want that? This will get you access to all the arrival
|
||||
* times of at least LRO and possibly at the Hardware (if
|
||||
* the interface card supports that) of the actual ACK/DATA.
|
||||
* In some transport designs this is important since knowing
|
||||
* the actual time we got the packet is useful information.
|
||||
*
|
||||
* Now there are some interesting Caveats that the transport
|
||||
* designer needs to take into account when using this feature.
|
||||
*
|
||||
* 1) It is used with HPTS and pacing, when the pacing timer
|
||||
* for output calls it will first call the input.
|
||||
* 2) When you set INP_MBUF_QUEUE_READY this tells LRO to
|
||||
* queue normal packets, I am busy pacing out data and
|
||||
* will process the queued packets before my tfb_tcp_output
|
||||
* call from pacing. If a non-normal packet arrives, (e.g. sack)
|
||||
* you will be awoken immediately.
|
||||
* 3) Finally you can add the INP_DONT_SACK_QUEUE to not even
|
||||
* be awoken if a SACK has arrived. You would do this when
|
||||
* you were not only running a pacing for output timer
|
||||
* but a Rack timer as well i.e. you know you are in recovery
|
||||
* and are in the process (via the timers) of dealing with
|
||||
* the loss.
|
||||
*
|
||||
* Now a critical thing you must be aware of here is that the
|
||||
* use of the flags has a far greater scope than just your
|
||||
* typical LRO. Why? Well, that's because in the normal compressed
|
||||
* LRO case at the end of a driver interrupt all packets are going
|
||||
* to get presented to the transport no matter if there is one
|
||||
* or 100. With the MBUF_QUEUE model, this is not true. You will
|
||||
* only be awoken to process the queue of packets when:
|
||||
* a) The flags discussed above allow it.
|
||||
* <or>
|
||||
* b) You exceed an ack or data limit (by default the
|
||||
* ack limit is infinity (64k acks) and the data
|
||||
* limit is 64k of new TCP data)
|
||||
* <or>
|
||||
* c) The push bit has been set by the peer
|
||||
*/
|
||||
|
||||
int
|
||||
ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
|
||||
{
|
||||
@ -355,13 +414,7 @@ skip_vnet:
|
||||
* have been called (if we can).
|
||||
*/
|
||||
m->m_pkthdr.lro_nsegs = 1;
|
||||
if (m->m_flags & M_TSTMP_LRO) {
|
||||
tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
|
||||
tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
|
||||
} else {
|
||||
/* Should not be should we kassert instead? */
|
||||
tcp_get_usecs(&tv);
|
||||
}
|
||||
tcp_get_usecs(&tv);
|
||||
/* Now what about next packet? */
|
||||
if (m_save || has_pkt)
|
||||
nxt_pkt = 1;
|
||||
|
@ -199,6 +199,8 @@ struct pkthdr {
|
||||
#define lro_nsegs tso_segsz
|
||||
#define csum_phsum PH_per.sixteen[2]
|
||||
#define csum_data PH_per.thirtytwo[1]
|
||||
#define lro_len PH_per.sixteen[0] /* inbound during LRO */
|
||||
#define lro_csum PH_per.sixteen[1] /* inbound during LRO */
|
||||
#define pace_thoff PH_loc.sixteen[0]
|
||||
#define pace_tlen PH_loc.sixteen[1]
|
||||
#define pace_drphdrlen PH_loc.sixteen[2]
|
||||
@ -392,7 +394,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
|
||||
/*
|
||||
* mbuf flags of global significance and layer crossing.
|
||||
* Those of only protocol/layer specific significance are to be mapped
|
||||
* to M_PROTO[1-12] and cleared at layer handoff boundaries.
|
||||
* to M_PROTO[1-11] and cleared at layer handoff boundaries.
|
||||
* NB: Limited to the lower 24 bits.
|
||||
*/
|
||||
#define M_EXT 0x00000001 /* has associated external storage */
|
||||
@ -411,18 +413,17 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
|
||||
and 802.1AS) */
|
||||
#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
|
||||
|
||||
#define M_PROTO1 0x00001000 /* protocol-specific */
|
||||
#define M_PROTO2 0x00002000 /* protocol-specific */
|
||||
#define M_PROTO3 0x00004000 /* protocol-specific */
|
||||
#define M_PROTO4 0x00008000 /* protocol-specific */
|
||||
#define M_PROTO5 0x00010000 /* protocol-specific */
|
||||
#define M_PROTO6 0x00020000 /* protocol-specific */
|
||||
#define M_PROTO7 0x00040000 /* protocol-specific */
|
||||
#define M_PROTO8 0x00080000 /* protocol-specific */
|
||||
#define M_PROTO9 0x00100000 /* protocol-specific */
|
||||
#define M_PROTO10 0x00200000 /* protocol-specific */
|
||||
#define M_PROTO11 0x00400000 /* protocol-specific */
|
||||
#define M_PROTO12 0x00800000 /* protocol-specific */
|
||||
#define M_PROTO1 0x00002000 /* protocol-specific */
|
||||
#define M_PROTO2 0x00004000 /* protocol-specific */
|
||||
#define M_PROTO3 0x00008000 /* protocol-specific */
|
||||
#define M_PROTO4 0x00010000 /* protocol-specific */
|
||||
#define M_PROTO5 0x00020000 /* protocol-specific */
|
||||
#define M_PROTO6 0x00040000 /* protocol-specific */
|
||||
#define M_PROTO7 0x00080000 /* protocol-specific */
|
||||
#define M_PROTO8 0x00100000 /* protocol-specific */
|
||||
#define M_PROTO9 0x00200000 /* protocol-specific */
|
||||
#define M_PROTO10 0x00400000 /* protocol-specific */
|
||||
#define M_PROTO11 0x00800000 /* protocol-specific */
|
||||
|
||||
#define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */
|
||||
|
||||
@ -431,7 +432,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
|
||||
*/
|
||||
#define M_PROTOFLAGS \
|
||||
(M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
|
||||
M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12)
|
||||
M_PROTO9|M_PROTO10|M_PROTO11)
|
||||
|
||||
/*
|
||||
* Flags preserved when copying m_pkthdr.
|
||||
@ -449,7 +450,7 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
|
||||
#define M_FLAG_PROTOBITS \
|
||||
"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
|
||||
"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
|
||||
"\27M_PROTO11\30M_PROTO12"
|
||||
"\27M_PROTO11"
|
||||
#define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
|
||||
|
||||
/*
|
||||
|
Loading…
x
Reference in New Issue
Block a user