From 05cde7efa660ada01c90883b32e993fd927fe574 Mon Sep 17 00:00:00 2001
From: Sepherosa Ziehau
Date: Tue, 2 Aug 2016 06:36:47 +0000
Subject: [PATCH] tcp/lro: Implement hash table for LRO entries.

This significantly improves HTTP workload performance and reduces
HTTP workload latency.

Reviewed by:	rrs, gallatin, hps
Obtained from:	rrs, gallatin
Sponsored by:	Netflix (rrs, gallatin), Microsoft (sephe)
Differential Revision:	https://reviews.freebsd.org/D6689
---
 sys/netinet/tcp_lro.c | 81 ++++++++++++++++++++++++++++++++++++++-----
 sys/netinet/tcp_lro.h |  3 ++
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 6a92badb002b..50c646d1f50e 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -68,19 +68,24 @@ static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
 #endif
 
 static void	tcp_lro_rx_done(struct lro_ctrl *lc);
+static int	tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
+		    uint32_t csum, int use_hash);
 
 static __inline void
-tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_entry *le)
+tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
+    struct lro_entry *le)
 {
 
 	LIST_INSERT_HEAD(&lc->lro_active, le, next);
+	LIST_INSERT_HEAD(bucket, le, hash_next);
 }
 
 static __inline void
 tcp_lro_active_remove(struct lro_entry *le)
 {
 
-	LIST_REMOVE(le, next);
+	LIST_REMOVE(le, next);		/* active list */
+	LIST_REMOVE(le, hash_next);	/* hash bucket */
 }
 
 int
@@ -95,7 +100,7 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
 {
 	struct lro_entry *le;
 	size_t size;
-	unsigned i;
+	unsigned i, elements;
 
 	lc->lro_bad_csum = 0;
 	lc->lro_queued = 0;
@@ -110,6 +115,18 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
 	LIST_INIT(&lc->lro_free);
 	LIST_INIT(&lc->lro_active);
 
+	/* create hash table to accelerate entry lookup */
+	if (lro_entries > lro_mbufs)
+		elements = lro_entries;
+	else
+		elements = lro_mbufs;
+	lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
+	    HASH_NOWAIT);
+	if (lc->lro_hash == NULL) {
+		memset(lc, 0, sizeof(*lc));
+		return (ENOMEM);
+	}
+
 	/* compute size to allocate */
 	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
 	    (lro_entries * sizeof(*le));
@@ -147,6 +164,13 @@ tcp_lro_free(struct lro_ctrl *lc)
 		m_freem(le->m_head);
 	}
 
+	/* free hash table */
+	if (lc->lro_hash != NULL) {
+		free(lc->lro_hash, M_LRO);
+		lc->lro_hash = NULL;
+	}
+	lc->lro_hashsz = 0;
+
 	/* free mbuf array, if any */
 	for (x = 0; x != lc->lro_mbuf_count; x++)
 		m_freem(lc->lro_mbuf_data[x].mb);
@@ -487,7 +511,7 @@ tcp_lro_flush_all(struct lro_ctrl *lc)
 		}
 
 		/* add packet to LRO engine */
-		if (tcp_lro_rx(lc, mb, 0) != 0) {
+		if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
 			/* input packet to network layer */
 			(*lc->ifp->if_input)(lc->ifp, mb);
 			lc->lro_queued++;
@@ -561,8 +585,8 @@ tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
 }
 #endif
 
-int
-tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+static int
+tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
 {
 	struct lro_entry *le;
 	struct ether_header *eh;
@@ -578,6 +602,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 	tcp_seq seq;
 	int error, ip_len, l;
 	uint16_t eh_type, tcp_data_len;
+	struct lro_head *bucket;
 
 	/* We expect a contiguous header [eh, ip, tcp].
 	 */
@@ -670,8 +695,41 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 	seq = ntohl(th->th_seq);
 
+	if (!use_hash) {
+		bucket = &lc->lro_hash[0];
+	} else if (M_HASHTYPE_ISHASH(m)) {
+		bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
+	} else {
+		uint32_t hash;
+
+		switch (eh_type) {
+#ifdef INET
+		case ETHERTYPE_IP:
+			hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
+			break;
+#endif
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+			hash = ip6->ip6_src.s6_addr32[0] +
+			    ip6->ip6_dst.s6_addr32[0];
+			hash += ip6->ip6_src.s6_addr32[1] +
+			    ip6->ip6_dst.s6_addr32[1];
+			hash += ip6->ip6_src.s6_addr32[2] +
+			    ip6->ip6_dst.s6_addr32[2];
+			hash += ip6->ip6_src.s6_addr32[3] +
+			    ip6->ip6_dst.s6_addr32[3];
+			break;
+#endif
+		default:
+			hash = 0;
+			break;
+		}
+		hash += th->th_sport + th->th_dport;
+		bucket = &lc->lro_hash[hash % lc->lro_hashsz];
+	}
+
 	/* Try to find a matching previous segment. */
-	LIST_FOREACH(le, &lc->lro_active, next) {
+	LIST_FOREACH(le, bucket, hash_next) {
 		if (le->eh_type != eh_type)
 			continue;
 		if (le->source_port != th->th_sport ||
 		    le->dest_port != th->th_dport)
@@ -779,7 +837,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 	/* Start a new segment chain. */
 	le = LIST_FIRST(&lc->lro_free);
 	LIST_REMOVE(le, next);
-	tcp_lro_active_insert(lc, le);
+	tcp_lro_active_insert(lc, bucket, le);
 	getmicrotime(&le->mtime);
 
 	/* Start filling in details. */
@@ -837,6 +895,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 	return (0);
 }
 
+int
+tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+{
+
+	return tcp_lro_rx2(lc, m, csum, 1);
+}
+
 void
 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
 {
diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h
index 63aa62edd8ba..e019cd1e0edd 100644
--- a/sys/netinet/tcp_lro.h
+++ b/sys/netinet/tcp_lro.h
@@ -40,6 +40,7 @@
 
 struct lro_entry {
 	LIST_ENTRY(lro_entry)	next;
+	LIST_ENTRY(lro_entry)	hash_next;
 	struct mbuf		*m_head;
 	struct mbuf		*m_tail;
 	union {
@@ -95,6 +96,8 @@ struct lro_ctrl {
 	unsigned short	lro_ackcnt_lim;		/* max # of aggregated ACKs */
 	unsigned	lro_length_lim;		/* max len of aggregated data */
 
+	u_long		lro_hashsz;
+	struct lro_head	*lro_hash;
 	struct lro_head	lro_active;
 	struct lro_head	lro_free;
 };
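A note on the lookup change above: instead of walking the single lro_active list
for every incoming segment, tcp_lro_rx2() now picks one bucket, either directly
from the NIC-supplied flow ID (M_HASHTYPE_ISHASH) or from a simple additive hash
over the connection tuple, and walks only that bucket. The stand-alone sketch
below mimics the IPv4 fallback path; struct flow4, lro_bucket4(), and the fixed
table size are illustrative stand-ins, not kernel code (the real bucket count is
the prime returned by phashinit_flags()).

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the fields tcp_lro_rx2() reads; not kernel code. */
struct flow4 {
	uint32_t src_ip;	/* ip4->ip_src.s_addr (network byte order) */
	uint32_t dst_ip;	/* ip4->ip_dst.s_addr (network byte order) */
	uint16_t src_port;	/* th->th_sport (network byte order) */
	uint16_t dst_port;	/* th->th_dport (network byte order) */
};

/*
 * Mirror of the patch's IPv4 fallback: a plain additive hash over the
 * 4-tuple, reduced modulo the hash table size.  Addition commutes, so
 * swapping source and destination yields the same bucket.
 */
static uint32_t
lro_bucket4(const struct flow4 *f, unsigned long hashsz)
{
	uint32_t hash;

	hash = f->src_ip + f->dst_ip;
	hash += f->src_port + f->dst_port;
	return (hash % hashsz);
}

int
main(void)
{
	/* 10.0.0.1:54321 and 10.0.0.2:80 as raw network-order words
	   on a little-endian host. */
	struct flow4 a = { 0x0100000a, 0x0200000a, 0x31d4, 0x5000 };
	struct flow4 b = { 0x0200000a, 0x0100000a, 0x5000, 0x31d4 };
	unsigned long hashsz = 131;	/* example size only */

	printf("a -> bucket %u\n", lro_bucket4(&a, hashsz));
	printf("b -> bucket %u\n", lro_bucket4(&b, hashsz));	/* same bucket */
	return (0);
}

The weak additive hash is presumably acceptable because the table is sized to a
prime and, on the sorted tcp_lro_flush_all() path, use_hash is zero anyway: the
mbufs were already grouped by flow ID, so everything deliberately collapses into
bucket zero there.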
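The tcp_lro.h change is the other half of the scheme: each lro_entry now carries
two LIST_ENTRY linkages, so the same entry sits on the flat lro_active list
(still used by the flush paths) and on exactly one hash bucket (used by lookup).
A minimal demonstration of that dual membership with the same <sys/queue.h>
macros, using a hypothetical struct conn in place of lro_entry:

#include <stdio.h>
#include <sys/queue.h>

/* Hypothetical entry; the real structure is lro_entry in tcp_lro.h. */
struct conn {
	int			id;
	LIST_ENTRY(conn)	next;		/* linkage on the global active list */
	LIST_ENTRY(conn)	hash_next;	/* linkage on one hash bucket */
};

LIST_HEAD(conn_head, conn);

int
main(void)
{
	struct conn_head active, bucket;
	struct conn a = { .id = 1 };
	struct conn *c;

	LIST_INIT(&active);
	LIST_INIT(&bucket);

	/* Mirrors tcp_lro_active_insert(): link onto both lists at once. */
	LIST_INSERT_HEAD(&active, &a, next);
	LIST_INSERT_HEAD(&bucket, &a, hash_next);

	/* Lookup walks a short bucket instead of the whole active list. */
	LIST_FOREACH(c, &bucket, hash_next)
		printf("found entry %d in bucket\n", c->id);

	/* Mirrors tcp_lro_active_remove(): unlink from both lists. */
	LIST_REMOVE(&a, next);
	LIST_REMOVE(&a, hash_next);
	return (0);
}

Keeping both linkages means the flush code never needs to recompute which bucket
an entry hashed to: LIST_REMOVE unlinks from each list in O(1).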