From 3ea844b1da3f287b59141a9acb13ebed50efbb42 Mon Sep 17 00:00:00 2001 From: gallatin Date: Mon, 9 Mar 2020 13:44:51 +0000 Subject: [PATCH] make lacp's use_numa hashing aware of send tags When I did the use_numa support, I missed the fact that there is a separate hash function for send tag nic selection. So when use_numa is enabled, ktls offload does not work properly, as it does not reliably allocate a send tag on the proper egress nic since different egress nics are selected for send-tag allocation and packet transmit. To fix this, this change: - refectors lacp_select_tx_port_by_hash() and lacp_select_tx_port() to make lacp_select_tx_port_by_hash() always called by lacp_select_tx_port() - pre-shifts flowids to convert them to hashes when calling lacp_select_tx_port_by_hash() - adds a numa_domain field to if_snd_tag_alloc_params - plumbs the numa domain into places where we allocate send tags In testing with NIC TLS setup on a NUMA machine, I see thousands of output errors before the change when enabling kern.ipc.tls.ifnet.permitted=1. After the change, I see no errors, and I see the NIC sysctl counters showing active TLS offload sessions. Reviewed by: rrs, hselasky, jhb Sponsored by: Netflix --- sys/kern/uipc_ktls.c | 1 + sys/net/ieee8023ad_lacp.c | 43 +++++++++++-------------------------- sys/net/ieee8023ad_lacp.h | 4 +--- sys/net/if_lagg.c | 11 ++++++---- sys/net/if_var.h | 1 + sys/netinet/in_pcb.c | 1 + sys/netinet/tcp_ratelimit.c | 1 + 7 files changed, 25 insertions(+), 37 deletions(-) diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index fac092a9b7aa..98e4dfb4f47a 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -800,6 +800,7 @@ ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, params.hdr.type = IF_SND_TAG_TYPE_TLS; params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; + params.hdr.numa_domain = inp->inp_numa_domain; params.tls.inp = inp; params.tls.tls = tls; INP_RUNLOCK(inp); diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c index 907c7f958cdd..af887f2f0dee 100644 --- a/sys/net/ieee8023ad_lacp.c +++ b/sys/net/ieee8023ad_lacp.c @@ -832,13 +832,12 @@ lacp_stop(struct lagg_softc *sc) } struct lagg_port * -lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t hash, uint8_t numa_domain) { struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; struct lacp_port *lp; struct lacp_port **map; - uint32_t hash; int count; if (__predict_false(lsc->lsc_suppress_distributing)) { @@ -854,10 +853,10 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) #ifdef NUMA if ((sc->sc_opts & LAGG_OPT_USE_NUMA) && - pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) { - count = pm->pm_numa[m->m_pkthdr.numa_domain].count; + pm->pm_num_dom > 1 && numa_domain < MAXMEMDOM) { + count = pm->pm_numa[numa_domain].count; if (count > 0) { - map = pm->pm_numa[m->m_pkthdr.numa_domain].map; + map = pm->pm_numa[numa_domain].map; } else { /* No ports on this domain; use global hash. */ map = pm->pm_map; @@ -869,11 +868,6 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) map = pm->pm_map; count = pm->pm_count; } - if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && - M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) - hash = m->m_pkthdr.flowid >> sc->flowid_shift; - else - hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); hash %= count; lp = map[hash]; @@ -884,33 +878,22 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (lp->lp_lagg); } -#if defined(RATELIMIT) || defined(KERN_TLS) struct lagg_port * -lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid) +lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) { struct lacp_softc *lsc = LACP_SOFTC(sc); - struct lacp_portmap *pm; - struct lacp_port *lp; uint32_t hash; + uint8_t numa_domain; - if (__predict_false(lsc->lsc_suppress_distributing)) { - LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); - return (NULL); - } + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && + M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + hash = m->m_pkthdr.flowid >> sc->flowid_shift; + else + hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); - pm = &lsc->lsc_pmap[lsc->lsc_activemap]; - if (pm->pm_count == 0) { - LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); - return (NULL); - } - - hash = flowid >> sc->flowid_shift; - hash %= pm->pm_count; - lp = pm->pm_map[hash]; - - return (lp->lp_lagg); + numa_domain = m->m_pkthdr.numa_domain; + return (lacp_select_tx_port_by_hash(sc, hash, numa_domain)); } -#endif /* * lacp_suppress_distributing: drop transmit packets for a while diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h index b6a0860ff1e0..2262f1d74294 100644 --- a/sys/net/ieee8023ad_lacp.h +++ b/sys/net/ieee8023ad_lacp.h @@ -293,9 +293,7 @@ struct lacp_softc { struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); -#if defined(RATELIMIT) || defined(KERN_TLS) -struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t); -#endif +struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t, uint8_t); void lacp_attach(struct lagg_softc *); void lacp_detach(void *); void lacp_init(struct lagg_softc *); diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 37b3a708f8a9..018e82c4a1f1 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -1609,12 +1609,13 @@ mst_to_lst(struct m_snd_tag *mst) * contents. */ static struct lagg_port * -lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype) +lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype, + uint8_t numa_domain) { struct lagg_softc *sc; struct lagg_port *lp; struct lagg_lb *lb; - uint32_t p; + uint32_t hash, p; sc = ifp->if_softc; @@ -1634,7 +1635,8 @@ lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype) if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || flowtype == M_HASHTYPE_NONE) return (NULL); - return (lacp_select_tx_port_by_hash(sc, flowid)); + hash = flowid >> sc->flowid_shift; + return (lacp_select_tx_port_by_hash(sc, hash, numa_domain)); default: return (NULL); } @@ -1654,7 +1656,8 @@ lagg_snd_tag_alloc(struct ifnet *ifp, sc = ifp->if_softc; LAGG_RLOCK(); - lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype); + lp = lookup_snd_tag_port(ifp, params->hdr.flowid, + params->hdr.flowtype, params->hdr.numa_domain); if (lp == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); diff --git a/sys/net/if_var.h b/sys/net/if_var.h index 864f519495e9..e37aaaaa1dca 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -198,6 +198,7 @@ struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ uint32_t flowid; /* mbuf hash value */ uint32_t flowtype; /* mbuf hash type */ + uint8_t numa_domain; /* numa domain of associated inp */ }; struct if_snd_tag_alloc_rate_limit { diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 884c36f60828..24a18f06c943 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -3258,6 +3258,7 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, + .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c index 8a3ec2740e9c..196384f1c954 100644 --- a/sys/netinet/tcp_ratelimit.c +++ b/sys/netinet/tcp_ratelimit.c @@ -1006,6 +1006,7 @@ rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = 1, + .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = COMMON_RATE, .rate_limit.flags = M_NOWAIT, };