f411704afc
linking NIC Receive Side Scaling (RSS) to the network stack's connection-group implementation. This prototype (and derived patches) are in use at Juniper and several other FreeBSD-using companies, so despite some reservations about its maturity, merge the patch to the base tree so that it can be iteratively refined in collaboration rather than maintained as a set of gradually diverging patch sets. (1) Merge a software implementation of the Toeplitz hash specified in RSS implemented by David Malone. This is used to allow suitable pcbgroup placement of connections before the first packet is received from the NIC. Software hashing is generally avoided, however, due to high cost of the hash on general-purpose CPUs. (2) In in_rss.c, maintain authoritative versions of RSS state intended to be pushed to each NIC, including keying material, hash algorithm/configuration, and buckets. Provide software-facing interfaces to hash 2- and 4-tuples for IPv4 and IPv6 using both the RSS standardised Toeplitz and a 'naive' variation with a hash efficient in software but with poor distribution properties. Implement rss_m2cpuid() to be used by netisr and other load balancing code to look up the CPU on which an mbuf should be processed. (3) In the Ethernet link layer, allow netisr distribution using RSS as a source of policy as an alternative to source ordering; continue to default to direct dispatch (i.e., don't try and requeue packets for processing on the 'right' CPU if they arrive in a directly dispatchable context). (4) Allow RSS to control tuning of connection groups in order to align groups with RSS buckets. If a packet arrives on a protocol using connection groups, and contains a suitable hardware-generated hash, use that hash value to select the connection group for pcb lookup for both IPv4 and IPv6. 
If no hardware-generated Toeplitz hash is available, we fall back on regular PCB lookup risking contention rather than pay the cost of Toeplitz in software -- this is a less scalable but, at my last measurement, faster approach. As core counts go up, we may want to revise this strategy despite CPU overhead. Where device drivers suitably configure NICs, and connection groups / RSS are enabled, this should avoid both lock and line contention during connection lookup for TCP. This commit does not modify any device drivers to tune device RSS configuration to the global RSS configuration; patches are in circulation to do this for at least Chelsio T3 and Intel 1G/10G drivers. Currently, the KPI for device drivers is not particularly robust, nor aware of more advanced features such as runtime reconfiguration/rebalancing. This will hopefully prove a useful starting point for refinement. No MFC is scheduled as we will first want to nail down a more mature and maintainable KPI/KBI for device drivers. Sponsored by: Juniper Networks (original work) Sponsored by: EMC/Isilon (patch update and merge)
134 lines
3.9 KiB
C
134 lines
3.9 KiB
C
/*-
|
|
* Copyright (c) 2010-2011 Juniper Networks, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* This software was developed by Robert N. M. Watson under contract
|
|
* to Juniper Networks, Inc.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include "opt_inet6.h"
|
|
#include "opt_rss.h"
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/mbuf.h>
|
|
|
|
#include <netinet/in.h>
|
|
#include <netinet/in_pcb.h>
|
|
#include <netinet/in_rss.h>
|
|
#ifdef INET6
|
|
#include <netinet6/in6_pcb.h>
|
|
#endif /* INET6 */
|
|
|
|
/*
|
|
* Given a hash of whatever the covered tuple might be, return a pcbgroup
|
|
* index. Where RSS is supported, try to align bucket selection with RSS CPU
|
|
* affinity strategy.
|
|
*/
|
|
static __inline u_int
|
|
in6_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
|
|
{
|
|
|
|
#ifdef RSS
|
|
return (rss_getbucket(hash));
|
|
#else
|
|
return (hash % pcbinfo->ipi_npcbgroups);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
|
|
* information is insufficient to identify the pcbgroup. This might occur if
|
|
* a TCP packet turnsup with a 2-tuple hash, or if an RSS hash is present but
|
|
* RSS is not compiled into the kernel.
|
|
*/
|
|
struct inpcbgroup *
|
|
in6_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
|
|
{
|
|
|
|
#ifdef RSS
|
|
if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
|
|
hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
|
|
(pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
|
|
hashtype == M_HASHTYPE_RSS_IPV4))
|
|
return (&pcbinfo->ipi_pcbgroups[
|
|
in6_pcbgroup_getbucket(pcbinfo, hash)]);
|
|
#endif
|
|
return (NULL);
|
|
}
|
|
|
|
struct inpcbgroup *
|
|
in6_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
|
|
{
|
|
|
|
return (in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
|
|
m->m_pkthdr.flowid));
|
|
}
|
|
|
|
struct inpcbgroup *
|
|
in6_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, const struct in6_addr *laddrp,
|
|
u_short lport, const struct in6_addr *faddrp, u_short fport)
|
|
{
|
|
uint32_t hash;
|
|
|
|
/*
|
|
* RSS note: we pass foreign addr/port as source, and local addr/port
|
|
* as destination, as we want to align with what the hardware is
|
|
* doing.
|
|
*/
|
|
switch (pcbinfo->ipi_hashfields) {
|
|
case IPI_HASHFIELDS_4TUPLE:
|
|
#ifdef RSS
|
|
hash = rss_hash_ip6_4tuple(*faddrp, fport, *laddrp, lport);
|
|
#else
|
|
hash = faddrp->s6_addr32[3] ^ fport;
|
|
#endif
|
|
break;
|
|
|
|
case IPI_HASHFIELDS_2TUPLE:
|
|
#ifdef RSS
|
|
hash = rss_hash_ip6_2tuple(*faddrp, *laddrp);
|
|
#else
|
|
hash = faddrp->s6_addr32[3] ^ laddrp->s6_addr32[3];
|
|
#endif
|
|
break;
|
|
|
|
default:
|
|
hash = 0;
|
|
}
|
|
return (&pcbinfo->ipi_pcbgroups[in6_pcbgroup_getbucket(pcbinfo,
|
|
hash)]);
|
|
}
|
|
|
|
struct inpcbgroup *
|
|
in6_pcbgroup_byinpcb(struct inpcb *inp)
|
|
{
|
|
|
|
return (in6_pcbgroup_bytuple(inp->inp_pcbinfo, &inp->in6p_laddr,
|
|
inp->inp_lport, &inp->in6p_faddr, inp->inp_fport));
|
|
}
|