freebsd-skq/sys/net/flowtable.c

1195 lines
28 KiB
C
Raw Normal View History

/*-
2014-02-17 12:07:17 +00:00
* Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
* Copyright (c) 2008-2010, BitGravity Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of the BitGravity Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
2014-02-07 10:05:12 +00:00
#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/hash.h>
2014-02-07 10:05:12 +00:00
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
2014-02-07 10:05:12 +00:00
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#ifdef FLOWTABLE_HASH_ALL
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
#endif
#include <ddb/ddb.h>
/*
 * Size of the lookup key kept in every flow entry.  With
 * FLOWTABLE_HASH_ALL the key holds two addresses plus two 16-bit ports,
 * otherwise a single address.  Addresses are sized for IPv6 whenever INET6
 * is configured, so that one entry layout serves both tables.
 */
#ifdef FLOWTABLE_HASH_ALL
#define KEY_PORTS (sizeof(uint16_t) * 2)
#define KEY_ADDRS 2
#else
#define KEY_PORTS 0
#define KEY_ADDRS 1
#endif
#ifdef INET6
#define KEY_ADDR_LEN sizeof(struct in6_addr)
#else
#define KEY_ADDR_LEN sizeof(struct in_addr)
#endif
/* Key length in 32-bit words; only used to size f_key[] and then undefined. */
#define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
/*
 * A single cached flow.  Entries are chained on a per-CPU hash bucket via
 * f_next and carry the rtentry and llentry chosen for the flow.
 */
struct flentry {
uint32_t f_hash; /* hash flowing forward */
uint32_t f_key[KEYLEN]; /* address(es and ports) */
uint32_t f_uptime; /* uptime at last access */
uint16_t f_fibnum; /* fib index */
#ifdef FLOWTABLE_HASH_ALL
uint8_t f_proto; /* protocol */
uint8_t f_flags; /* stale? */
#define FL_STALE 1
#endif
SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */
struct rtentry *f_rt; /* rtentry for flow */
struct llentry *f_lle; /* llentry for flow */
};
#undef KEYLEN
/* Head of one hash bucket's collision chain. */
SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));
/*
 * One flow table instance (there is one per address family and vnet).
 */
struct flowtable {
counter_u64_t *ft_stat; /* statistics counters, see FLOWSTAT_ADD() */
int ft_size; /* number of hash buckets */
/*
* ft_table is a malloc(9)ed array of pointers. Pointers point to
* memory from UMA_ZONE_PCPU zone.
* ft_masks is per-cpu pointer itself. Each instance points
* to a malloc(9)ed bitset, that is private to corresponding CPU.
*/
struct flist **ft_table;
bitstr_t **ft_masks;
/* Scratch copy of a per-CPU mask, used by flowtable_free_stale(). */
bitstr_t *ft_tmpmask;
};
/*
 * Bump the counter that corresponds to field 'name' of
 * struct flowtable_stat: the field offset divided by sizeof(uint64_t)
 * is used as the index into the ft_stat counter array.
 */
#define FLOWSTAT_ADD(ft, name, v) \
counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1)
/* Process handle of the "flowcleaner" kernel process. */
static struct proc *flowcleanerproc;
/* Random seed mixed into the flow hash; set once in flowtable_init(). */
static uint32_t flow_hashjitter;
/* Broadcast by the cleaner after each pass; flowtable_flush() waits on it. */
static struct cv flowclean_f_cv;
/* The cleaner sleeps on this between passes; flowtable_flush() kicks it. */
static struct cv flowclean_c_cv;
/* Protects flowclean_cycles and is the interlock for both cvs above. */
static struct mtx flowclean_lock;
/* Number of completed cleaner passes. */
static uint32_t flowclean_cycles;
/*
 * TODO:
 * - add sysctls to resize && flush flow tables
 * - add per-flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing;
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - idetach() cleanup for options VIMAGE builds.
 */
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define V_ip4_ft VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define V_ip6_ft VNET(ip6_ft)
#endif
static uma_zone_t flow_zone;
static VNET_DEFINE(int, flowtable_enable) = 1;
#define V_flowtable_enable VNET(flowtable_enable)
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator (DPCPU), as suggested by Peter Wemm, and implement a new per-virtual network stack memory allocator. Modify vnet to use the allocator instead of monolithic global container structures (vinet, ...). This change solves many binary compatibility problems associated with VIMAGE, and restores ELF symbols for virtualized global variables. Each virtualized global variable exists as a "reference copy", and also once per virtual network stack. Virtualized global variables are tagged at compile-time, placing the in a special linker set, which is loaded into a contiguous region of kernel memory. Virtualized global variables in the base kernel are linked as normal, but those in modules are copied and relocated to a reserved portion of the kernel's vnet region with the help of a the kernel linker. Virtualized global variables exist in per-vnet memory set up when the network stack instance is created, and are initialized statically from the reference copy. Run-time access occurs via an accessor macro, which converts from the current vnet and requested symbol to a per-vnet address. When "options VIMAGE" is not compiled into the kernel, normal global ELF symbols will be used instead and indirection is avoided. This change restores static initialization for network stack global variables, restores support for non-global symbols and types, eliminates the need for many subsystem constructors, eliminates large per-subsystem structures that caused many binary compatibility issues both for monitoring applications (netstat) and kernel modules, removes the per-function INIT_VNET_*() macros throughout the stack, eliminates the need for vnet_symmap ksym(2) munging, and eliminates duplicate definitions of virtualized globals under VIMAGE_GLOBALS. Bump __FreeBSD_version and update UPDATING. 
Portions submitted by: bz Reviewed by: bz, zec Discussed with: gnn, jamie, jeff, jhb, julian, sam Suggested by: peter Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
"flowtable");
SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator (DPCPU), as suggested by Peter Wemm, and implement a new per-virtual network stack memory allocator. Modify vnet to use the allocator instead of monolithic global container structures (vinet, ...). This change solves many binary compatibility problems associated with VIMAGE, and restores ELF symbols for virtualized global variables. Each virtualized global variable exists as a "reference copy", and also once per virtual network stack. Virtualized global variables are tagged at compile-time, placing the in a special linker set, which is loaded into a contiguous region of kernel memory. Virtualized global variables in the base kernel are linked as normal, but those in modules are copied and relocated to a reserved portion of the kernel's vnet region with the help of a the kernel linker. Virtualized global variables exist in per-vnet memory set up when the network stack instance is created, and are initialized statically from the reference copy. Run-time access occurs via an accessor macro, which converts from the current vnet and requested symbol to a per-vnet address. When "options VIMAGE" is not compiled into the kernel, normal global ELF symbols will be used instead and indirection is avoided. This change restores static initialization for network stack global variables, restores support for non-global symbols and types, eliminates the need for many subsystem constructors, eliminates large per-subsystem structures that caused many binary compatibility issues both for monitoring applications (netstat) and kernel modules, removes the per-function INIT_VNET_*() macros throughout the stack, eliminates the need for vnet_symmap ksym(2) munging, and eliminates duplicate definitions of virtualized globals under VIMAGE_GLOBALS. Bump __FreeBSD_version and update UPDATING. 
Portions submitted by: bz Reviewed by: bz, zec Discussed with: gnn, jamie, jeff, jhb, julian, sam Suggested by: peter Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
&VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
&flow_zone, "Maximum number of flows allowed");
static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
#ifdef INET
/*
 * Find (or create) the flow entry for the IPv4 packet in 'm'.
 *
 * Packets whose source equals their destination, or that involve the
 * loopback network, are never cached.  Without FLOWTABLE_HASH_ALL the key
 * is the destination address only; with it the key also covers the source
 * address and the transport ports, while the protocol and the FL_STALE
 * flag (TCP RST/FIN) are folded into the upper bits of the fib number
 * handed to flowtable_lookup_common().
 *
 * On success ro->ro_dst is filled in with the destination address and the
 * entry is returned; NULL is returned when no entry is available.
 */
static struct flentry *
flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in *sin;
	struct ip *ip;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[3];
	int iphlen;
	uint16_t sport, dport;
	uint8_t proto;
#endif

	ip = mtod(m, struct ip *);

	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	iphlen = ip->ip_hl << 2;
	proto = ip->ip_p;

	/*
	 * The transport header is only read when it lies within the first
	 * mbuf, mirroring what PULLUP_TO() does on the IPv6 path.
	 */
	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr *th;

		if (m->m_len < iphlen + (int)sizeof(*th))
			return (NULL);
		th = (struct tcphdr *)((char *)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		if (th->th_flags & (TH_RST|TH_FIN))
			fibnum |= (FL_STALE << 24);
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr *uh;

		if (m->m_len < iphlen + (int)sizeof(*uh))
			return (NULL);
		uh = (struct udphdr *)((char *)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		struct sctphdr *sh;

		if (m->m_len < iphlen + (int)sizeof(*sh))
			return (NULL);
		sh = (struct sctphdr *)((char *)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		/* XXXGL: handle stale? */
		break;
	}
	default:
		sport = dport = 0;
		break;
	}

	key[0] = ip->ip_dst.s_addr;
	key[1] = ip->ip_src.s_addr;
	/* Widen before shifting: a uint16_t would be promoted to signed int. */
	key[2] = ((uint32_t)dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
	    fibnum);
#else	/* !FLOWTABLE_HASH_ALL */
	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
	    sizeof(struct in_addr), fibnum);
#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;

	return (fle);
}
#endif /* INET */
#ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that the first mbuf of 'm' holds at least
 * len + sizeof(T) bytes and then points p at offset "len" in it.  No data
 * is actually pulled up: if the mbuf is too short the *enclosing function*
 * returns NULL.
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)

#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))

/*
 * Find (or create) the flow entry for the IPv6 packet in 'm'.
 *
 * Packets for which in6_localaddr() is true on the destination are never
 * cached.  Without FLOWTABLE_HASH_ALL the key is the destination address
 * only.  With it, extension headers are walked to reach the upper layer
 * header and the key also covers the source address and transport ports;
 * protocol and the FL_STALE flag (TCP RST/FIN) are folded into the upper
 * bits of the fib number handed to flowtable_lookup_common().
 *
 * On success ro->ro_dst is filled in with the destination address and the
 * entry is returned; NULL is returned when no entry is available or a
 * header does not fit in the first mbuf.
 */
static struct flentry *
flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in6 *sin6;
	struct ip6_hdr *ip6;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[9];
	void *ulp;
	int hlen;
	uint16_t sport, dport;
	uint8_t proto;
#else
	uint32_t key[4];
#endif

	ip6 = mtod(m, struct ip6_hdr *);
	if (in6_localaddr(&ip6->ip6_dst))
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;
	sport = dport = 0;
	ulp = NULL;
	/* Each extension header resets ulp to NULL to keep walking. */
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dport = TCP(ulp)->th_dport;
			sport = TCP(ulp)->th_sport;
			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
				fibnum |= (FL_STALE << 24);
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			dport = SCTP(ulp)->dest_port;
			sport = SCTP(ulp)->src_port;
			/* XXXGL: handle stale? */
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dport = UDP(ulp)->uh_dport;
			sport = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			/* Unknown protocol: stop here, ports stay zero. */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
	/* Widen before shifting: a uint16_t would be promoted to signed int. */
	key[8] = ((uint32_t)dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
	    fibnum);
#else	/* !FLOWTABLE_HASH_ALL */
	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
	    fibnum);
#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));

	return (fle);
}
#endif /* INET6 */
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
2014-02-17 12:02:44 +00:00
/*
* flowtable_free_stale() calls w/o critical section, but
* with sched_bind(). Since pointer is stable throughout
* ft lifetime, it is safe, otherwise...
*
* CRITICAL_ASSERT(curthread);
*/
return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}
/*
 * Return the current CPU's collision chain for 'hash'.  Must be called
 * from within a critical section.
 */
static struct flist *
flowtable_list(struct flowtable *ft, uint32_t hash)
{
	uint32_t slot;

	CRITICAL_ASSERT(curthread);

	slot = hash % ft->ft_size;
	return (zpcpu_get(ft->ft_table[slot]));
}
static int
flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
{
if (((fle->f_rt->rt_flags & RTF_UP) == 0) ||
(fle->f_rt->rt_ifp == NULL) ||
!RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
(fle->f_lle->la_flags & LLE_VALID) == 0)
return (1);
if (time_uptime - fle->f_uptime > maxidle)
return (1);
#ifdef FLOWTABLE_HASH_ALL
if (fle->f_flags & FL_STALE)
return (1);
#endif
return (0);
}
/*
 * Report whether the flow zone is nearly exhausted: returns non-zero once
 * more than 7/8 of the zone's maximum number of items are in use.
 */
static int
flow_full(void)
{
	int count, max;

	count = uma_zone_get_cur(flow_zone);
	max = uma_zone_get_max(flow_zone);

	return (count > (max - (max >> 3)));
}
/*
 * Check whether 'fle' is a usable entry for the given key and fib.
 * With FLOWTABLE_HASH_ALL the protocol is carried in bits 16-23 of
 * 'fibnum' and must match as well.  An entry only matches while its route
 * is up with an interface attached and its link-layer entry is valid.
 * Returns 1 on match, 0 otherwise.  Must be called in a critical section.
 */
static int
flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
{
#ifdef FLOWTABLE_HASH_ALL
	uint8_t proto;

	proto = (fibnum >> 16) & 0xff;
	fibnum &= 0xffff;
#endif

	CRITICAL_ASSERT(curthread);

	/* Microoptimization for IPv4: a one-word key avoids bcmp(). */
	if (!(keylen == sizeof(uint32_t) && fle->f_key[0] == key[0]) &&
	    bcmp(fle->f_key, key, keylen) != 0)
		return (0);

	if (fibnum != fle->f_fibnum)
		return (0);
#ifdef FLOWTABLE_HASH_ALL
	if (proto != fle->f_proto)
		return (0);
#endif

	if ((fle->f_rt->rt_flags & RTF_UP) == 0 || fle->f_rt->rt_ifp == NULL)
		return (0);
	if ((fle->f_lle->la_flags & LLE_VALID) == 0)
		return (0);

	return (1);
}
/*
 * Create a flow entry for 'key' and link it into the current CPU's bucket
 * for 'hash'.
 *
 * 'fibnum0' is the fib number as passed to flowtable_lookup_common(); with
 * FLOWTABLE_HASH_ALL it additionally carries the protocol in bits 16-23
 * and the entry flags in bits 24 and up.
 *
 * A route and a link-layer entry are looked up for the destination held in
 * the first address of the key; the new entry keeps a reference on both.
 * NULL is returned when there is no route, the interface is point-to-point
 * or loopback, the link-layer entry is missing or not yet resolved, or no
 * memory is available.  If a matching entry is already present in the
 * bucket, the freshly built one is discarded (together with its
 * references) and the existing entry is returned instead.
 */
static struct flentry *
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    int keylen, uint32_t fibnum0)
{
#ifdef INET6
	struct route_in6 sro6;
#endif
#ifdef INET
	struct route sro;
#endif
	struct route *ro = NULL;
	struct rtentry *rt;
	struct lltable *lt = NULL;
	struct llentry *lle;
	struct sockaddr_storage *l3addr;
	struct ifnet *ifp;
	struct flist *flist;
	struct flentry *fle, *iter;
	bitstr_t *mask;
	uint16_t fibnum = fibnum0;
#ifdef FLOWTABLE_HASH_ALL
	uint8_t proto;

	proto = (fibnum0 >> 16) & 0xff;
	fibnum = fibnum0 & 0xffff;
#endif

	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */
#ifdef INET
	if (ft == &V_ip4_ft) {
		struct sockaddr_in *sin;

		ro = &sro;
		bzero(&sro.ro_dst, sizeof(sro.ro_dst));

		sin = (struct sockaddr_in *)&sro.ro_dst;
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr.s_addr = key[0];
	}
#endif
#ifdef INET6
	if (ft == &V_ip6_ft) {
		struct sockaddr_in6 *sin6;

		ro = (struct route *)&sro6;
		sin6 = &sro6.ro_dst;

		bzero(sin6, sizeof(*sin6));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
	}
#endif

	ro->ro_rt = NULL;
#ifdef RADIX_MPATH
	rtalloc_mpath_fib(ro, hash, fibnum);
#else
	rtalloc_ign_fib(ro, 0, fibnum);
#endif
	if (ro->ro_rt == NULL)
		return (NULL);

	rt = ro->ro_rt;
	ifp = rt->rt_ifp;

	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
		RTFREE(rt);
		return (NULL);
	}

#ifdef INET
	if (ft == &V_ip4_ft)
		lt = LLTABLE(ifp);
#endif
#ifdef INET6
	if (ft == &V_ip6_ft)
		lt = LLTABLE6(ifp);
#endif

	/* Resolve the next hop: the gateway if there is one, else the dst. */
	if (rt->rt_flags & RTF_GATEWAY)
		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
	else
		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
	lle = llentry_alloc(ifp, lt, l3addr);

	if (lle == NULL) {
		RTFREE(rt);
		return (NULL);
	}

	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
	if ((lle->la_flags & LLE_VALID) == 0) {
		RTFREE(rt);
		LLE_FREE(lle);
		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
		return (NULL);
	}

	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
	if (fle == NULL) {
		RTFREE(rt);
		LLE_FREE(lle);
		return (NULL);
	}

	fle->f_hash = hash;
	bcopy(key, &fle->f_key, keylen);
	fle->f_rt = rt;
	fle->f_lle = lle;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
	fle->f_proto = proto;
	fle->f_flags = fibnum0 >> 24;
#endif

	critical_enter();
	mask = flowtable_mask(ft);
	flist = flowtable_list(ft, hash);

	if (SLIST_EMPTY(flist)) {
		bit_set(mask, (hash % ft->ft_size));
		SLIST_INSERT_HEAD(flist, fle, f_next);
		goto skip;
	}

	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	SLIST_FOREACH(iter, flist, f_next) {
		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		/*
		 * Match against the undecoded fibnum0, exactly as
		 * flowtable_lookup_common() does: flow_matches() extracts
		 * the protocol from its upper bits.
		 */
		if (flow_matches(iter, key, keylen, fibnum0)) {
			/*
			 * We probably migrated to an other CPU after
			 * lookup in flowtable_lookup_common() failed.
			 * It appeared that this CPU already has flow
			 * entry.
			 */
			iter->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			iter->f_flags |= fibnum0 >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_collisions);
			/*
			 * The discarded entry owns a reference on the
			 * route and on the link-layer entry; release
			 * them, the surviving entry holds its own.
			 */
			RTFREE(rt);
			LLE_FREE(lle);
			uma_zfree(flow_zone, fle);
			return (iter);
		}
	}

	SLIST_INSERT_HEAD(flist, fle, f_next);
skip:
	critical_exit();
	FLOWSTAT_INC(ft, ft_inserts);

	return (fle);
}
/*
 * Public entry point: look up the cached flow for packet 'm' of address
 * family 'sa' and fill in 'ro' from it.
 *
 * Returns ENXIO when the flowtable is disabled for this vnet,
 * EHOSTUNREACH when no flow entry could be found or created, and 0 on
 * success.  On success ro->ro_rt points at the cached route without an
 * extra reference (RT_NORTREF is set), and, if the cached link-layer entry
 * is valid, ro->ro_prepend / ro->ro_plen describe its precomputed link
 * header.  If the mbuf has no hash type yet, the flow hash is stored as an
 * opaque flowid.  Panics on an unsupported address family.
 */
int
flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct llentry *lle;

	if (V_flowtable_enable == 0)
		return (ENXIO);

	switch (sa) {
#ifdef INET
	case AF_INET:
		fle = flowtable_lookup_ipv4(m, ro);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		fle = flowtable_lookup_ipv6(m, ro);
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	if (fle == NULL)
		return (EHOSTUNREACH);

	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
		m->m_pkthdr.flowid = fle->f_hash;
	}

	ro->ro_rt = fle->f_rt;
	ro->ro_flags |= RT_NORTREF;
	lle = fle->f_lle;
	if (lle != NULL && (lle->la_flags & LLE_VALID)) {
		ro->ro_prepend = lle->r_linkdata;
		ro->ro_plen = lle->r_hdrlen;
		ro->ro_flags |= RT_MAY_LOOP;
		if (lle->la_flags & LLE_IFADDR)
			ro->ro_flags |= RT_L2_ME;
	}

	return (0);
}
/*
 * Address-family independent lookup.  Hashes the key (seeded with
 * flow_hashjitter) and scans the current CPU's bucket; on a hit the
 * entry's access time (and, with FLOWTABLE_HASH_ALL, its flags) are
 * refreshed and the entry is returned.  On a miss the job is handed to
 * flowtable_insert(), whose result (possibly NULL) is returned.
 *
 * 'keylen' is in bytes; 'fibnum' may carry protocol and flag bits above
 * bit 15 when FLOWTABLE_HASH_ALL is defined.
 */
static struct flentry *
flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
    uint32_t fibnum)
{
	struct flist *flist;
	struct flentry *fle;
	uint32_t hash;

	FLOWSTAT_INC(ft, ft_lookups);

	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);

	critical_enter();
	flist = flowtable_list(ft, hash);
	SLIST_FOREACH(fle, flist, f_next) {
		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		if (flow_matches(fle, key, keylen, fibnum)) {
			fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			fle->f_flags |= fibnum >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_hits);
			return (fle);
		}
	}
	critical_exit();

	FLOWSTAT_INC(ft, ft_misses);

	return (flowtable_insert(ft, hash, key, keylen, fibnum));
}
/*
* used by the bit_alloc macro
*/
2014-02-17 12:02:44 +00:00
#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
static void
flowtable_alloc(struct flowtable *ft)
{
ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
M_FTABLE, M_WAITOK);
for (int i = 0; i < ft->ft_size; i++)
ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
for (int i = 0; i < mp_ncpus; i++) {
bitstr_t **b;
b = zpcpu_get_cpu(ft->ft_masks, i);
*b = bit_alloc(ft->ft_size);
}
ft->ft_tmpmask = bit_alloc(ft->ft_size);
}
#undef calloc
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
struct flist *flist, freelist;
struct flentry *fle, *fle1, *fleprev;
bitstr_t *mask, *tmpmask;
int curbit, tmpsize;
SLIST_INIT(&freelist);
mask = flowtable_mask(ft);
tmpmask = ft->ft_tmpmask;
tmpsize = ft->ft_size;
memcpy(tmpmask, mask, ft->ft_size/8);
curbit = 0;
2014-03-05 02:35:15 +00:00
fleprev = NULL; /* pacify gcc */
/*
* XXX Note to self, bit_ffs operates at the byte level
* and thus adds gratuitous overhead
*/
bit_ffs(tmpmask, ft->ft_size, &curbit);
while (curbit != -1) {
if (curbit >= ft->ft_size || curbit < -1) {
log(LOG_ALERT,
"warning: bad curbit value %d \n",
curbit);
break;
}
FLOWSTAT_INC(ft, ft_free_checks);
critical_enter();
flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
if (SLIST_EMPTY(flist) && curbit > 0) {
log(LOG_ALERT,
"warning bit=%d set, but no fle found\n",
curbit);
}
2014-02-08 22:10:53 +00:00
#endif
SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
if (rt != NULL && fle->f_rt != rt) {
fleprev = fle;
continue;
}
if (!flow_stale(ft, fle, maxidle)) {
fleprev = fle;
continue;
}
if (fle == SLIST_FIRST(flist))
SLIST_REMOVE_HEAD(flist, f_next);
else
SLIST_REMOVE_AFTER(fleprev, f_next);
SLIST_INSERT_HEAD(&freelist, fle, f_next);
}
if (SLIST_EMPTY(flist))
bit_clear(mask, curbit);
critical_exit();
bit_clear(tmpmask, curbit);
bit_ffs(tmpmask, tmpsize, &curbit);
}
SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
FLOWSTAT_INC(ft, ft_frees);
if (fle->f_rt != NULL)
RTFREE(fle->f_rt);
if (fle->f_lle != NULL)
LLE_FREE(fle->f_lle);
uma_zfree(flow_zone, fle);
}
}
static void
flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
int i;
CPU_FOREACH(i) {
if (smp_started == 1) {
thread_lock(curthread);
sched_bind(curthread, i);
thread_unlock(curthread);
}
flowtable_free_stale(ft, rt, maxidle);
if (smp_started == 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
}
}
/*
 * Drop the flows of address family 'sa' in the current vnet that
 * flow_stale() reports with a maximum idle time of zero, restricted to
 * the flows using 'rt' when it is not NULL.  Panics on an unsupported
 * address family.
 */
void
flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{

	switch (sa) {
#ifdef INET
	case AF_INET:
		flowtable_clean_vnet(&V_ip4_ft, rt, 0);
		return;
#endif
#ifdef INET6
	case AF_INET6:
		flowtable_clean_vnet(&V_ip6_ft, rt, 0);
		return;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}
}
/*
 * Body of the "flowcleaner" kernel process; never returns.
 *
 * Each pass walks all vnets and reclaims stale flows from their tables,
 * then bumps flowclean_cycles, wakes up anybody waiting for a pass to
 * complete and sleeps until the next period elapses or it is kicked via
 * flowclean_c_cv.  When the flow zone is nearly full both the idle limit
 * and the sleep period are shortened.
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct thread *td;

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	td = curthread;
	for (;;) {
		uint32_t flowclean_freq, maxidle;

		/*
		 * The maximum idle time, as well as frequency are arbitrary.
		 */
		maxidle = flow_full() ? 5 : 30;

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
#ifdef INET
			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
#endif
#ifdef INET6
			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
#endif
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		/* Re-evaluated after the pass: it may have freed entries. */
		flowclean_freq = flow_full() ? 4*hz : 20*hz;

		mtx_lock(&flowclean_lock);
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		flowclean_cycles++;
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}
/*
 * Event handler (registered for ifnet departure): wake the cleaner up and
 * block until it has completed at least one more pass, as observed
 * through a change of flowclean_cycles.
 */
static void
flowtable_flush(void *unused __unused)
{
	uint64_t seen;

	mtx_lock(&flowclean_lock);
	seen = flowclean_cycles;
	/* The counter cannot have moved yet, the lock is held. */
	do {
		cv_broadcast(&flowclean_c_cv);
		cv_wait(&flowclean_f_cv, &flowclean_lock);
	} while (seen == flowclean_cycles);
	mtx_unlock(&flowclean_lock);
}
/*
 * Descriptor for the "flowcleaner" kernel process: it runs
 * flowtable_cleaner() and its process pointer is stored in
 * flowcleanerproc.  The process is started by kproc_start() via SYSINIT.
 */
static struct kproc_desc flow_kp = {
"flowcleaner",
flowtable_cleaner,
&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
2009-06-09 21:55:28 +00:00
/*
 * Determine the number of hash buckets for a flow table.
 *
 * If the loader tunable 'name' is set, its value is used after being
 * raised to at least 256; a value that is not a power of two is rejected
 * with a console message and replaced by 2048.  Without the tunable the
 * size is derived from maxusers and rounded up to a power of two.
 */
static int
flowtable_get_size(char *name)
{
	int size;

	if (!TUNABLE_INT_FETCH(name, &size)) {
		/*
		 * round up to the next power of 2
		 */
		return (1 << fls((1024 + maxusers * 64) - 1));
	}

	if (size < 256)
		size = 256;
	if (!powerof2(size)) {
		printf("%s must be power of 2\n", name);
		size = 2048;
	}

	return (size);
}
/*
 * One-time global initialization: pick the hash seed, create and size
 * the flow entry zone, set up the cleaner's synchronization objects and
 * arrange for flowtable_flush() to run whenever an interface departs.
 */
static void
flowtable_init(const void *unused __unused)
{
	int maxflows;

	flow_hashjitter = arc4random();

	maxflows = 1024 + maxusers * 64 * mp_ncpus;
	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(flow_zone, maxflows);

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);

	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
    flowtable_init, NULL);
#ifdef INET
/* Read-only sysctl node "ip4" below _net_flowtable. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
"Flowtable for IPv4");
/*
 * Statistics object ip4_ftstat of type struct flowtable_stat, together
 * with its init/uninit hooks.  Presumably per-vnet, per-CPU counters, as
 * the macro names suggest -- confirm against the VNET_PCPUSTAT macros.
 */
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
/* Export ip4_ftstat as the "stat" entry under the ip4 node. */
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
ip4_ftstat, "Flowtable statistics for IPv4 "
"(struct flowtable_stat, net/flowtable.h)");
/*
 * Per-vnet initialization of the IPv4 flowtable: pick the table size
 * from the "net.flowtable.ip4.size" tunable (or the default), attach the
 * IPv4 statistics and allocate the table.
 */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{
	struct flowtable *ft = &V_ip4_ft;

	ft->ft_size = flowtable_get_size("net.flowtable.ip4.size");
	ft->ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */
#ifdef INET6
/* Read-only sysctl node "ip6" below _net_flowtable. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
"Flowtable for IPv6");
/*
 * Statistics object ip6_ftstat of type struct flowtable_stat, together
 * with its init/uninit hooks.  Presumably per-vnet, per-CPU counters, as
 * the macro names suggest -- confirm against the VNET_PCPUSTAT macros.
 */
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
/* Export ip6_ftstat as the "stat" entry under the ip6 node. */
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
ip6_ftstat, "Flowtable statistics for IPv6 "
"(struct flowtable_stat, net/flowtable.h)");
/*
 * Per-vnet initialization of the IPv6 flowtable: pick the table size
 * from the "net.flowtable.ip6.size" tunable (or the default), attach the
 * IPv6 statistics and allocate the table.
 */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{
	struct flowtable *ft = &V_ip6_ft;

	ft->ft_size = flowtable_get_size("net.flowtable.ip6.size");
	ft->ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */
#ifdef DDB
/*
 * DDB helper: return the occupancy bitmask of flowtable ft that belongs
 * to CPU cpuid, obtained by applying zpcpu_get_cpu() to the pointer
 * stored in ft->ft_masks.
 *
 * NOTE(review): ft_masks is dereferenced before the per-CPU lookup is
 * applied; confirm this matches how the masks are allocated.
 */
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
}
static struct flist *
flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
2014-02-08 22:10:53 +00:00
return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
}
/*
 * Print one flow entry to the DDB console.
 *
 * ft selects how the key is decoded: for the IPv4 table the key words are
 * printed as dotted-quad addresses (plus ports when FLOWTABLE_HASH_ALL is
 * defined), for the IPv6 table they are printed as raw hex words.  The
 * key is followed by the hash, the idle time in seconds, the FIB number
 * and the route pointer, and then by the names of selected flow, route
 * and interface flags that are set.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;
	uint32_t *hashkey = fle->f_key;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	/* The interface is only reachable through a non-NULL route. */
	if (rt != NULL)
		ifp = rt->rt_ifp;

#ifdef INET
	if (ft == &V_ip4_ft) {
		/* 4 * sizeof("123") == 16: dotted quad plus terminator. */
		char daddr[4*sizeof "123"];
#ifdef FLOWTABLE_HASH_ALL
		char saddr[4*sizeof "123"];
		uint16_t sport, dport;
#endif

		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
#ifdef FLOWTABLE_HASH_ALL
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		dport = ntohs((uint16_t)(hashkey[2] >> 16));
		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
		/* Trailing space keeps the key apart from "hash=" below. */
		db_printf("%s:%d->%s:%d ", saddr, sport, daddr, dport);
#else
		db_printf("%s ", daddr);
#endif
	}
#endif /* INET */
#ifdef INET6
	if (ft == &V_ip6_ft) {
#ifdef FLOWTABLE_HASH_ALL
		/* Trailing space keeps the key apart from "hash=" below. */
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
#else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
#endif
	}
#endif /* INET6 */

	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);

#ifdef FLOWTABLE_HASH_ALL
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
#endif
	if (rt != NULL) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp != NULL) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	db_printf("\n");
}
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
int curbit = 0;
bitstr_t *mask, *tmpmask;
if (cpuid != -1)
db_printf("cpu: %d\n", cpuid);
mask = flowtable_mask_pcpu(ft, cpuid);
tmpmask = ft->ft_tmpmask;
memcpy(tmpmask, mask, ft->ft_size/8);
/*
* XXX Note to self, bit_ffs operates at the byte level
* and thus adds gratuitous overhead
*/
bit_ffs(tmpmask, ft->ft_size, &curbit);
while (curbit != -1) {
struct flist *flist;
struct flentry *fle;
if (curbit >= ft->ft_size || curbit < -1) {
db_printf("warning: bad curbit value %d \n",
curbit);
break;
}
flist = flowtable_list_pcpu(ft, curbit, cpuid);
SLIST_FOREACH(fle, flist, f_next)
flow_show(ft, fle);
bit_clear(tmpmask, curbit);
bit_ffs(tmpmask, ft->ft_size, &curbit);
}
}
static void
flowtable_show_vnet(struct flowtable *ft)
{
int i;
CPU_FOREACH(i)
flowtable_show(ft, i);
}
/*
 * DDB command "flowtables": for every vnet, print the IPv4 and/or IPv6
 * flowtables that are compiled in.
 *
 * All output goes through db_printf() so that the section headers are
 * emitted the same way as the rest of the debugger output produced by
 * the helpers called from here.
 */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
#ifdef VIMAGE
		db_printf("vnet %p\n", vnet_iter);
#endif
#ifdef INET
		db_printf("IPv4:\n");
		flowtable_show_vnet(&V_ip4_ft);
#endif
#ifdef INET6
		db_printf("IPv6:\n");
		flowtable_show_vnet(&V_ip6_ft);
#endif
		CURVNET_RESTORE();
	}
}
#endif