o Axe non-pcpu flowtable implementation. It wasn't enabled or used,

and probably is a leftover from first prototyping by Kip. The
  non-pcpu implementation used mutexes, so it is doubtful that it
  worked better than a simple routing lookup.
o Use UMA_ZONE_PCPU zone for pointers instead of [MAXCPU] arrays,
  use zpcpu_get() to access data in there.
o Substitute own single list implementation with SLIST(). This
  has two functional side effects:
  - new flows go to the head of a list; previously they went to the tail.
  - a bug when incorrect flow was deleted in flow cleaner is
    fixed.
o Due to cache line alignment, there is no reason to keep
  different zones for IPv4 and IPv6 flows. Both consume one
  cache line, real size of allocation is equal.
o Rely on the fact that f_hash, f_rt, f_lle are stable during fle
  lifetime; remove useless volatile qualifiers.
o More INET/INET6 splitting.

Reviewed by:	adrian
Sponsored by:	Netflix
Sponsored by:	Nginx, Inc.
This commit is contained in:
glebius 2014-02-13 04:59:18 +00:00
parent 796712ba61
commit 7d1964f9ec
2 changed files with 193 additions and 422 deletions

View File

@ -47,13 +47,16 @@ __FBSDID("$FreeBSD$");
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_llatbl.h>
@ -76,6 +79,7 @@ __FBSDID("$FreeBSD$");
#include <ddb/ddb.h>
#ifdef INET
struct ipv4_tuple {
uint16_t ip_sport; /* source port */
uint16_t ip_dport; /* destination port */
@ -87,7 +91,9 @@ union ipv4_flow {
struct ipv4_tuple ipf_ipt;
uint32_t ipf_key[3];
};
#endif
#ifdef INET6
struct ipv6_tuple {
uint16_t ip_sport; /* source port */
uint16_t ip_dport; /* destination port */
@ -99,28 +105,44 @@ union ipv6_flow {
struct ipv6_tuple ipf_ipt;
uint32_t ipf_key[9];
};
#endif
struct flentry {
volatile uint32_t f_fhash; /* hash flowing forward */
uint32_t f_fhash; /* hash flowing forward */
uint16_t f_flags; /* flow flags */
uint8_t f_pad;
uint8_t f_proto; /* protocol */
uint32_t f_fibnum; /* fib index */
uint32_t f_uptime; /* uptime at last access */
struct flentry *f_next; /* pointer to collision entry */
volatile struct rtentry *f_rt; /* rtentry for flow */
volatile struct llentry *f_lle; /* llentry for flow */
SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */
struct rtentry *f_rt; /* rtentry for flow */
struct llentry *f_lle; /* llentry for flow */
union {
#ifdef INET
union ipv4_flow v4;
#endif
#ifdef INET6
union ipv6_flow v6;
#endif
} f_flow;
#define f_flow4 f_flow.v4
#define f_flow6 f_flow.v6
};
#define KEYLEN(flags) ((((flags) & FL_IPV6) ? 9 : 3) * 4)
struct flentry_v4 {
struct flentry fl_entry;
union ipv4_flow fl_flow;
};
/* Make sure f_flow begins with key. */
#ifdef INET
CTASSERT(offsetof(struct flentry, f_flow) ==
offsetof(struct flentry, f_flow4.ipf_key));
#endif
#ifdef INET6
CTASSERT(offsetof(struct flentry, f_flow) ==
offsetof(struct flentry, f_flow6.ipf_key));
#endif
struct flentry_v6 {
struct flentry fl_entry;
union ipv6_flow fl_flow;
};
SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));
#define SECS_PER_HOUR 3600
#define SECS_PER_DAY (24*SECS_PER_HOUR)
@ -130,37 +152,28 @@ struct flentry_v6 {
#define FIN_WAIT_IDLE 600
#define TCP_IDLE SECS_PER_DAY
typedef void fl_lock_t(struct flowtable *, uint32_t);
union flentryp {
struct flentry **global;
struct flentry **pcpu[MAXCPU];
};
struct flowtable {
counter_u64_t *ft_stat;
uma_zone_t ft_zone;
int ft_size;
int ft_lock_count;
uint32_t ft_flags;
uint32_t ft_max_depth;
fl_lock_t *ft_lock;
fl_lock_t *ft_unlock;
/*
* XXX need to pad out
* ft_table is a malloc(9)ed array of pointers. Pointers point to
* memory from UMA_ZONE_PCPU zone.
* ft_masks is per-cpu pointer itself. Each instance points
* to a malloc(9)ed bitset, that is private to corresponding CPU.
*/
struct mtx *ft_locks;
union flentryp ft_table;
bitstr_t *ft_masks[MAXCPU];
struct flist **ft_table;
bitstr_t **ft_masks;
bitstr_t *ft_tmpmask;
uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
uint32_t ft_udp_idle;
uint32_t ft_fin_wait_idle;
uint32_t ft_syn_idle;
uint32_t ft_tcp_idle;
boolean_t ft_full;
} __aligned(CACHE_LINE_SIZE);
};
#define FLOWSTAT_ADD(ft, name, v) \
counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
@ -190,15 +203,15 @@ static uint32_t flowclean_freq;
*/
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define V_ip4_ft VNET(ip4_ft)
static uma_zone_t flow_ipv4_zone;
#define V_ip4_ft VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define V_ip6_ft VNET(ip6_ft)
static uma_zone_t flow_ipv6_zone;
#endif
static uma_zone_t flow_zone;
static VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
@ -215,6 +228,8 @@ static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
"flowtable");
SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
&VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
&flow_zone, "Maximum number of flows allowed");
/*
* XXX This does not end up updating timeouts at runtime
@ -233,43 +248,10 @@ SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
&VNET_NAME(flowtable_tcp_expire), 0,
"seconds after which to remove flow allocated to a TCP connection.");
static void
flowtable_global_lock(struct flowtable *table, uint32_t hash)
{
int lock_index = (hash)&(table->ft_lock_count - 1);
mtx_lock(&table->ft_locks[lock_index]);
}
static void
flowtable_global_unlock(struct flowtable *table, uint32_t hash)
{
int lock_index = (hash)&(table->ft_lock_count - 1);
mtx_unlock(&table->ft_locks[lock_index]);
}
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{
critical_enter();
}
static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{
critical_exit();
}
#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
#define FL_STALE (1<<8)
static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
static struct flentry *flowtable_lookup_common(struct flowtable *,
struct sockaddr_storage *, struct sockaddr_storage *, struct mbuf *, int);
@ -320,27 +302,6 @@ flags_to_proto(int flags)
}
#ifdef INET
#ifdef FLOWTABLE_DEBUG
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
struct sockaddr_in *dsin)
{
char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
if (flags & FL_HASH_ALL) {
inet_ntoa_r(ssin->sin_addr, saddr);
inet_ntoa_r(dsin->sin_addr, daddr);
printf("proto=%d %s:%d->%s:%d\n",
proto, saddr, ntohs(ssin->sin_port), daddr,
ntohs(dsin->sin_port));
} else {
inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
printf("proto=%d %s\n", proto, daddr);
}
}
#endif
static int
ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin,
struct sockaddr_in *dsin, uint16_t *flags)
@ -456,10 +417,10 @@ flow_to_route(struct flentry *fle, struct route *ro)
sin = (struct sockaddr_in *)&ro->ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
hashkey = fle->f_flow4.ipf_key;
sin->sin_addr.s_addr = hashkey[2];
ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
ro->ro_rt = fle->f_rt;
ro->ro_lle = fle->f_lle;
ro->ro_flags |= RT_NORTREF;
}
#endif /* INET */
@ -661,10 +622,10 @@ flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
hashkey = fle->f_flow6.ipf_key;
memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
ro->ro_rt = fle->f_rt;
ro->ro_lle = fle->f_lle;
ro->ro_flags |= RT_NORTREF;
}
#endif /* INET6 */
@ -672,31 +633,24 @@ flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
bitstr_t *mask;
if (ft->ft_flags & FL_PCPU)
mask = ft->ft_masks[curcpu];
else
mask = ft->ft_masks[0];
/*
* flowtable_free_stale() calls w/o critical section, but
* with sched_bind(). Since pointer is stable throughout
* ft lifetime, it is safe, otherwise...
*
* CRITICAL_ASSERT(curthread);
*/
return (mask);
return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}
static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
static struct flist *
flowtable_list(struct flowtable *ft, uint32_t hash)
{
struct flentry **fle;
int index = (hash % ft->ft_size);
if (ft->ft_flags & FL_PCPU) {
KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
fle = &ft->ft_table.pcpu[curcpu][index];
} else {
KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
fle = &ft->ft_table.global[index];
}
return (fle);
CRITICAL_ASSERT(curthread);
return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
}
static int
@ -730,24 +684,6 @@ flow_stale(struct flowtable *ft, struct flentry *fle)
return (0);
}
static void
flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
{
uint32_t *hashkey;
int i, nwords;
if (fle->f_flags & FL_IPV6) {
nwords = 9;
hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
} else {
nwords = 3;
hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
}
for (i = 0; i < nwords; i++)
hashkey[i] = key[i];
}
static int
flow_full(struct flowtable *ft)
{
@ -755,8 +691,8 @@ flow_full(struct flowtable *ft)
int count, max;
full = ft->ft_full;
count = uma_zone_get_cur(ft->ft_zone);
max = uma_zone_get_max(ft->ft_zone);
count = uma_zone_get_cur(flow_zone);
max = uma_zone_get_max(flow_zone);
if (full && (count < (max - (max >> 3))))
ft->ft_full = FALSE;
@ -783,26 +719,31 @@ static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
uint32_t fibnum, struct route *ro, uint16_t flags)
{
struct flentry *fle, *fletail, *newfle, **flep;
struct flist *flist;
struct flentry *fle, *iter;
int depth;
bitstr_t *mask;
uint8_t proto;
newfle = uma_zalloc(ft->ft_zone, M_NOWAIT | M_ZERO);
if (newfle == NULL)
fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
if (fle == NULL)
return (ENOMEM);
newfle->f_flags |= (flags & FL_IPV6);
proto = flags_to_proto(flags);
bcopy(key, &fle->f_flow, KEYLEN(flags));
fle->f_flags |= (flags & FL_IPV6);
fle->f_proto = flags_to_proto(flags);
fle->f_rt = ro->ro_rt;
fle->f_lle = ro->ro_lle;
fle->f_fhash = hash;
fle->f_fibnum = fibnum;
fle->f_uptime = time_uptime;
FL_ENTRY_LOCK(ft, hash);
critical_enter();
mask = flowtable_mask(ft);
flep = flowtable_entry(ft, hash);
fletail = fle = *flep;
flist = flowtable_list(ft, hash);
if (fle == NULL) {
bit_set(mask, FL_ENTRY_INDEX(ft, hash));
*flep = fle = newfle;
if (SLIST_EMPTY(flist)) {
bit_set(mask, (hash % ft->ft_size));
SLIST_INSERT_HEAD(flist, fle, f_next);
goto skip;
}
@ -812,65 +753,30 @@ flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
* find end of list and make sure that we were not
* preempted by another thread handling this flow
*/
while (fle != NULL) {
if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
SLIST_FOREACH(iter, flist, f_next) {
if (iter->f_fhash == hash && !flow_stale(ft, iter)) {
/*
* there was either a hash collision
* or we lost a race to insert
*/
FL_ENTRY_UNLOCK(ft, hash);
uma_zfree(ft->ft_zone, newfle);
critical_exit();
uma_zfree(flow_zone, fle);
return (EEXIST);
}
/*
* re-visit this double condition XXX
*/
if (fletail->f_next != NULL)
fletail = fle->f_next;
depth++;
fle = fle->f_next;
}
if (depth > ft->ft_max_depth)
ft->ft_max_depth = depth;
fletail->f_next = newfle;
fle = newfle;
SLIST_INSERT_HEAD(flist, fle, f_next);
skip:
flowtable_set_hashkey(fle, key);
critical_exit();
fle->f_proto = proto;
fle->f_rt = ro->ro_rt;
fle->f_lle = ro->ro_lle;
fle->f_fhash = hash;
fle->f_fibnum = fibnum;
fle->f_uptime = time_uptime;
FL_ENTRY_UNLOCK(ft, hash);
return (0);
}
static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
uint32_t *hashkey;
int i, nwords;
if (fle->f_flags & FL_IPV6) {
nwords = 9;
hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
} else {
nwords = 3;
hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
}
for (i = 0; i < nwords; i++)
if (hashkey[i] != key[i])
return (0);
return (1);
}
struct flentry *
flowtable_lookup(sa_family_t sa, struct mbuf *m)
{
@ -895,6 +801,7 @@ flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
{
struct route_in6 sro6;
struct route sro, *ro;
struct flist *flist;
struct flentry *fle;
struct rtentry *rt;
struct llentry *lle;
@ -974,34 +881,24 @@ flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
return (NULL);
FLOWSTAT_INC(ft, ft_lookups);
FL_ENTRY_LOCK(ft, hash);
if ((fle = FL_ENTRY(ft, hash)) == NULL) {
FL_ENTRY_UNLOCK(ft, hash);
goto uncached;
}
keycheck:
rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
lle = __DEVOLATILE(struct llentry *, fle->f_lle);
if ((rt != NULL)
&& lle != NULL
&& fle->f_fhash == hash
&& flowtable_key_equal(fle, key)
&& (proto == fle->f_proto)
&& (fibnum == fle->f_fibnum)
&& (rt->rt_flags & RTF_UP)
&& (rt->rt_ifp != NULL)
&& (lle->la_flags & LLE_VALID)) {
FLOWSTAT_INC(ft, ft_hits);
fle->f_uptime = time_uptime;
fle->f_flags |= flags;
FL_ENTRY_UNLOCK(ft, hash);
goto success;
} else if (fle->f_next != NULL) {
fle = fle->f_next;
goto keycheck;
}
FL_ENTRY_UNLOCK(ft, hash);
uncached:
critical_enter();
flist = flowtable_list(ft, hash);
SLIST_FOREACH(fle, flist, f_next)
if (fle->f_fhash == hash && bcmp(&fle->f_flow, key,
KEYLEN(fle->f_flags)) == 0 &&
proto == fle->f_proto && fibnum == fle->f_fibnum &&
(fle->f_rt->rt_flags & RTF_UP) &&
fle->f_rt->rt_ifp != NULL &&
(fle->f_lle->la_flags & LLE_VALID)) {
fle->f_uptime = time_uptime;
fle->f_flags |= flags;
critical_exit();
FLOWSTAT_INC(ft, ft_hits);
goto success;
}
critical_exit();
if (flags & FL_NOAUTO || flow_full(ft))
return (NULL);
@ -1088,38 +985,22 @@ flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
/*
* used by the bit_alloc macro
*/
#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
static void
flowtable_alloc(struct flowtable *ft)
{
if (ft->ft_flags & FL_PCPU) {
ft->ft_lock = flowtable_pcpu_lock;
ft->ft_unlock = flowtable_pcpu_unlock;
ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
M_FTABLE, M_WAITOK);
for (int i = 0; i < ft->ft_size; i++)
ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
for (int i = 0; i <= mp_maxid; i++) {
ft->ft_table.pcpu[i] =
malloc(ft->ft_size * sizeof(struct flentry *),
M_RTABLE, M_WAITOK | M_ZERO);
ft->ft_masks[i] = bit_alloc(ft->ft_size);
}
} else {
ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
(fls(mp_maxid + 1) << 1));
ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
for (int i = 0; i < mp_ncpus; i++) {
bitstr_t **b;
ft->ft_lock = flowtable_global_lock;
ft->ft_unlock = flowtable_global_unlock;
ft->ft_table.global =
malloc(ft->ft_size * sizeof(struct flentry *),
M_RTABLE, M_WAITOK | M_ZERO);
ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
M_RTABLE, M_WAITOK | M_ZERO);
for (int i = 0; i < ft->ft_lock_count; i++)
mtx_init(&ft->ft_locks[i], "flow", NULL,
MTX_DEF | MTX_DUPOK);
ft->ft_masks[0] = bit_alloc(ft->ft_size);
b = zpcpu_get_cpu(ft->ft_masks, i);
*b = bit_alloc(ft->ft_size);
}
ft->ft_tmpmask = bit_alloc(ft->ft_size);
@ -1139,41 +1020,22 @@ flowtable_alloc(struct flowtable *ft)
}
}
/*
* The rest of the code is devoted to garbage collection of expired entries.
* It is a new addition made necessary by the switch to dynamically allocating
* flow tables.
*
*/
static void
fle_free(struct flentry *fle, struct flowtable *ft)
{
struct rtentry *rt;
struct llentry *lle;
rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
lle = __DEVOLATILE(struct llentry *, fle->f_lle);
if (rt != NULL)
RTFREE(rt);
if (lle != NULL)
LLE_FREE(lle);
uma_zfree(ft->ft_zone, fle);
}
#undef calloc
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
int curbit = 0, tmpsize;
struct flentry *fle, **flehead, *fleprev;
struct flentry *flefreehead, *flefreetail, *fletmp;
struct flist *flist, freelist;
struct flentry *fle, *fle1, *fleprev;
bitstr_t *mask, *tmpmask;
int curbit, tmpsize;
flefreehead = flefreetail = NULL;
SLIST_INIT(&freelist);
mask = flowtable_mask(ft);
tmpmask = ft->ft_tmpmask;
tmpsize = ft->ft_size;
memcpy(tmpmask, mask, ft->ft_size/8);
curbit = 0;
/*
* XXX Note to self, bit_ffs operates at the byte level
* and thus adds gratuitous overhead
@ -1187,69 +1049,72 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
break;
}
FL_ENTRY_LOCK(ft, curbit);
flehead = flowtable_entry(ft, curbit);
fle = fleprev = *flehead;
FLOWSTAT_INC(ft, ft_free_checks);
critical_enter();
flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
if (fle == NULL && curbit > 0) {
if (SLIST_EMPTY(flist) && curbit > 0) {
log(LOG_ALERT,
"warning bit=%d set, but no fle found\n",
curbit);
}
#endif
while (fle != NULL) {
if (rt != NULL) {
if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
fleprev = fle;
fle = fle->f_next;
continue;
}
} else if (!flow_stale(ft, fle)) {
SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
if (rt != NULL && fle->f_rt != rt) {
fleprev = fle;
fle = fle->f_next;
continue;
}
/*
* delete head of the list
*/
if (fleprev == *flehead) {
fletmp = fleprev;
if (fle == fleprev) {
fleprev = *flehead = fle->f_next;
} else
fleprev = *flehead = fle;
fle = fle->f_next;
} else {
/*
* don't advance fleprev
*/
fletmp = fle;
fleprev->f_next = fle->f_next;
fle = fleprev->f_next;
if (!flow_stale(ft, fle)) {
fleprev = fle;
continue;
}
if (flefreehead == NULL)
flefreehead = flefreetail = fletmp;
else {
flefreetail->f_next = fletmp;
flefreetail = fletmp;
}
fletmp->f_next = NULL;
if (fle == SLIST_FIRST(flist))
SLIST_REMOVE_HEAD(flist, f_next);
else
SLIST_REMOVE_AFTER(fleprev, f_next);
SLIST_INSERT_HEAD(&freelist, fle, f_next);
}
if (*flehead == NULL)
if (SLIST_EMPTY(flist))
bit_clear(mask, curbit);
FL_ENTRY_UNLOCK(ft, curbit);
critical_exit();
bit_clear(tmpmask, curbit);
tmpmask += (curbit / 8);
tmpsize -= (curbit / 8) * 8;
bit_ffs(tmpmask, tmpsize, &curbit);
}
while ((fle = flefreehead) != NULL) {
flefreehead = fle->f_next;
SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
FLOWSTAT_INC(ft, ft_frees);
fle_free(fle, ft);
if (fle->f_rt != NULL)
RTFREE(fle->f_rt);
if (fle->f_lle != NULL)
LLE_FREE(fle->f_lle);
uma_zfree(flow_zone, fle);
}
}
static void
flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt)
{
int i;
CPU_FOREACH(i) {
if (smp_started == 1) {
thread_lock(curthread);
sched_bind(curthread, i);
thread_unlock(curthread);
}
flowtable_free_stale(ft, rt);
if (smp_started == 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
}
}
@ -1257,7 +1122,6 @@ void
flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
struct flowtable *ft;
int i;
switch (sa) {
#ifdef INET
@ -1274,51 +1138,7 @@ flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
panic("%s: sa %d", __func__, sa);
}
if (ft->ft_flags & FL_PCPU) {
CPU_FOREACH(i) {
if (smp_started == 1) {
thread_lock(curthread);
sched_bind(curthread, i);
thread_unlock(curthread);
}
flowtable_free_stale(ft, rt);
if (smp_started == 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
}
} else {
flowtable_free_stale(ft, rt);
}
}
static void
flowtable_clean_vnet(struct flowtable *ft)
{
if (ft->ft_flags & FL_PCPU) {
int i;
CPU_FOREACH(i) {
if (smp_started == 1) {
thread_lock(curthread);
sched_bind(curthread, i);
thread_unlock(curthread);
}
flowtable_free_stale(ft, NULL);
if (smp_started == 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
}
} else
flowtable_free_stale(ft, NULL);
flowtable_clean_vnet(ft, rt);
}
static void
@ -1335,10 +1155,10 @@ flowtable_cleaner(void)
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
#ifdef INET
flowtable_clean_vnet(&V_ip4_ft);
flowtable_clean_vnet(&V_ip4_ft, NULL);
#endif
#ifdef INET6
flowtable_clean_vnet(&V_ip6_ft);
flowtable_clean_vnet(&V_ip6_ft, NULL);
#endif
CURVNET_RESTORE();
}
@ -1408,16 +1228,9 @@ flowtable_init(const void *unused __unused)
flow_hashjitter = arc4random();
#ifdef INET
flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
flow_zone = uma_zcreate("flows", sizeof(struct flentry),
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MAXBUCKET);
uma_zone_set_max(flow_ipv4_zone, 1024 + maxusers * 64 * mp_ncpus);
#endif
#ifdef INET6
flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MAXBUCKET);
uma_zone_set_max(flow_ipv6_zone, 1024 + maxusers * 64 * mp_ncpus);
#endif
uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
cv_init(&flowclean_c_cv, "c_flowcleanwait");
cv_init(&flowclean_f_cv, "f_flowcleanwait");
@ -1432,8 +1245,6 @@ SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
#ifdef INET
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
"Flowtable for IPv4");
SYSCTL_UMA_MAX(_net_flowtable_ip4, OID_AUTO, maxflows, CTLFLAG_RW,
&flow_ipv4_zone, "Maximum number of IPv4 flows allowed");
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
@ -1446,9 +1257,7 @@ static void
flowtable_init_vnet_v4(const void *unused __unused)
{
V_ip4_ft.ft_zone = flow_ipv4_zone;
V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
V_ip4_ft.ft_flags = FL_PCPU;
V_ip4_ft.ft_stat = VNET(ip4_ftstat);
flowtable_alloc(&V_ip4_ft);
}
@ -1459,8 +1268,6 @@ VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
#ifdef INET6
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
"Flowtable for IPv6");
SYSCTL_UMA_MAX(_net_flowtable_ip6, OID_AUTO, maxflows, CTLFLAG_RW,
&flow_ipv6_zone, "Maximum number of IPv6 flows allowed");
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
@ -1473,9 +1280,7 @@ static void
flowtable_init_vnet_v6(const void *unused __unused)
{
V_ip6_ft.ft_zone = flow_ipv6_zone;
V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
V_ip6_ft.ft_flags = FL_PCPU;
V_ip6_ft.ft_stat = VNET(ip6_ftstat);
flowtable_alloc(&V_ip6_ft);
}
@ -1484,45 +1289,18 @@ VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
#endif /* INET6 */
#ifdef DDB
static uint32_t *
flowtable_get_hashkey(struct flentry *fle)
{
uint32_t *hashkey;
if (fle->f_flags & FL_IPV6)
hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
else
hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
return (hashkey);
}
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
bitstr_t *mask;
if (ft->ft_flags & FL_PCPU)
mask = ft->ft_masks[cpuid];
else
mask = ft->ft_masks[0];
return (mask);
return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
}
static struct flentry **
flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
static struct flist *
flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
struct flentry **fle;
int index = (hash % ft->ft_size);
if (ft->ft_flags & FL_PCPU) {
fle = &ft->ft_table.pcpu[cpuid][index];
} else {
fle = &ft->ft_table.global[index];
}
return (fle);
return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
}
static void
@ -1542,7 +1320,7 @@ flow_show(struct flowtable *ft, struct flentry *fle)
if (rt_valid)
ifp = rt->rt_ifp;
ifp_valid = ifp != NULL;
hashkey = flowtable_get_hashkey(fle);
hashkey = (uint32_t *)&fle->f_flow;
if (fle->f_flags & FL_IPV6)
goto skipaddr;
@ -1594,7 +1372,6 @@ static void
flowtable_show(struct flowtable *ft, int cpuid)
{
int curbit = 0;
struct flentry *fle, **flehead;
bitstr_t *mask, *tmpmask;
if (cpuid != -1)
@ -1608,20 +1385,19 @@ flowtable_show(struct flowtable *ft, int cpuid)
*/
bit_ffs(tmpmask, ft->ft_size, &curbit);
while (curbit != -1) {
struct flist *flist;
struct flentry *fle;
if (curbit >= ft->ft_size || curbit < -1) {
db_printf("warning: bad curbit value %d \n",
curbit);
break;
}
flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
fle = *flehead;
flist = flowtable_list_pcpu(ft, curbit, cpuid);
while (fle != NULL) {
SLIST_FOREACH(fle, flist, f_next)
flow_show(ft, fle);
fle = fle->f_next;
continue;
}
bit_clear(tmpmask, curbit);
bit_ffs(tmpmask, ft->ft_size, &curbit);
}
@ -1631,14 +1407,10 @@ static void
flowtable_show_vnet(struct flowtable *ft)
{
if (ft->ft_flags & FL_PCPU) {
int i;
int i;
CPU_FOREACH(i) {
flowtable_show(ft, i);
}
} else
flowtable_show(ft, -1);
CPU_FOREACH(i)
flowtable_show(ft, i);
}
DB_SHOW_COMMAND(flowtables, db_show_flowtables)

View File

@ -44,7 +44,6 @@ struct flowtable_stat {
#ifdef _KERNEL
#define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */
#define FL_PCPU (1<<1) /* pcpu cache */
#define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */
#define FL_IPV6 (1<<9)