diff --git a/share/man/man9/netisr.9 b/share/man/man9/netisr.9 index 63112be4c1b1..ac648d100d78 100644 --- a/share/man/man9/netisr.9 +++ b/share/man/man9/netisr.9 @@ -27,7 +27,7 @@ .\" .\" $FreeBSD$ .\" -.Dd January 11, 2015 +.Dd June 3, 2016 .Dt NETISR 9 .Os .Sh NAME @@ -61,6 +61,16 @@ .Fn netisr_get_cpucount "void" .Ft u_int .Fn netisr_get_cpuid "u_int cpunumber" +.Pp +With optional virtual network stack support enabled via the following kernel +compile option: +.Bd -ragged -offset indent +.Cd "options VIMAGE" +.Ed +.Ft void +.Fn netisr_register_vnet "const struct netisr_handler *nhp" +.Ft void +.Fn netisr_unregister_vnet "const struct netisr_handler *nhp" .Sh DESCRIPTION The .Nm @@ -80,6 +90,16 @@ and may also manage queue limits and statistics using the and .Fn netisr_setqlimit . .Pp +In case of VIMAGE kernels each virtual network stack (vnet), that is not the +default base system network stack, calls +.Fn netisr_register_vnet +and +.Fn netisr_unregister_vnet +to enable or disable packet processing by the +.Nm +for each protocol. +Disabling will also purge any outstanding packet from the protocol queue. 
+.Pp .Nm supports multi-processor execution of handlers, and relies on a combination of source ordering and protocol-specific ordering and work-placement diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c index 943776c80d5c..949a4174de19 100644 --- a/sys/net/if_epair.c +++ b/sys/net/if_epair.c @@ -959,6 +959,9 @@ vnet_epair_init(const void *unused __unused) V_epair_cloner = if_clone_advanced(epairname, 0, epair_clone_match, epair_clone_create, epair_clone_destroy); +#ifdef VIMAGE + netisr_register_vnet(&epair_nh); +#endif } VNET_SYSINIT(vnet_epair_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_epair_init, NULL); @@ -967,6 +970,9 @@ static void vnet_epair_uninit(const void *unused __unused) { +#ifdef VIMAGE + netisr_unregister_vnet(&epair_nh); +#endif if_clone_detach(V_epair_cloner); } VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 9346aecb9f82..2542ab68afd4 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -702,12 +702,16 @@ vnet_ether_init(__unused void *arg) if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil link hook, " "error %d\n", __func__, i); +#ifdef VIMAGE + netisr_register_vnet(&ether_nh); +#endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); +#ifdef VIMAGE static void -vnet_ether_destroy(__unused void *arg) +vnet_ether_pfil_destroy(__unused void *arg) { int i; @@ -715,8 +719,18 @@ vnet_ether_destroy(__unused void *arg) printf("%s: WARNING: unable to unregister pfil link hook, " "error %d\n", __func__, i); } +VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, + vnet_ether_pfil_destroy, NULL); + +static void +vnet_ether_destroy(__unused void *arg) +{ + + netisr_unregister_vnet(&ether_nh); +} VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); +#endif @@ -740,7 +754,9 @@ ether_input(struct 
ifnet *ifp, struct mbuf *m) * so assert it is correct here. */ KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__)); + CURVNET_SET_QUIET(ifp->if_vnet); netisr_dispatch(NETISR_ETHER, m); + CURVNET_RESTORE(); m = mn; } } diff --git a/sys/net/netisr.c b/sys/net/netisr.c index 492a8512d978..318e1a3f6e0b 100644 --- a/sys/net/netisr.c +++ b/sys/net/netisr.c @@ -210,6 +210,23 @@ SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, */ static struct netisr_proto netisr_proto[NETISR_MAXPROT]; +#ifdef VIMAGE +/* + * The netisr_enable array describes a per-VNET flag for registered + * protocols on whether this netisr is active in this VNET or not. + * netisr_register() will automatically enable the netisr for the + * default VNET and all currently active instances. + * netisr_unregister() will disable all active VNETs, including vnet0. + * Individual network stack instances can be enabled/disabled by the + * netisr_(un)register_vnet() functions. + * With this we keep the one netisr_proto per protocol but add a + * mechanism to stop netisr processing for vnet teardown. + * Apart from that we expect a VNET to always be enabled. + */ +static VNET_DEFINE(u_int, netisr_enable[NETISR_MAXPROT]); +#define V_netisr_enable VNET(netisr_enable) +#endif + /* * Per-CPU workstream data. See netisr_internal.h for more details. */ @@ -352,6 +369,7 @@ sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) void netisr_register(const struct netisr_handler *nhp) { + VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; const char *name; u_int i, proto; @@ -420,6 +438,22 @@ netisr_register(const struct netisr_handler *nhp) bzero(npwp, sizeof(*npwp)); npwp->nw_qlimit = netisr_proto[proto].np_qlimit; } + +#ifdef VIMAGE + /* + * Test that we are in vnet0 and have a curvnet set. 
+ */ + KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); + KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p", + __func__, curvnet, vnet0)); + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_netisr_enable[proto] = 1; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +#endif NETISR_WUNLOCK(); } @@ -584,6 +618,7 @@ netisr_drain_proto(struct netisr_work *npwp) void netisr_unregister(const struct netisr_handler *nhp) { + VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; #ifdef INVARIANTS const char *name; @@ -602,6 +637,16 @@ netisr_unregister(const struct netisr_handler *nhp) ("%s(%u): protocol not registered for %s", __func__, proto, name)); +#ifdef VIMAGE + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_netisr_enable[proto] = 0; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +#endif + netisr_proto[proto].np_name = NULL; netisr_proto[proto].np_handler = NULL; netisr_proto[proto].np_m2flow = NULL; @@ -616,6 +661,97 @@ netisr_unregister(const struct netisr_handler *nhp) NETISR_WUNLOCK(); } +#ifdef VIMAGE +void +netisr_register_vnet(const struct netisr_handler *nhp) +{ + u_int proto; + + proto = nhp->nh_proto; + + KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); + NETISR_WLOCK(); + KASSERT(netisr_proto[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + nhp->nh_name)); + + V_netisr_enable[proto] = 1; + NETISR_WUNLOCK(); +} + +static void +netisr_drain_proto_vnet(struct vnet *vnet, u_int proto) +{ + struct netisr_workstream *nwsp; + struct netisr_work *npwp; + struct mbuf *m, *mp, *n, *ne; + u_int i; + + KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__)); + NETISR_LOCK_ASSERT(); + + CPU_FOREACH(i) { + nwsp = DPCPU_ID_PTR(i, nws); + if (nwsp->nws_intr_event == NULL) + continue; + npwp = 
&nwsp->nws_work[proto]; + NWS_LOCK(nwsp); + + /* + * Rather than dissecting and removing mbufs from the middle + * of the chain, we build a new chain if the packet stays and + * update the head and tail pointers at the end. All packets + * matching the given vnet are freed. + */ + m = npwp->nw_head; + n = ne = NULL; + while (m != NULL) { + mp = m; + m = m->m_nextpkt; + mp->m_nextpkt = NULL; + if (mp->m_pkthdr.rcvif->if_vnet != vnet) { + if (n == NULL) { + n = ne = mp; + } else { + ne->m_nextpkt = mp; + ne = mp; + } + continue; + } + /* This is a packet in the selected vnet. Free it. */ + npwp->nw_len--; + m_freem(mp); + } + npwp->nw_head = n; + npwp->nw_tail = ne; + NWS_UNLOCK(nwsp); + } +} + +void +netisr_unregister_vnet(const struct netisr_handler *nhp) +{ + u_int proto; + + proto = nhp->nh_proto; + + KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); + NETISR_WLOCK(); + KASSERT(netisr_proto[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + nhp->nh_name)); + + V_netisr_enable[proto] = 0; + + netisr_drain_proto_vnet(curvnet, proto); + NETISR_WUNLOCK(); +} +#endif + /* * Compose the global and per-protocol policies on dispatch, and return the * dispatch policy to use. 
@@ -906,6 +1042,13 @@ netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); +#ifdef VIMAGE + if (V_netisr_enable[proto] == 0) { + m_freem(m); + return (ENOPROTOOPT); + } +#endif + m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, source, m, &cpuid); if (m != NULL) { @@ -952,6 +1095,13 @@ netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); +#ifdef VIMAGE + if (V_netisr_enable[proto] == 0) { + m_freem(m); + return (ENOPROTOOPT); + } +#endif + dispatch_policy = netisr_get_dispatch(npp); if (dispatch_policy == NETISR_DISPATCH_DEFERRED) return (netisr_queue_src(proto, source, m)); diff --git a/sys/net/netisr.h b/sys/net/netisr.h index 94a6cc479c5f..63764a74f2eb 100644 --- a/sys/net/netisr.h +++ b/sys/net/netisr.h @@ -210,6 +210,10 @@ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp); void netisr_register(const struct netisr_handler *nhp); int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit); void netisr_unregister(const struct netisr_handler *nhp); +#ifdef VIMAGE +void netisr_register_vnet(const struct netisr_handler *nhp); +void netisr_unregister_vnet(const struct netisr_handler *nhp); +#endif /* * Process a packet destined for a protocol, and attempt direct dispatch. 
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index c0746035e3a3..91158b0fccc9 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -191,15 +191,33 @@ SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, "maximum routing socket dispatch queue length"); static void -rts_init(void) +vnet_rts_init(void) { int tmp; - if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) - rtsock_nh.nh_qlimit = tmp; - netisr_register(&rtsock_nh); + if (IS_DEFAULT_VNET(curvnet)) { + if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) + rtsock_nh.nh_qlimit = tmp; + netisr_register(&rtsock_nh); + } +#ifdef VIMAGE + else + netisr_register_vnet(&rtsock_nh); +#endif } -SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); +VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_rts_init, 0); + +#ifdef VIMAGE +static void +vnet_rts_uninit(void) +{ + + netisr_unregister_vnet(&rtsock_nh); +} +VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_rts_uninit, 0); +#endif static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 48fae92ae5fe..1a23390b2b54 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -143,7 +143,6 @@ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, } while (0) -static void arp_init(void); static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET @@ -1337,12 +1336,33 @@ arp_iflladdr(void *arg __unused, struct ifnet *ifp) } static void -arp_init(void) +vnet_arp_init(void) { - netisr_register(&arp_nh); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { + netisr_register(&arp_nh); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); + } +#ifdef VIMAGE + else + netisr_register_vnet(&arp_nh); +#endif } -SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); +VNET_SYSINIT(vnet_arp_init, 
SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, + vnet_arp_init, 0); + +#ifdef VIMAGE +/* + * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH + * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5. + */ +static void +vnet_arp_destroy(__unused void *arg) +{ + + netisr_unregister_vnet(&arp_nh); +} +VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_arp_destroy, NULL); +#endif diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 9afb8d66b381..f30913e49ab8 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -331,8 +331,15 @@ ip_init(void) __func__); /* Skip initialization of globals for non-default instances. */ - if (!IS_DEFAULT_VNET(curvnet)) +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) { + netisr_register_vnet(&ip_nh); +#ifdef RSS + netisr_register_vnet(&ip_direct_nh); +#endif return; + } +#endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) @@ -366,6 +373,11 @@ ip_destroy(void *unused __unused) { int error; +#ifdef RSS + netisr_unregister_vnet(&ip_direct_nh); +#endif + netisr_unregister_vnet(&ip_nh); + if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c index d7fc9eeaf33a..a897d6c3e1d3 100644 --- a/sys/netinet6/ip6_input.c +++ b/sys/netinet6/ip6_input.c @@ -217,8 +217,15 @@ ip6_init(void) V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. 
*/ - if (!IS_DEFAULT_VNET(curvnet)) +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) { + netisr_register_vnet(&ip6_nh); +#ifdef RSS + netisr_register_vnet(&ip6_direct_nh); +#endif return; + } +#endif pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) @@ -310,6 +317,11 @@ ip6_destroy(void *unused __unused) { int error; +#ifdef RSS + netisr_unregister_vnet(&ip6_direct_nh); +#endif + netisr_unregister_vnet(&ip6_nh); + if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error);