From 2bbab0af6dd5aa825fbe86813d917565bb885b67 Mon Sep 17 00:00:00 2001 From: "Alexander V. Chernikov" Date: Sat, 23 May 2020 10:21:02 +0000 Subject: [PATCH] Use epoch(9) for rtentries to simplify control plane operations. Currently the only reason of refcounting rtentries is the need to report the rtable operation details immediately after the execution. Delaying rtentry reclamation allows to stop refcounting and simplify the code. Additionally, this change allows to reimplement rib_lookup_info(), which is used by some of the customers to get the matching prefix along with nexthops, in more efficient way. The change keeps per-vnet rtzone uma zone. It adds nh_vnet field to nhop_priv to be able to reliably set curvnet even during vnet teardown. Rest of the reference counting code will be removed in the D24867 . Differential Revision: https://reviews.freebsd.org/D24866 --- sys/fs/nfsclient/nfs_clvfsops.c | 3 + sys/net/if.c | 4 +- sys/net/route.c | 127 ++++++++++++++------------------ sys/net/route.h | 2 - sys/net/route/nhop.h | 1 + sys/net/route/nhop_ctl.c | 10 +++ sys/net/route/nhop_var.h | 1 + sys/net/route/route_var.h | 2 + sys/net/rtsock.c | 5 +- sys/netinet6/nd6.c | 3 + sys/netinet6/nd6_rtr.c | 30 ++++---- sys/nfs/bootp_subr.c | 3 + 12 files changed, 97 insertions(+), 94 deletions(-) diff --git a/sys/fs/nfsclient/nfs_clvfsops.c b/sys/fs/nfsclient/nfs_clvfsops.c index d68171e7ad2b..5490155da68c 100644 --- a/sys/fs/nfsclient/nfs_clvfsops.c +++ b/sys/fs/nfsclient/nfs_clvfsops.c @@ -465,18 +465,21 @@ nfs_mountroot(struct mount *mp) if (nd->mygateway.sin_len != 0 && nd->mygateway.sin_addr.s_addr != 0) { struct sockaddr_in mask, sin; + struct epoch_tracker et; bzero((caddr_t)&mask, sizeof(mask)); sin = mask; sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); /* XXX MRT use table 0 for this sort of thing */ + NET_EPOCH_ENTER(et); CURVNET_SET(TD_TO_VNET(td)); error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&sin, (struct sockaddr *)&nd->mygateway, (struct sockaddr *)&mask, RTF_UP | RTF_GATEWAY, NULL, RT_DEFAULT_FIB); CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); } diff --git a/sys/net/if.c b/sys/net/if.c index 2a80a0bc46e6..bccbba268b56 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1854,18 +1854,17 @@ ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, ifp = ifa->ifa_ifp; + NET_EPOCH_ENTER(et); bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; if (cmd == RTM_ADD) { /* explicitly specify (loopback) ifa */ if (info.rti_ifp != NULL) { - NET_EPOCH_ENTER(et); rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); if (rti_ifa != NULL) ifa_ref(rti_ifa); info.rti_ifa = rti_ifa; - NET_EPOCH_EXIT(et); } } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; @@ -1874,6 +1873,7 @@ ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); + NET_EPOCH_EXIT(et); if (rti_ifa != NULL) ifa_free(rti_ifa); diff --git a/sys/net/route.c b/sys/net/route.c index 1b966b691bd5..c21f3de66a57 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -120,9 +120,6 @@ VNET_PCPUSTAT_SYSUNINIT(rtstat); VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) -VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ -#define V_rttrash VNET(rttrash) - /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. @@ -148,6 +145,7 @@ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); +static void destroy_rtentry_epoch(epoch_context_t ctx); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); @@ -332,6 +330,16 @@ vnet_route_uninit(const void *unused __unused) } } + /* + * dom_rtdetach calls rt_table_destroy(), which + * schedules deletion for all rtentries, nexthops and control + * structures. Wait for the destruction callbacks to fire. + * Note that this should result in freeing all rtentries, but + * nexthops deletions will be scheduled for the next epoch run + * and will be completed after vnet teardown. + */ + epoch_drain_callbacks(net_epoch_preempt); + free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } @@ -449,41 +457,54 @@ rtfree(struct rtentry *rt) if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); - /* - * the rtentry must have been removed from the routing table - * so it is represented in rttrash.. remove that now. - */ - V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif - - /* Unreference nexthop */ - nhop_free(rt->rt_nhop); + epoch_call(net_epoch_preempt, destroy_rtentry_epoch, + &rt->rt_epoch_ctx); /* - * and the rtentry itself of course + * FALLTHROUGH to RT_UNLOCK() so the reporting functions + * have consistent behaviour of operating on unlocked entry. */ - uma_zfree(V_rtzone, rt); - return; } done: RT_UNLOCK(rt); } -/* - * Temporary RTFREE() function wrapper. - * Intended to use in control plane code to - * avoid exposing internal layout of 'struct rtentry'. - */ -void -rtfree_func(struct rtentry *rt) +static void +destroy_rtentry(struct rtentry *rt) { - RTFREE(rt); + /* + * At this moment rnh, nh_control may be already freed. + * nhop interface may have been migrated to a different vnet. + * Use vnet stored in the nexthop to delete the entry. + */ + CURVNET_SET(nhop_get_vnet(rt->rt_nhop)); + + /* Unreference nexthop */ + nhop_free(rt->rt_nhop); + + uma_zfree(V_rtzone, rt); + + CURVNET_RESTORE(); +} + +/* + * Epoch callback indicating rtentry is safe to destroy + */ +static void +destroy_rtentry_epoch(epoch_context_t ctx) +{ + struct rtentry *rt; + + rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); + + destroy_rtentry(rt); } /* @@ -546,7 +567,7 @@ rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, RT_LOCK(rt); flags = rt->rt_flags; - RTFREE_LOCKED(rt); + RT_UNLOCK(rt); RTSTAT_INC(rts_dynamic); @@ -1112,13 +1133,6 @@ rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) ifa = rt->rt_nhop->nh_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, rt->rt_nhop, info); - - /* - * One more rtentry floating around that is not - * linked to the routing table. rttrash will be decremented - * when RTFREE(rt) is eventually called. - */ - V_rttrash++; } @@ -1386,6 +1400,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((info->rti_flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); + NET_EPOCH_ASSERT(); dst = info->rti_info[RTAX_DST]; @@ -1580,13 +1595,10 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, ifa->ifa_rtrequest(RTM_ADD, rt, rt->rt_nhop, info); /* - * actually return a resultant rtentry and - * give the caller a single reference. + * actually return a resultant rtentry */ - if (ret_nrt) { + if (ret_nrt) *ret_nrt = rt; - RT_ADDREF(rt); - } rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); @@ -1622,15 +1634,13 @@ del_route(struct rib_head *rnh, struct rt_addrinfo *info, /* * If the caller wants it, then it can have it, - * but it's up to it to free the rtentry as we won't be - * doing it. + * the entry will be deleted after the end of the current epoch. */ - if (ret_nrt) { + if (ret_nrt) *ret_nrt = rt; - RT_UNLOCK(rt); - } else - RTFREE_LOCKED(rt); - + + RTFREE_LOCKED(rt); + return (0); } @@ -1736,10 +1746,8 @@ change_route_one(struct rib_head *rnh, struct rt_addrinfo *info, if ((nh_orig->nh_ifa != nh->nh_ifa) && nh_orig->nh_ifa->ifa_rtrequest) nh_orig->nh_ifa->ifa_rtrequest(RTM_DELETE, rt, nh_orig, info); - if (ret_nrt != NULL) { + if (ret_nrt != NULL) *ret_nrt = rt; - RT_ADDREF(rt); - } RT_UNLOCK(rt); @@ -1757,7 +1765,6 @@ static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt) { - struct epoch_tracker et; int error; /* Check if updated gateway exists */ @@ -1765,8 +1772,6 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, (info->rti_info[RTAX_GATEWAY] == NULL)) return (EINVAL); - NET_EPOCH_ENTER(et); - /* * route change is done in multiple steps, with dropping and * reacquiring lock. In the situations with multiple processes @@ -1779,7 +1784,6 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, if (error != EAGAIN) break; } - NET_EPOCH_EXIT(et); return (error); } @@ -1825,6 +1829,7 @@ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { RIB_RLOCK_TRACKER; + struct epoch_tracker et; struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; @@ -1957,38 +1962,18 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; + NET_EPOCH_ENTER(et); error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ - RT_LOCK(rt); /* TODO: interface routes/aliases */ - RT_ADDREF(rt); - RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, rt, fibnum); - RT_LOCK(rt); - RT_REMREF(rt); - if (cmd == RTM_DELETE) { - /* - * If we are deleting, and we found an entry, - * then it's been removed from the tree.. - * now throw it away. - */ - RTFREE_LOCKED(rt); - } else { - if (cmd == RTM_ADD) { - /* - * We just wanted to add it.. - * we don't actually need a reference. - */ - RT_REMREF(rt); - } - RT_UNLOCK(rt); - } didwork = 1; } + NET_EPOCH_EXIT(et); if (error) a_failure = error; } diff --git a/sys/net/route.h b/sys/net/route.h index 4ce3310139ab..355a84407f9b 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -332,8 +332,6 @@ struct rt_addrinfo { #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) -#define RTFREE_FUNC(_rt) rtfree_func(_rt) - #define RO_NHFREE(_ro) do { \ if ((_ro)->ro_nh) { \ NH_FREE((_ro)->ro_nh); \ diff --git a/sys/net/route/nhop.h b/sys/net/route/nhop.h index d0304f37f6dc..a3ee003cdf7d 100644 --- a/sys/net/route/nhop.h +++ b/sys/net/route/nhop.h @@ -176,6 +176,7 @@ struct rib_head; uint32_t nhop_get_idx(const struct nhop_object *nh); enum nhop_type nhop_get_type(const struct nhop_object *nh); int nhop_get_rtflags(const struct nhop_object *nh); +struct vnet *nhop_get_vnet(const struct nhop_object *nh); #endif /* _KERNEL */ diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c index 45e7b4cbb1a8..464d4cb05eab 100644 --- a/sys/net/route/nhop_ctl.c +++ b/sys/net/route/nhop_ctl.c @@ -500,6 +500,9 @@ finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info, return (ENOMEM); } + /* Save vnet to ease destruction */ + nh_priv->nh_vnet = curvnet; + /* Reference external objects and calculate (referenced) ifa */ if_ref(nh->nh_ifp); ifa_ref(nh->nh_ifa); @@ -698,6 +701,13 @@ nhop_set_rtflags(struct nhop_object *nh, int rt_flags) nh->nh_priv->rt_flags = rt_flags; } +struct vnet * +nhop_get_vnet(const struct nhop_object *nh) +{ + + return (nh->nh_priv->nh_vnet); +} + void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu) { diff --git a/sys/net/route/nhop_var.h b/sys/net/route/nhop_var.h index 4bf26ff54269..aac2829e186f 100644 --- a/sys/net/route/nhop_var.h +++ b/sys/net/route/nhop_var.h @@ -78,6 +78,7 @@ struct nhop_priv { struct nhop_object *nh; /* backreference to the dataplane nhop */ struct nh_control *nh_control; /* backreference to the rnh */ struct nhop_priv *nh_next; /* hash table membership */ + struct vnet *nh_vnet; /* vnet nhop belongs to */ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ }; diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h index 3dfe7f152954..288713e2b564 100644 --- a/sys/net/route/route_var.h +++ b/sys/net/route/route_var.h @@ -35,6 +35,7 @@ #ifndef RNF_NORMAL #include #endif +#include #include /* struct sockaddr_in */ #include @@ -148,6 +149,7 @@ struct rtentry { #define rt_endzero rt_mtx struct mtx rt_mtx; /* mutex for routing entry */ struct rtentry *rt_chain; /* pointer to next rtentry to delete */ + struct epoch_context rt_epoch_ctx; /* net epoch tracker */ }; #define RT_LOCK_INIT(_rt) \ diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index e474da739cbb..a1806df834c5 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -742,7 +742,6 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, } } RT_LOCK(rt); - RT_ADDREF(rt); RIB_RUNLOCK(rnh); *ret_nrt = rt; @@ -930,7 +929,6 @@ route_output(struct mbuf *m, struct socket *so, ...) #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_nhop->nh_ifp->if_index; - RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; @@ -987,8 +985,7 @@ route_output(struct mbuf *m, struct socket *so, ...) flush: NET_EPOCH_EXIT(et); - if (rt != NULL) - RTFREE(rt); + rt = NULL; #ifdef INET6 if (rtm != NULL) { diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index 57009fba5886..1438b1ccd93c 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -1566,14 +1566,17 @@ nd6_free_redirect(const struct llentry *ln) int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; + struct epoch_tracker et; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; + NET_EPOCH_ENTER(et); for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); + NET_EPOCH_EXIT(et); } /* diff --git a/sys/netinet6/nd6_rtr.c b/sys/netinet6/nd6_rtr.c index 123879d29eca..98cc1b220248 100644 --- a/sys/netinet6/nd6_rtr.c +++ b/sys/netinet6/nd6_rtr.c @@ -690,10 +690,8 @@ defrouter_addreq(struct nd_defrouter *new) error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt, fibnum); - if (newrt) { + if (newrt != NULL) rt_routemsg(RTM_ADD, newrt, new->ifp, 0, fibnum); - RTFREE_FUNC(newrt); - } if (error == 0) new->installed = 1; } @@ -708,6 +706,7 @@ defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; + struct epoch_tracker et; unsigned int fibnum; bzero(&def, sizeof(def)); @@ -720,13 +719,13 @@ defrouter_delreq(struct nd_defrouter *dr) gate.sin6_addr = dr->rtaddr; fibnum = dr->ifp->if_fib; + NET_EPOCH_ENTER(et); in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, fibnum); - if (oldrt) { + if (oldrt != NULL) rt_routemsg(RTM_DELETE, oldrt, dr->ifp, 0, fibnum); - RTFREE_FUNC(oldrt); - } + NET_EPOCH_EXIT(et); dr->installed = 0; } @@ -1022,6 +1021,7 @@ defrouter_select_fib(int fibnum) } ND6_RUNLOCK(); + NET_EPOCH_ENTER(et); /* * If we selected a router for this FIB and it's different * than the installed one, remove the installed router and @@ -1037,6 +1037,7 @@ defrouter_select_fib(int fibnum) } if (selected_dr != NULL) defrouter_rele(selected_dr); + NET_EPOCH_EXIT(et); } static struct nd_defrouter * @@ -2064,7 +2065,6 @@ nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) pr->ndpr_stateflags |= NDPRF_ONLINK; rt_routemsg(RTM_ADD, rt, pr->ndpr_ifp, 0, fibnum); - RTFREE_FUNC(rt); } /* Return the last error we got. */ @@ -2132,7 +2132,6 @@ nd6_prefix_onlink(struct nd_prefix *pr) } /* should we care about ia6_flags? */ } - NET_EPOCH_EXIT(et); if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA @@ -2145,13 +2144,12 @@ nd6_prefix_onlink(struct nd_prefix *pr) "prefix(%s/%d) on %s\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); - return (0); - } - - error = nd6_prefix_onlink_rtrequest(pr, ifa); - - if (ifa != NULL) + error = 0; + } else { + error = nd6_prefix_onlink_rtrequest(pr, ifa); ifa_free(ifa); + } + NET_EPOCH_EXIT(et); return (error); } @@ -2167,6 +2165,7 @@ nd6_prefix_offlink(struct nd_prefix *pr) char ip6buf[INET6_ADDRSTRLEN]; uint64_t genid; int fibnum, maxfib, a_failure; + struct epoch_tracker et; ND6_ONLINK_LOCK_ASSERT(); ND6_UNLOCK_ASSERT(); @@ -2193,6 +2192,7 @@ nd6_prefix_offlink(struct nd_prefix *pr) } a_failure = 0; + NET_EPOCH_ENTER(et); for (; fibnum < maxfib; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, @@ -2205,8 +2205,8 @@ nd6_prefix_offlink(struct nd_prefix *pr) /* report route deletion to the routing socket. */ rt_routemsg(RTM_DELETE, rt, ifp, 0, fibnum); - RTFREE_FUNC(rt); } + NET_EPOCH_EXIT(et); error = a_failure; a_failure = 1; if (error == 0) { diff --git a/sys/nfs/bootp_subr.c b/sys/nfs/bootp_subr.c index 07b445418327..d64026d0cd90 100644 --- a/sys/nfs/bootp_subr.c +++ b/sys/nfs/bootp_subr.c @@ -1664,14 +1664,17 @@ bootpc_init(void) goto out; if (gctx->gotrootpath != 0) { + struct epoch_tracker et; kern_setenv("boot.netif.name", ifctx->ifp->if_xname); + NET_EPOCH_ENTER(et); bootpc_add_default_route(ifctx); error = md_mount(&nd->root_saddr, nd->root_hostnam, nd->root_fh, &nd->root_fhsize, &nd->root_args, td); bootpc_remove_default_route(ifctx); + NET_EPOCH_EXIT(et); if (error != 0) { if (gctx->any_root_overrides == 0) panic("nfs_boot: mount root, error=%d", error);