diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index cf279ca1a554..f2597ba65052 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -208,6 +208,8 @@ net altq .. + route + .. .. net80211 .. diff --git a/include/Makefile b/include/Makefile index 462bcb001566..cbfb75951c62 100644 --- a/include/Makefile +++ b/include/Makefile @@ -53,6 +53,7 @@ LSUBDIRS= cam/ata cam/mmc cam/nvme cam/scsi \ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ + net/route \ netgraph/atm netgraph/netflow \ netinet/cc \ netinet/netdump \ diff --git a/lib/libc/gen/sysctl.3 b/lib/libc/gen/sysctl.3 index e44455df5cec..f383d61c36ef 100644 --- a/lib/libc/gen/sysctl.3 +++ b/lib/libc/gen/sysctl.3 @@ -563,6 +563,7 @@ The fifth, sixth, and seventh level names are as follows: .It Dv NET_RT_IFLIST Ta 0 or if_index Ta None .It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None .It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None +.It Dv NET_RT_NHOPS Ta None Ta fib number .El .Pp The @@ -583,6 +584,9 @@ uses 'l' versions of the message header structures: .Va struct if_msghdrl and .Va struct ifa_msghdrl . +.Pp +.Dv NET_RT_NHOPS +returns all nexthops for specified address family in given fib. .It Li PF_INET Get or set various global information about the IPv4 (Internet Protocol version 4). 
diff --git a/sys/conf/files b/sys/conf/files index 6805a4ddeb0b..1f11498138ef 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4091,6 +4091,11 @@ net/raw_cb.c standard net/raw_usrreq.c standard net/route.c standard net/route_temporal.c standard +net/route/nhop.c standard +net/route/nhop_ctl.c standard +net/route/nhop_utils.c standard +net/route/route_ctl.c standard +net/route/route_helpers.c standard net/rss_config.c optional inet rss | inet6 rss net/rtsock.c standard net/slcompress.c optional netgraph_vjc | sppp | \ diff --git a/sys/net/radix_mpath.c b/sys/net/radix_mpath.c index 8a341458cea2..698ddbb516e8 100644 --- a/sys/net/radix_mpath.c +++ b/sys/net/radix_mpath.c @@ -211,7 +211,7 @@ rt_mpath_conflict(struct rib_head *rnh, struct rtentry *rt, return (0); } -static struct rtentry * +struct rtentry * rt_mpath_selectrte(struct rtentry *rte, uint32_t hash) { struct radix_node *rn0, *rn; diff --git a/sys/net/radix_mpath.h b/sys/net/radix_mpath.h index 8f73dd032c9c..e4f513847545 100644 --- a/sys/net/radix_mpath.h +++ b/sys/net/radix_mpath.h @@ -56,10 +56,27 @@ int rt_mpath_conflict(struct rib_head *, struct rtentry *, struct sockaddr *); void rtalloc_mpath_fib(struct route *, u_int32_t, u_int); struct rtentry *rt_mpath_select(struct rtentry *, uint32_t); +struct rtentry *rt_mpath_selectrte(struct rtentry *, uint32_t); int rt_mpath_deldup(struct rtentry *, struct rtentry *); int rn4_mpath_inithead(void **, int, u_int); int rn6_mpath_inithead(void **, int, u_int); +static inline struct rtentry * +rt_mpath_next(struct rtentry *rt) +{ + struct radix_node *next, *rn; + + rn = (struct radix_node *)rt; + + if (!rn->rn_dupedkey) + return (NULL); + next = rn->rn_dupedkey; + if (rn->rn_mask == next->rn_mask) + return (struct rtentry *)next; + else + return (NULL); +} + #endif #endif /* _NET_RADIX_MPATH_H_ */ diff --git a/sys/net/route.c b/sys/net/route.c index 402373277ef4..9ffaf9570a26 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -62,6 +62,8 @@ #include #include 
#include +#include +#include #include #ifdef RADIX_MPATH @@ -108,10 +110,7 @@ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); -VNET_PCPUSTAT_DEFINE_STATIC(struct rtstat, rtstat); -#define RTSTAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val)) -#define RTSTAT_INC(name) RTSTAT_ADD(name, 1) +VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); VNET_PCPUSTAT_SYSINIT(rtstat); #ifdef VIMAGE @@ -240,6 +239,7 @@ route_init(void) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; + nhops_init(); } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); @@ -377,6 +377,8 @@ rt_table_init(int offset, int family, u_int fibnum) /* Init locks */ RIB_LOCK_INIT(rh); + nhops_init_rib(rh); + /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; @@ -408,6 +410,8 @@ rt_table_destroy(struct rib_head *rh) rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); + nhops_destroy_rib(rh); + /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); @@ -586,6 +590,9 @@ rtfree(struct rtentry *rt) */ R_Free(rt_key(rt)); + /* Unreference nexthop */ + nhop_free(rt->rt_nhop); + /* * and the rtentry itself of course */ @@ -1400,6 +1407,7 @@ rt_updatemtu(struct ifnet *ifp) RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); + nhops_update_ifmtu(rnh, ifp, ifmtu.mtu); } } } @@ -1544,6 +1552,7 @@ int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { + struct epoch_tracker et; const struct sockaddr *dst; struct rib_head *rnh; int error; @@ -1592,9 +1601,11 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, error = add_route(rnh, info, ret_nrt); break; case RTM_CHANGE: + NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); error = change_route(rnh, info, ret_nrt); RIB_WUNLOCK(rnh); + 
NET_EPOCH_EXIT(et); break; default: error = EOPNOTSUPP; @@ -1609,9 +1620,11 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, { struct sockaddr *dst, *ndst, *gateway, *netmask; struct rtentry *rt, *rt_old; + struct nhop_object *nh; struct radix_node *rn; struct ifaddr *ifa; int error, flags; + struct epoch_tracker et; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; @@ -1631,18 +1644,30 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, } else { ifa_ref(info->rti_ifa); } + + NET_EPOCH_ENTER(et); + error = nhop_create_from_info(rnh, info, &nh); + NET_EPOCH_EXIT(et); + if (error != 0) { + ifa_free(info->rti_ifa); + return (error); + } + rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(info->rti_ifa); + nhop_free(nh); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = rnh->rib_fibnum; + rt->rt_nhop = nh; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(info->rti_ifa); + nhop_free(nh); uma_zfree(V_rtzone, rt); return (error); } @@ -1682,6 +1707,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); + nhop_free(nh); uma_zfree(V_rtzone, rt); return (EEXIST); } @@ -1723,6 +1749,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); + nhop_free(nh); uma_zfree(V_rtzone, rt); return (EEXIST); } @@ -1802,6 +1829,7 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, int error = 0; int free_ifa = 0; int family, mtu; + struct nhop_object *nh; struct if_mtuinfo ifmtu; RIB_WLOCK_ASSERT(rnh); @@ -1824,6 +1852,7 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, } #endif + nh = NULL; RT_LOCK(rt); rt_setmetrics(info, rt); @@ -1852,6 +1881,10 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, goto bad; } + error = nhop_create_from_nhop(rnh, rt->rt_nhop, info, &nh); + if (error != 
0) + goto bad; + /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL) { @@ -1897,6 +1930,11 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, } } + /* Update nexthop */ + nhop_free(rt->rt_nhop); + rt->rt_nhop = nh; + nh = NULL; + /* * This route change may have modified the route's gateway. In that * case, any inpcbs that have cached this route need to invalidate their @@ -1910,6 +1948,8 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, } bad: RT_UNLOCK(rt); + if (nh != NULL) + nhop_free(nh); if (free_ifa != 0) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; diff --git a/sys/net/route.h b/sys/net/route.h index b5646246320c..4bdb6e84389e 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -90,7 +90,8 @@ struct rt_metrics { u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ - u_long rmx_filler[3]; /* will be used for T/TCP later */ + u_long rmx_nhidx; /* route nexthop index */ + u_long rmx_filler[2]; /* will be used for T/TCP later */ }; /* @@ -150,6 +151,7 @@ struct rtentry { struct sockaddr *rt_gateway; /* value */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ + struct nhop_object *rt_nhop; /* nexthop data */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ u_int rt_fibnum; /* which FIB */ @@ -215,9 +217,13 @@ struct rtentry { #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ +#define NHR_NONE 0x00 /* empty flags field */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ #define NHR_REF 0x02 /* For future use */ +/* uRPF */ +#define NHR_NODEFAULT 0x04 /* do not consider default route */ + /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ @@ -245,6 +251,8 @@ struct rtstat { uint64_t rts_newgateway; /* routes modified by 
redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ + uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ + uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ }; /* @@ -507,6 +515,8 @@ int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); +/* New API */ +void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg); #endif #endif diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c new file mode 100644 index 000000000000..d71ba79c1295 --- /dev/null +++ b/sys/net/route/nhop.c @@ -0,0 +1,388 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains data structures management logic for the nexthop ("nhop") + * route subsystem. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * All nexthops are stored in the resizable hash table. + * Additionally, each nexthop gets assigned its unique index (nexthop index) + * so userland programs can interact with the nexthops easier. Index allocation + * is backed by the bitmask array. + */ + +static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); + + +/* Hash management functions */ + +int +nhops_init_rib(struct rib_head *rh) +{ + struct nh_control *ctl; + size_t alloc_size; + uint32_t num_buckets, num_items; + void *ptr; + + ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO); + + /* + * Allocate nexthop hash. Start with 16 items by default (128 bytes). + * This will be enough for most of the cases. 
+ */ + num_buckets = 16; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + ptr = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO); + CHT_SLIST_INIT(&ctl->nh_head, ptr, num_buckets); + + /* + * Allocate nexthop index bitmask. + */ + num_items = 128 * 8; /* 128 bytes */ + ptr = malloc(bitmask_get_size(num_items), M_NHOP, M_WAITOK | M_ZERO); + bitmask_init(&ctl->nh_idx_head, ptr, num_items); + + NHOPS_LOCK_INIT(ctl); + + rh->nh_control = ctl; + ctl->ctl_rh = rh; + + DPRINTF("NHOPS init for fib %u af %u: ctl %p rh %p", rh->rib_fibnum, + rh->rib_family, ctl, rh); + + return (0); +} + +static void +destroy_ctl(struct nh_control *ctl) +{ + + NHOPS_LOCK_DESTROY(ctl); + free(ctl->nh_head.ptr, M_NHOP); + free(ctl->nh_idx_head.idx, M_NHOP); + free(ctl, M_NHOP); +} + +/* + * Epoch callback indicating ctl is safe to destroy + */ +static void +destroy_ctl_epoch(epoch_context_t ctx) +{ + struct nh_control *ctl; + + ctl = __containerof(ctx, struct nh_control, ctl_epoch_ctx); + + destroy_ctl(ctl); +} + +void +nhops_destroy_rib(struct rib_head *rh) +{ + struct nh_control *ctl; + struct nhop_priv *nh_priv; + + ctl = rh->nh_control; + + /* + * All routes should have been deleted in rt_table_destroy(). + * However, TCP stack or other consumers may store referenced + * nexthop pointers. When these references go to zero, + * nhop_free() will try to unlink these records from the + * datastructures, most likely leading to panic. + * + * Avoid that by explicitly marking all of the remaining + * nexthops as unlinked by removing a reference from a special + * counter. Please see nhop_free() comments for more + * details. + */ + + NHOPS_WLOCK(ctl); + CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) { + DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx); + refcount_release(&nh_priv->nh_linked); + } CHT_SLIST_FOREACH_END; + NHOPS_WUNLOCK(ctl); + + /* + * Postpone destruction till the end of current epoch + * so nhop_free() can safely use nh_control pointer. 
+ */ + epoch_call(net_epoch_preempt, destroy_ctl_epoch, + &ctl->ctl_epoch_ctx); +} + +/* + * Nexthop hash calculation: + * + * Nexthops distribution: + * 2 "mandatory" nexthops per interface ("interface route", "loopback"). + * For direct peering: 1 nexthop for the peering router per ifp/af. + * For Ix-like peering: tens to hundreds nexthops of neighbors per ifp/af. + * IGP control plane & broadcast segment: tens of nexthops per ifp/af. + * + * Each fib/af combination has its own hash table. + * With that in mind, hash nexthops by the combination of the interface + * and GW IP address. + * + * To optimize hash calculation, ignore higher bytes of ifindex, as they + * give very little entropy. + * Similarly, use lower 4 bytes of IPv6 address to distinguish between the + * neighbors. + */ +struct _hash_data { + uint16_t ifindex; + uint8_t family; + uint8_t nh_type; + uint32_t gw_addr; +}; + +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} + +static uint32_t +hash_priv(const struct nhop_priv *priv) +{ + struct nhop_object *nh; + uint16_t ifindex; + struct _hash_data key; + + nh = priv->nh; + ifindex = nh->nh_ifp->if_index & 0xFFFF; + memset(&key, 0, sizeof(key)); + + key.ifindex = ifindex; + key.family = nh->gw_sa.sa_family; + key.nh_type = priv->nh_type & 0xFF; + if (nh->gw_sa.sa_family == AF_INET6) + memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4); + else if (nh->gw_sa.sa_family == AF_INET) + memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4); + + return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key))); +} + +/* + * Resizes the nexthop hash and/or index bitmask; a zero @new_nh_buckets or + * @new_idx_items skips that resize. M_NOWAIT allocation failures are ignored. + */ +static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL; + if (new_nh_buckets != 0) 
{ + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) { + /* Either resize is not required or allocations have failed. */ + return; + } + + DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr, + new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets); + } + if (nh_idx_ptr != NULL) { + if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items) == 0) + bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + else + old_idx_ptr = nh_idx_ptr; /* copy failed: new bitmask was not installed, free it below */ + } + NHOPS_WUNLOCK(ctl); + + free(nh_ptr, M_NHOP); /* free(9) does nothing for NULL */ + free(old_idx_ptr, M_NHOP); /* old bitmask, or the unused new one */ +} + +/* + * Links nexthop @nh_priv to the nexthop hash table and allocates + * nexthop index. + * Returns allocated index or 0 on failure. + */ +int +link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv) +{ + uint16_t idx; + uint32_t num_buckets_new, num_items_new; + + KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated")); + NHOPS_WLOCK(ctl); + + /* + * Check if we need to resize hash and index. + * The following 2 functions return either new size or 0 + * if resize is not required. 
+ */ + num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); + num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head); + + if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate nhop index"); + RTSTAT_INC(rts_nh_idx_alloc_failure); + consider_resize(ctl, num_buckets_new, num_items_new); + return (0); + } + + nh_priv->nh_idx = idx; + nh_priv->nh_control = ctl; + + CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv); + + NHOPS_WUNLOCK(ctl); + + DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx, + hash_priv(nh_priv), ctl); + consider_resize(ctl, num_buckets_new, num_items_new); + + return (idx); +} + +/* + * Unlinks nexthop specified by @nh_priv_del data from the hash. + * + * Returns found nexthop or NULL. + */ +struct nhop_priv * +unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del) +{ + struct nhop_priv *priv_ret; + int idx; + uint32_t num_buckets_new, num_items_new; + + idx = 0; + + NHOPS_WLOCK(ctl); + CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv_del, priv_ret); + + if (priv_ret != NULL) { + idx = priv_ret->nh_idx; + priv_ret->nh_idx = 0; + + KASSERT((idx != 0), ("bogus nhop index 0")); + if ((bitmask_free_idx(&ctl->nh_idx_head, idx)) != 0) { + DPRINTF("Unable to remove index %d from fib %u af %d", + idx, ctl->ctl_rh->rib_fibnum, + ctl->ctl_rh->rib_family); + } + } + + /* Check if hash or index needs to be resized */ + num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head); + num_items_new = bitmask_get_resize_items(&ctl->nh_idx_head); + + NHOPS_WUNLOCK(ctl); + + if (priv_ret == NULL) + DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p", + nh_priv_del, hash_priv(nh_priv_del), ctl); + else + DPRINTF("Unlinked nhop %p priv idx %d", priv_ret, idx); + + consider_resize(ctl, num_buckets_new, num_items_new); + + return (priv_ret); +} + +/* + * Searches for the nexthop by data specified in @nh_priv. + * Returns referenced nexthop or NULL. 
+ */ +struct nhop_priv * +find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv) +{ + struct nhop_priv *nh_priv_ret; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, nh_priv_ret); + if (nh_priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&nh_priv_ret->nh_refcnt) == 0){ + /* refcount was 0 -> nhop is being deleted */ + nh_priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + return (nh_priv_ret); +} + diff --git a/sys/net/route/nhop.h b/sys/net/route/nhop.h new file mode 100644 index 000000000000..c747a6399c2c --- /dev/null +++ b/sys/net/route/nhop.h @@ -0,0 +1,229 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains public definitions for the nexthop routing subsystem. + */ + +#ifndef _NET_ROUTE_NHOP_H_ +#define _NET_ROUTE_NHOP_H_ + +#include /* sockaddr_in && sockaddr_in6 */ + +#include + +enum nhop_type { + NH_TYPE_IPV4_ETHER_RSLV = 1, /* IPv4 ethernet without GW */ + NH_TYPE_IPV4_ETHER_NHOP = 2, /* IPv4 with pre-calculated ethernet encap */ + NH_TYPE_IPV6_ETHER_RSLV = 3, /* IPv6 ethernet, without GW */ + NH_TYPE_IPV6_ETHER_NHOP = 4 /* IPv6 with pre-calculated ethernet encap*/ +}; + +#ifdef _KERNEL + +/* + * Define shorter version of AF_LINK sockaddr. + * + * Currently the only use case of AF_LINK gateway is storing + * interface index of the interface of the source IPv6 address. + * This is used by the IPv6 code for the connections over loopback + * interface. + * + * The structure below copies 'struct sockaddr_dl', reducing the + * size of sdl_data buffer, as it is not used. This change + * allows to store the AF_LINK gateways in the nhop gateway itself, + * simplifying control plane handling. + */ +struct sockaddr_dl_short { + u_char sdl_len; /* Total length of sockaddr */ + u_char sdl_family; /* AF_LINK */ + u_short sdl_index; /* if != 0, system given index for interface */ + u_char sdl_type; /* interface type */ + u_char sdl_nlen; /* interface name length, no trailing 0 reqd. 
*/ + u_char sdl_alen; /* link level address length */ + u_char sdl_slen; /* link layer selector length */ + char sdl_data[8]; /* unused */ +}; + +#define NHOP_RELATED_FLAGS \ + (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_BLACKHOLE | \ + RTF_FIXEDMTU | RTF_LOCAL | RTF_BROADCAST | RTF_MULTICAST) + +struct nh_control; +struct nhop_priv; + +/* + * Struct 'nhop_object' field description: + * + * nh_flags: NHF_ flags used in the dataplane code. NHF_GATEWAY or NHF_BLACKHOLE + * can be examples of such flags. + * nh_mtu: ready-to-use nexthop mtu. Already accounts for the link-level header, + * interface MTU and protocol-specific limitations. + * nh_prepend_len: link-level prepend length. Currently unused. + * nh_ifp: logical transmit interface. The one from which if_transmit() will be + * called. Guaranteed to be non-NULL. + * nh_aifp: ifnet of the source address. Same as nh_ifp except IPv6 loopback + * routes. See the example below. + * nh_ifa: interface address to use. Guaranteed to be non-NULL. + * nh_pksent: counter(9) reflecting the number of packets transmitted. + * + * gw_: storage suitable to hold AF_INET, AF_INET6 or AF_LINK gateway. More + * details are available in the examples below. + * + * Examples: + * + * Direct routes (routes w/o gateway): + * NHF_GATEWAY is NOT set. + * nh_ifp denotes the logical transmit interface (). + * nh_aifp is the same as nh_ifp + * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat) + * Loopback routes: + * NHF_GATEWAY is NOT set. + * nh_ifp points to the loopback interface (lo0). + * nh_aifp points to the interface where the destination address belongs to. + * This is useful in IPv6 link-local-over-loopback communications. + * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat) + * GW routes: + * NHF_GATEWAY is set. + * nh_ifp denotes the logical transmit interface. + * nh_aifp is the same as nh_ifp + * gw_sa contains L3 address (either AF_INET or AF_INET6). 
+ * + * + * Note: struct nhop_object fields are ordered in a way that + * supports memcmp-based comparisons. + * + */ +#define NHOP_END_CMP (__offsetof(struct nhop_object, nh_pksent)) + +struct nhop_object { + uint16_t nh_flags; /* nhop flags */ + uint16_t nh_mtu; /* nexthop mtu */ + union { + struct sockaddr_in gw4_sa; /* GW accessor as IPv4 */ + struct sockaddr_in6 gw6_sa; /* GW accessor as IPv6 */ + struct sockaddr gw_sa; + struct sockaddr_dl_short gwl_sa; /* AF_LINK gw (compat) */ + char gw_buf[28]; + }; + struct ifnet *nh_ifp; /* Logical egress interface. Always != NULL */ + struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */ + struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */ + counter_u64_t nh_pksent; /* packets sent using this nhop */ + /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */ + uint8_t nh_prepend_len; /* length of prepend data */ + uint8_t spare[3]; + uint32_t spare1; /* alignment */ + char nh_prepend[48]; /* L2 prepend */ + struct nhop_priv *nh_priv; /* control plane data */ + /* -- 128 bytes -- */ +}; + +/* + * Nhop validness. + * + * Currently we verify whether link is up or not on every packet, which can be + * quite costly. + * TODO: subscribe for the interface notifications and update the nexthops + * with NHF_INVALID flag. 
+ */ + +#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp) +#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) + +#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) +#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) + +#define NH_FREE(_nh) do { \ + nhop_free(_nh); \ + /* guard against invalid refs */ \ + _nh = NULL; \ +} while (0) + + +void nhop_free(struct nhop_object *nh); + +struct sysctl_req; +struct sockaddr_dl; +struct rib_head; + +uint32_t nhop_get_idx(const struct nhop_object *nh); +enum nhop_type nhop_get_type(const struct nhop_object *nh); +int nhop_get_rtflags(const struct nhop_object *nh); + +int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +#endif /* _KERNEL */ + +/* Kernel <> userland structures */ + +/* Structure usage and layout are described in dump_nhop_entry() */ +struct nhop_external { + uint32_t nh_len; /* length of the datastructure */ + uint32_t nh_idx; /* Nexthop index */ + uint32_t nh_fib; /* Fib nexthop is attached to */ + uint32_t ifindex; /* transmit interface ifindex */ + uint32_t aifindex; /* address ifindex */ + uint8_t prepend_len; /* length of the prepend */ + uint8_t nh_family; /* address family */ + uint16_t nh_type; /* nexthop type */ + uint16_t nh_mtu; /* nexthop mtu */ + + uint16_t nh_flags; /* nhop flags */ + struct in_addr nh_addr; /* GW/DST IPv4 address */ + struct in_addr nh_src; /* default source IPv4 address */ + uint64_t nh_pksent; + /* control plane */ + /* lookup key: address, family, type */ + char nh_prepend[64]; /* L2 prepend */ + uint64_t nh_refcount; /* number of references */ +}; + +struct nhop_addrs { + uint32_t na_len; /* length of the datastructure */ + uint16_t gw_sa_off; /* offset of gateway SA */ + uint16_t src_sa_off; /* offset of src address SA */ +}; + +struct mpath_nhop_external { + uint32_t nh_idx; + uint32_t nh_weight; +}; + +struct mpath_external { + uint32_t mp_idx; + uint32_t mp_refcount; + uint32_t mp_nh_count; + 
uint32_t mp_group_size; +}; + + +#endif + + diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c new file mode 100644 index 000000000000..cb1617e1cc3a --- /dev/null +++ b/sys/net/route/nhop_ctl.c @@ -0,0 +1,827 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains core functionality for the nexthop ("nhop") route subsystem. + * The business logic needed to create nexthop objects is implemented here. + * + * Nexthops in the original sense are the objects containing all the necessary + * information to forward the packet to the selected destination. + * In particular, nexthop is defined by a combination of + * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and + * NHF_DEFAULT + * + * Additionally, each nexthop gets assigned its unique index (nexthop index). + * It serves two purposes: first one is to ease the ability of userland programs to + * reference nexthops by their index. The second one allows lookup algorithms to + * store index instead of pointer (2 bytes vs 8) as a lookup result. + * All nexthops are stored in the resizable hash table. + * + * Basically, this file revolves around supporting the following functions: + * 1) nhop_create_from_info / nhop_create_from_nhop, which contains all + * business logic on filling the nexthop fields based on the provided request. + * 2) nhop_get(), which gets a usable referenced nexthop. 
+ * + * Conventions: + * 1) non-exported functions start with verb + * 2) exported function starts with the subsystem prefix: "nhop" + */ + +static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w); + +static struct nhop_priv *alloc_nhop_structure(void); +static int get_nhop(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_priv **pnh_priv); +static int finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info, + struct nhop_priv *nh_priv); +static struct ifnet *get_aifp(const struct nhop_object *nh, int reference); +static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp); + +static void destroy_nhop_epoch(epoch_context_t ctx); +static void destroy_nhop(struct nhop_priv *nh_priv); + +static void print_nhop(const char *prefix, const struct nhop_object *nh); + +_Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32, + "nhop_object: wrong nh_ifp offset"); +_Static_assert(sizeof(struct nhop_object) <= 128, + "nhop_object: size exceeds 128 bytes"); + +static uma_zone_t nhops_zone; /* Global zone for each and every nexthop */ + + +#define NHOP_OBJECT_ALIGNED_SIZE roundup2(sizeof(struct nhop_object), \ + 2 * CACHE_LINE_SIZE) +#define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \ + 2 * CACHE_LINE_SIZE) +void +nhops_init(void) +{ + + nhops_zone = uma_zcreate("routing nhops", + NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE, + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} + +/* + * Fetches the interface of source address used by the route. + * In all cases except interface-address-route it would be the + * same as the transmit interfaces. + * However, for the interface address this function will return + * this interface ifp instead of loopback. This is needed to support + * link-local IPv6 loopback communications. + * + * If @reference is non-zero, found ifp is referenced. + * + * Returns found ifp. 
+ */
+static struct ifnet *
+get_aifp(const struct nhop_object *nh, int reference)
+{
+	struct ifnet *ifp;
+
+	/*
+	 * A packet looped back to ourselves is transmitted via the
+	 * loopback interface.  When the gateway is a link-level address,
+	 * prefer the interface it names (which should probably own the
+	 * destination address) over the loopback one.
+	 */
+	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0 &&
+	    nh->gw_sa.sa_family == AF_LINK) {
+		ifp = reference ?
+		    ifnet_byindex_ref(nh->gwl_sa.sdl_index) :
+		    ifnet_byindex(nh->gwl_sa.sdl_index);
+		if (ifp != NULL)
+			return (ifp);
+		DPRINTF("unable to get aifp for %s index %d",
+		    if_name(nh->nh_ifp), nh->gwl_sa.sdl_index);
+	}
+
+	/* Otherwise (or if the lookup failed) use the transmit interface */
+	ifp = nh->nh_ifp;
+	if (reference)
+		if_ref(ifp);
+
+	return (ifp);
+}
+
+/*
+ * Compares two nexthops.
+ *
+ * Returns 1 if the first NHOP_END_CMP bytes of the dataplane objects,
+ * the nexthop types and the nexthop families all match, 0 otherwise.
+ */
+int
+cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two)
+{
+
+	return (memcmp(_one->nh, _two->nh, NHOP_END_CMP) == 0 &&
+	    _one->nh_type == _two->nh_type &&
+	    _one->nh_family == _two->nh_family);
+}
+
+/*
+ * Applies the MTU from @info to @nh if the request carries one (RTV_MTU).
+ */
+static void
+set_nhop_mtu_from_info(struct nhop_object *nh, const struct rt_addrinfo *info)
+{
+
+	/* Nothing to do unless the request specifies the MTU */
+	if ((info->rti_mflags & RTV_MTU) == 0)
+		return;
+
+	if (info->rti_rmx->rmx_mtu == 0) {
+		/* MTU explicitly set to 0: assume rollback to default */
+		nh->nh_priv->rt_flags &= ~RTF_FIXEDMTU;
+	} else {
+		/* MTU explicitly provided by the user: keep it */
+		nh->nh_priv->rt_flags |= RTF_FIXEDMTU;
+	}
+	nh->nh_mtu = info->rti_rmx->rmx_mtu;
+}
+
+/*
+ * Fills in shortened link-level sockaddr version suitable to be stored inside the
+ * nexthop gateway buffer.
+ */
+static void
+fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp)
+{
+
+	sdl->sdl_family = AF_LINK;
+	sdl->sdl_len = sizeof(struct sockaddr_dl_short);
+	sdl->sdl_index = ifp->if_index;
+	sdl->sdl_type = ifp->if_type;
+}
+
+/*
+ * Sets the gateway part of @nh based on the request in @info.
+ *
+ * Returns:
+ * 0 on success
+ * EINVAL if RTF_GATEWAY is set but no gateway address is supplied
+ * ENOMEM if the gateway sockaddr does not fit into the nexthop buffer
+ */
+static int
+set_nhop_gw_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
+{
+	struct sockaddr *gw;
+
+	gw = info->rti_info[RTAX_GATEWAY];
+	if (info->rti_flags & RTF_GATEWAY) {
+		/* Do not dereference a missing gateway address */
+		if (gw == NULL) {
+			DPRINTF("RTF_GATEWAY set without gateway address");
+			return (EINVAL);
+		}
+		if (gw->sa_len > sizeof(struct sockaddr_in6)) {
+			DPRINTF("nhop SA size too big: AF %d len %u",
+			    gw->sa_family, gw->sa_len);
+			return (ENOMEM);
+		}
+		memcpy(&nh->gw_sa, gw, gw->sa_len);
+	} else {
+		/*
+		 * Interface route. Currently the route.c code adds
+		 * sa of type AF_LINK, which is 56 bytes long. The only
+		 * meaningful data there is the interface index. It is
+		 * used in the IPv6 loopback output, where we need to preserve
+		 * the original interface to maintain proper scoping.
+		 * Despite the fact that nexthop code stores original interface
+		 * in the separate field (nh_aifp, see below), write AF_LINK
+		 * compatible sa with shorter total length.
+		 */
+		fill_sdl_from_ifp(&nh->gwl_sa, nh->nh_ifp);
+	}
+
+	return (0);
+}
+
+/*
+ * Fills the freshly allocated nexthop behind @nh_priv with the data
+ * from @info: flags, family, MTU, interfaces and gateway.
+ *
+ * Returns 0 on success or errno from set_nhop_gw_from_info().
+ */
+static int
+fill_nhop_from_info(struct nhop_priv *nh_priv, struct rt_addrinfo *info)
+{
+	int error, rt_flags;
+	struct nhop_object *nh;
+
+	nh = nh_priv->nh;
+
+	rt_flags = info->rti_flags & NHOP_RT_FLAG_MASK;
+
+	nh->nh_priv->rt_flags = rt_flags;
+	nh_priv->nh_family = info->rti_info[RTAX_DST]->sa_family;
+	nh_priv->nh_type = 0; /* hook responsibility to set nhop type */
+
+	nh->nh_flags = fib_rte_to_nh_flags(rt_flags);
+	set_nhop_mtu_from_info(nh, info);
+	nh->nh_ifp = info->rti_ifa->ifa_ifp;
+	nh->nh_ifa = info->rti_ifa;
+	nh->nh_aifp = get_aifp(nh, 0);
+
+	if ((error = set_nhop_gw_from_info(nh, info)) != 0)
+		return (error);
+
+	/*
+	 * Note some of the remaining data is set by the
+	 * per-address-family pre-add hook.
+	 */
+
+	return (0);
+}
+
+/*
+ * Creates a new nexthop based on the information in @info.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object ptr
+ * ENOMEM if the nexthop structure cannot be allocated
+ * errno otherwise
+ */
+int
+nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_object **nh_ret)
+{
+	struct nhop_priv *nh_priv;
+	int error;
+
+	NET_EPOCH_ASSERT();
+
+	/* The allocation is M_NOWAIT and may fail */
+	nh_priv = alloc_nhop_structure();
+	if (nh_priv == NULL)
+		return (ENOMEM);
+
+	error = fill_nhop_from_info(nh_priv, info);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* get_nhop() frees the nexthop itself on failure */
+	error = get_nhop(rnh, info, &nh_priv);
+	if (error == 0)
+		*nh_ret = nh_priv->nh;
+
+	return (error);
+}
+
+/*
+ * Gets linked nhop using the provided @pnh_priv nexthop data.
+ * If linked nhop is found, returns it, freeing the provided one.
+ * If there is no such nexthop, attaches the remaining data to the
+ * provided nexthop and links it.
+ *
+ * Returns 0 on success, storing referenced nexthop in @pnh_priv.
+ * Otherwise, errno is returned and the provided nexthop is freed.
+ */
+static int
+get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_priv **pnh_priv)
+{
+	const struct sockaddr *dst, *netmask;
+	struct nhop_priv *nh_priv, *tmp_priv;
+	int error;
+
+	nh_priv = *pnh_priv;
+
+	/* Give the protocols chance to augment the request data */
+	dst = info->rti_info[RTAX_DST];
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	error = rnh->rnh_preadd(rnh->rib_fibnum, dst, netmask, nh_priv->nh);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	tmp_priv = find_nhop(rnh->nh_control, nh_priv);
+	if (tmp_priv != NULL) {
+		/* Equal nexthop is already linked, use it instead */
+		uma_zfree(nhops_zone, nh_priv->nh);
+		*pnh_priv = tmp_priv;
+		return (0);
+	}
+
+	/*
+	 * Existing nexthop not found, need to create new one.
+	 * Note: multiple simultaneous get_nhop() requests
+	 * can result in multiple equal nexthops existing in the
+	 * nexthop table. This is not a problem until the
+	 * relative number of such nexthops is significant, which
+	 * is extremely unlikely.
+	 */
+
+	/* finalize_nhop() frees @nh_priv on failure */
+	return (finalize_nhop(rnh->nh_control, info, nh_priv));
+}
+
+/*
+ * Update @nh with data supplied in @info.
+ * This is a helper function to support route changes.
+ *
+ * It limits the changes that can be done to the route to the following:
+ * 1) all combination of gateway changes (gw, interface, blackhole/reject)
+ * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT)
+ * 3) route MTU
+ *
+ * Returns:
+ * 0 on success
+ * errno from set_nhop_gw_from_info() otherwise
+ */
+static int
+alter_nhop_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
+{
+	struct sockaddr *info_gw;
+	int error;
+
+	/* Update MTU if set in the request */
+	set_nhop_mtu_from_info(nh, info);
+
+	/* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */
+
+	/* Allow some flags (FLAG1,STATIC,BLACKHOLE,REJECT) to be toggled on change. */
+	nh->nh_priv->rt_flags &= ~RTF_FMASK;
+	nh->nh_priv->rt_flags |= info->rti_flags & RTF_FMASK;
+
+	/* Consider gateway change */
+	info_gw = info->rti_info[RTAX_GATEWAY];
+	if (info_gw != NULL) {
+		error = set_nhop_gw_from_info(nh, info);
+		if (error != 0)
+			return (error);
+		/* Update RTF_GATEWAY flag status */
+		nh->nh_priv->rt_flags &= ~RTF_GATEWAY;
+		nh->nh_priv->rt_flags |= (RTF_GATEWAY & info->rti_flags);
+	}
+	/* Update datapath flags */
+	nh->nh_flags = fib_rte_to_nh_flags(nh->nh_priv->rt_flags);
+
+	if (info->rti_ifa != NULL)
+		nh->nh_ifa = info->rti_ifa;
+	if (info->rti_ifp != NULL)
+		nh->nh_ifp = info->rti_ifp;
+	nh->nh_aifp = get_aifp(nh, 0);
+
+	return (0);
+}
+
+/*
+ * Creates new nexthop based on @nh_orig and augmentation data from @info.
+ * Helper function used in the route changes, please see
+ * alter_nhop_from_info() comments for more details.
+ *
+ * Returns:
+ * 0 on success, filling @pnh with the desired nexthop object
+ * ENOMEM if the nexthop structure cannot be allocated
+ * errno otherwise
+ */
+int
+nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
+    struct rt_addrinfo *info, struct nhop_object **pnh)
+{
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+	int error;
+
+	NET_EPOCH_ASSERT();
+
+	/* The allocation is M_NOWAIT and may fail */
+	nh_priv = alloc_nhop_structure();
+	if (nh_priv == NULL)
+		return (ENOMEM);
+	nh = nh_priv->nh;
+
+	/* Start with copying data from original nexthop */
+	nh_priv->nh_family = nh_orig->nh_priv->nh_family;
+	nh_priv->rt_flags = nh_orig->nh_priv->rt_flags;
+	nh_priv->nh_type = nh_orig->nh_priv->nh_type;
+
+	nh->nh_ifp = nh_orig->nh_ifp;
+	nh->nh_ifa = nh_orig->nh_ifa;
+	nh->nh_aifp = nh_orig->nh_aifp;
+	nh->nh_mtu = nh_orig->nh_mtu;
+	nh->nh_flags = nh_orig->nh_flags;
+	memcpy(&nh->gw_sa, &nh_orig->gw_sa, nh_orig->gw_sa.sa_len);
+
+	/* Apply the requested changes on top of the copy */
+	error = alter_nhop_from_info(nh, info);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* get_nhop() frees the nexthop itself on failure */
+	error = get_nhop(rnh, info, &nh_priv);
+	if (error == 0)
+		*pnh = nh_priv->nh;
+
+	return (error);
+}
+
+/*
+ * Allocates zeroed memory for public/private nexthop structures.
+ * Both live in a single zone item: the private part is placed
+ * NHOP_OBJECT_ALIGNED_SIZE bytes after the start of the public one,
+ * and the two are cross-linked via nh_priv / nh.
+ *
+ * Returns pointer to nhop_priv or NULL if the allocation fails.
+ */
+static struct nhop_priv *
+alloc_nhop_structure(void)
+{
+	struct nhop_object *nh;
+	struct nhop_priv *nh_priv;
+
+	nh = (struct nhop_object *)uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO);
+	if (nh == NULL)
+		return (NULL);
+	nh_priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE);
+
+	nh->nh_priv = nh_priv;
+	nh_priv->nh = nh;
+
+	return (nh_priv);
+}
+
+/*
+ * Allocates/references the remaining bits of nexthop data and links
+ * it to the hash table.
+ * Returns 0 if successful,
+ * errno otherwise. @nh_priv is freed in case of error.
+ */
+static int
+finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
+    struct nhop_priv *nh_priv)
+{
+	struct nhop_object *nhop = nh_priv->nh;
+
+	/* Per-cpu packet counter */
+	nhop->nh_pksent = counter_u64_alloc(M_NOWAIT);
+	if (nhop->nh_pksent == NULL) {
+		uma_zfree(nhops_zone, nhop);
+		RTSTAT_INC(rts_nh_alloc_failure);
+		DPRINTF("nh_alloc_finalize failed");
+		return (ENOMEM);
+	}
+
+	/* Take references on ifp/ifa and look up the referenced aifp */
+	if_ref(nhop->nh_ifp);
+	ifa_ref(nhop->nh_ifa);
+	nhop->nh_aifp = get_aifp(nhop, 1);
+	DPRINTF("AIFP: %p nh_ifp %p", nhop->nh_aifp, nhop->nh_ifp);
+
+	refcount_init(&nh_priv->nh_refcnt, 1);
+	/* nh_linked starts at 2, see the nhop_free() comments */
+	refcount_init(&nh_priv->nh_linked, 2);
+
+	print_nhop("FINALIZE", nhop);
+
+	if (link_nhop(ctl, nh_priv) != 0)
+		return (0);
+
+	/*
+	 * Linking to the datastructures failed.  The nexthop is not
+	 * used by anyone yet, so destroy it right away instead of
+	 * waiting for the end of the epoch.
+	 */
+	DPRINTF("link_nhop failed!");
+	destroy_nhop(nh_priv);
+
+	return (ENOBUFS);
+}
+
+/*
+ * Writes a printable form of @sa to @buf: the address for
+ * AF_INET/AF_INET6, "if#<index>" for AF_LINK and "af:<family>"
+ * for anything else.
+ */
+static void
+print_nhop_sa(char *buf, size_t buflen, const struct sockaddr *sa)
+{
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		inet_ntop(AF_INET,
+		    &((const struct sockaddr_in *)sa)->sin_addr, buf, buflen);
+		break;
+	case AF_INET6:
+		inet_ntop(AF_INET6,
+		    &((const struct sockaddr_in6 *)sa)->sin6_addr, buf, buflen);
+		break;
+	case AF_LINK:
+		snprintf(buf, buflen, "if#%d",
+		    ((const struct sockaddr_dl *)sa)->sdl_index);
+		break;
+	default:
+		snprintf(buf, buflen, "af:%d", sa->sa_family);
+		break;
+	}
+}
+
+/*
+ * Debug helper: prints the nexthop fields, prepending @prefix.
+ */
+static void
+print_nhop(const char *prefix, const struct nhop_object *nh)
+{
+	char gw_str[INET6_ADDRSTRLEN];
+	char ifa_str[INET6_ADDRSTRLEN];
+
+	print_nhop_sa(ifa_str, sizeof(ifa_str), nh->nh_ifa->ifa_addr);
+	print_nhop_sa(gw_str, sizeof(gw_str), &nh->gw_sa);
+
+	DPRINTF("%s nhop priv %p: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X",
+	    prefix, nh->nh_priv, nh->nh_priv->nh_family, nh->nh_ifp,
+	    if_name(nh->nh_ifp), gw_str, nh->nh_ifa, ifa_str, nh->nh_aifp,
+	    if_name(nh->nh_aifp), nh->nh_mtu, nh->nh_flags);
+}
+
+/*
+ * Drops the interface/address references held by the nexthop,
+ * frees its packet counter and returns the memory to the zone.
+ */
+static void
+destroy_nhop(struct nhop_priv *nh_priv)
+{
+	struct nhop_object *nhop = nh_priv->nh;
+
+	print_nhop("DEL", nhop);
+
+	if_rele(nhop->nh_ifp);
+	if_rele(nhop->nh_aifp);
+	ifa_free(nhop->nh_ifa);
+	counter_u64_free(nhop->nh_pksent);
+
+	uma_zfree(nhops_zone, nhop);
+}
+
+/*
+ * Epoch callback indicating nhop is safe to destroy
+ */
+static void
+destroy_nhop_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhop(__containerof(ctx, struct nhop_priv, nh_epoch_ctx));
+}
+
+/*
+ * Tries to acquire a reference on @nh.
+ * Returns the result of refcount_acquire_if_not_zero().
+ */
+int
+nhop_ref_object(struct nhop_object *nh)
+{
+	struct nhop_priv *priv = nh->nh_priv;
+
+	return (refcount_acquire_if_not_zero(&priv->nh_refcnt));
+}
+
+/*
+ * Releases a reference on @nh.  When the last reference is gone,
+ * the nexthop is unlinked (if still linked) and its destruction
+ * is scheduled via the net epoch.
+ */
+void
+nhop_free(struct nhop_object *nh)
+{
+	struct nhop_priv *priv = nh->nh_priv;
+	struct epoch_tracker et;
+	int reclaim = 1;
+
+	if (!refcount_release(&priv->nh_refcnt))
+		return;
+
+	/*
+	 * nh_linked is only ever decreased, and only in two places:
+	 * rib destroy (nhops_destroy_rib) and this function.
+	 * Its initial value of 2 allows to use
+	 * refcount_release_if_not_last() here.
+	 *
+	 * Two scenarios are possible:
+	 *
+	 * 1) nh_linked is 2: nhops_destroy_rib() has not been called
+	 *  OR it is running, but nh_control is guaranteed not to be
+	 *  freed in this epoch, so the nexthop can be safely unlinked.
+	 *
+	 * 2) nh_linked is 1: nhops_destroy_rib() has been called and
+	 *  the unlink can be skipped.
+	 */
+	NET_EPOCH_ENTER(et);
+	if (refcount_release_if_not_last(&priv->nh_linked) &&
+	    unlink_nhop(priv->nh_control, priv) == NULL) {
+		/* Do not try to reclaim */
+		DPRINTF("Failed to unlink nexhop %p", priv);
+		reclaim = 0;
+	}
+	NET_EPOCH_EXIT(et);
+
+	if (!reclaim)
+		return;
+
+	epoch_call(net_epoch_preempt, destroy_nhop_epoch,
+	    &priv->nh_epoch_ctx);
+}
+
+/* Same as nhop_ref_object() */
+int
+nhop_ref_any(struct nhop_object *nh)
+{
+
+	return (nhop_ref_object(nh));
+}
+
+/* Same as nhop_free() */
+void
+nhop_free_any(struct nhop_object *nh)
+{
+
+	nhop_free(nh);
+}
+
+
+/* Accessors for the control plane (nhop_priv) data */
+
+uint32_t
+nhop_get_idx(const struct nhop_object *nh)
+{
+	const struct nhop_priv *priv = nh->nh_priv;
+
+	return (priv->nh_idx);
+}
+
+enum nhop_type
+nhop_get_type(const struct nhop_object *nh)
+{
+	const struct nhop_priv *priv = nh->nh_priv;
+
+	return (priv->nh_type);
+}
+
+void
+nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type)
+{
+	struct nhop_priv *priv = nh->nh_priv;
+
+	priv->nh_type = nh_type;
+}
+
+int
+nhop_get_rtflags(const struct nhop_object *nh)
+{
+	const struct nhop_priv *priv = nh->nh_priv;
+
+	return (priv->rt_flags);
+}
+
+void
+nhop_set_rtflags(struct nhop_object *nh, int rt_flags)
+{
+	struct nhop_priv *priv = nh->nh_priv;
+
+	priv->rt_flags = rt_flags;
+}
+
+/*
+ * Walks all nexthops of @rh under the write lock and sets the MTU
+ * of the ones transmitting via @ifp to @mtu, unless the nexthop
+ * has RTF_FIXEDMTU set and its MTU does not exceed @mtu.
+ */
+void
+nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhop_priv *priv;
+	struct nhop_object *nhop;
+
+	NHOPS_WLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, priv) {
+		nhop = priv->nh;
+		if (nhop->nh_ifp != ifp)
+			continue;
+		if ((priv->rt_flags & RTF_FIXEDMTU) != 0 &&
+		    nhop->nh_mtu <= mtu)
+			continue;
+		/* Update MTU directly */
+		nhop->nh_mtu = mtu;
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_WUNLOCK(ctl);
+
+}
+
+/*
+ * Dumps a single entry to sysctl buffer.
+ *
+ * Layout:
+ *  rt_msghdr - generic RTM header to allow users to skip non-understood messages
+ *  nhop_external - nexthop description structure (with length)
+ *  nhop_addrs - structure encapsulating GW/SRC sockaddrs
+ *
+ * The fixed-size part above is followed by the gateway sockaddr and then
+ * the source sockaddr; their offsets are recorded in nhop_addrs.
+ *
+ * Returns 0 on success or the first non-zero SYSCTL_OUT() result.
+ */
+static int
+dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w)
+{
+	struct {
+		struct rt_msghdr	rtm;
+		struct nhop_external	nhe;
+		struct nhop_addrs	na;
+	} arpc;
+	struct nhop_external *pnhe;
+	struct sockaddr *gw_sa, *src_sa;
+	struct sockaddr_storage ss;
+	size_t addrs_len;
+	int error;
+
+	//DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w);
+
+	memset(&arpc, 0, sizeof(arpc));
+
+	/* Generic header; rtm_msglen is extended by the sockaddrs below */
+	arpc.rtm.rtm_msglen = sizeof(arpc);
+	arpc.rtm.rtm_version = RTM_VERSION;
+	arpc.rtm.rtm_type = RTM_GET;
+	//arpc.rtm.rtm_flags = RTF_UP;
+	arpc.rtm.rtm_flags = nh->nh_priv->rt_flags;
+
+	/* nhop_external */
+	pnhe = &arpc.nhe;
+	pnhe->nh_len = sizeof(struct nhop_external);
+	pnhe->nh_idx = nh->nh_priv->nh_idx;
+	pnhe->nh_fib = rh->rib_fibnum;
+	pnhe->ifindex = nh->nh_ifp->if_index;
+	pnhe->aifindex = nh->nh_aifp->if_index;
+	pnhe->nh_family = nh->nh_priv->nh_family;
+	pnhe->nh_type = nh->nh_priv->nh_type;
+	pnhe->nh_mtu = nh->nh_mtu;
+	pnhe->nh_flags = nh->nh_flags;
+
+	memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend));
+	pnhe->prepend_len = nh->nh_prepend_len;
+	pnhe->nh_refcount = nh->nh_priv->nh_refcnt;
+	pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent);
+
+	/* sockaddr container: gateway goes right after struct nhop_addrs */
+	addrs_len = sizeof(struct nhop_addrs);
+	arpc.na.gw_sa_off = addrs_len;
+	gw_sa = (struct sockaddr *)&nh->gw4_sa;
+	addrs_len += gw_sa->sa_len;
+
+	src_sa = nh->nh_ifa->ifa_addr;
+	if (src_sa->sa_family == AF_LINK) {
+		/* Shorten structure */
+		memset(&ss, 0, sizeof(struct sockaddr_storage));
+		fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss,
+		    nh->nh_ifa->ifa_ifp);
+		src_sa = (struct sockaddr *)&ss;
+	}
+	/* Source address follows the gateway */
+	arpc.na.src_sa_off = addrs_len;
+	addrs_len += src_sa->sa_len;
+
+	/* Write total container length */
+	arpc.na.na_len = addrs_len;
+
+	/* sizeof(arpc) already accounts for struct nhop_addrs itself */
+	arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs);
+
+	/* Emit the pieces in order, stopping at the first error */
+	error = SYSCTL_OUT(w, &arpc, sizeof(arpc));
+	if (error == 0)
+		error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len);
+	if (error == 0)
+		error = SYSCTL_OUT(w, src_sa, src_sa->sa_len);
+
+	return (error);
+}
+
+/*
+ * Dumps all nexthops of @rh to the sysctl request @w while holding
+ * the nexthop read lock.
+ *
+ * Returns 0 on success or the first error from dump_nhop_entry().
+ */
+int
+nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl;
+	struct nhop_priv *nh_priv;
+	int error;
+
+	ctl = rh->nh_control;
+
+	NHOPS_RLOCK(ctl);
+	DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+		error = dump_nhop_entry(rh, nh_priv->nh, w);
+		if (error != 0) {
+			/* Drop the lock before bailing out of the walk */
+			NHOPS_RUNLOCK(ctl);
+			return (error);
+		}
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+
+	return (0);
+}
+
diff --git a/sys/net/route/nhop_utils.c b/sys/net/route/nhop_utils.c
new file mode 100644
index 000000000000..56bca99c9ed8
--- /dev/null
+++ b/sys/net/route/nhop_utils.c
@@ -0,0 +1,219 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_mpath.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define	BLOCK_ITEMS	(8 * sizeof(u_long))	/* Number of items for ffsl() */
+
+/* Conversions between blocks (u_long words), bytes and items (bits) */
+#define	_BLOCKS_TO_SZ(_blocks)		((size_t)(_blocks) * sizeof(u_long))
+#define	_BLOCKS_TO_ITEMS(_blocks)	((uint32_t)(_blocks) * BLOCK_ITEMS)
+#define	_ITEMS_TO_BLOCKS(_items)	((_items) / BLOCK_ITEMS)
+
+
+static void _bitmask_init_idx(void *index, uint32_t items);
+
+/*
+ * Initializes @bh to use the index storage @idx able to hold
+ * @num_items items. If @idx is not NULL, it is initialized as well.
+ */
+void
+bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items)
+{
+
+	if (idx != NULL)
+		_bitmask_init_idx(idx, num_items);
+
+	memset(bh, 0, sizeof(struct bitmask_head));
+	bh->blocks = _ITEMS_TO_BLOCKS(num_items);
+	bh->idx = (u_long *)idx;
+}
+
+/*
+ * Returns the number of items the index should be resized to
+ * (twice the current capacity) if more than a half of the index
+ * is in use and fewer than 65536 items are allocated.
+ * Returns 0 if no resize is needed.
+ */
+uint32_t
+bitmask_get_resize_items(const struct bitmask_head *bh)
+{
+	if ((bh->items_count * 2 > _BLOCKS_TO_ITEMS(bh->blocks)) && bh->items_count < 65536)
+		return (_BLOCKS_TO_ITEMS(bh->blocks) * 2);
+
+	return (0);
+}
+
+/*
+ * Returns non-zero if the index needs to be resized.
+ */
+int
+bitmask_should_resize(const struct bitmask_head *bh)
+{
+
+	return (bitmask_get_resize_items(bh) != 0);
+}
+
+#if 0
+uint32_t
+_bitmask_get_blocks(uint32_t items)
+{
+
+	return (items / BLOCK_ITEMS);
+}
+#endif
+
+/*
+ * Returns the number of bytes needed to store an index of @items items.
+ * @items has to be a multiple of BLOCK_ITEMS.
+ */
+size_t
+bitmask_get_size(uint32_t items)
+{
+#ifdef _KERNEL
+	KASSERT((items % BLOCK_ITEMS) == 0,
+	   ("bitmask size needs to be a multiple of %zu",
+	    BLOCK_ITEMS));
+#else
+	assert((items % BLOCK_ITEMS) == 0);
+#endif
+
+	return (items / 8);
+}
+
+/*
+ * Marks all @items items in the index storage @_idx as free,
+ * except item 0, which is never handed out.
+ */
+static void
+_bitmask_init_idx(void *_idx, uint32_t items)
+{
+	size_t size = bitmask_get_size(items);
+	u_long *idx = (u_long *)_idx;
+
+	/* Mark all as free */
+	memset(idx, 0xFF, size);
+	*idx &= ~(u_long)1; /* Always skip index 0 */
+}
+
+
+/*
+ * Initializes the new index storage @new_idx of @new_items items and
+ * copies the state of the existing index @bi into it.
+ * Only growing is supported.
+ *
+ * Returns 0 on success, 1 if the new index is not larger than the
+ * current one (nothing is copied in that case).
+ *
+ * _try_merge api to allow shrinking?
+ */
+int
+bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items)
+{
+	/* Compare in blocks: bi->blocks counts u_long words, not items */
+	uint32_t new_blocks = _ITEMS_TO_BLOCKS(new_items);
+
+	_bitmask_init_idx(new_idx, new_items);
+
+	if (bi->blocks >= new_blocks) {
+		/*
+		 * XXX: shrinking would require ensuring that all blocks
+		 * beyond the new size have no items allocated.
+		 */
+		return (1);
+	}
+
+	/* extend current blocks */
+	if (bi->blocks > 0)
+		memcpy(new_idx, bi->idx, _BLOCKS_TO_SZ(bi->blocks));
+
+	return (0);
+}
+
+/*
+ * Switches @bh to the new index storage @new_idx of @new_items items.
+ * The previous storage pointer is stored in @pidx if it is not NULL.
+ */
+void
+bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx)
+{
+	void *old_ptr;
+
+	old_ptr = bh->idx;
+
+	bh->idx = (u_long *)new_idx;
+	bh->blocks = _ITEMS_TO_BLOCKS(new_items);
+
+	if (pidx != NULL)
+		*pidx = old_ptr;
+}
+
+/*
+ * Allocates new index in given instance and stores it in @pidx.
+ * Returns 0 on success, 1 if there are no free indexes.
+ */
+int
+bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx)
+{
+	u_long *mask;
+	int i, off, v;
+
+	/* Blocks before free_off are known to be fully allocated */
+	off = bi->free_off;
+	mask = &bi->idx[off];
+
+	for (i = off; i < bi->blocks; i++, mask++) {
+		/* ffsl() returns the 1-based position of the first set bit */
+		if ((v = ffsl(*mask)) == 0)
+			continue;
+
+		/* Mark as busy */
+		*mask &= ~ ((u_long)1 << (v - 1));
+
+		bi->free_off = i;
+
+		v = BLOCK_ITEMS * i + v - 1;
+
+		*pidx = v;
+		bi->items_count++;
+		return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Removes index from given set.
+ * Returns 0 on success.
+ */
+int
+bitmask_free_idx(struct bitmask_head *bi, uint16_t idx)
+{
+	u_long *word, bit;
+	int blk;
+
+	/* Index 0 is never allocated */
+	if (idx == 0)
+		return (1);
+
+	blk = idx / BLOCK_ITEMS;
+	if (blk >= bi->blocks)
+		return (1);
+
+	word = &bi->idx[blk];
+	bit = (u_long)1 << (idx % BLOCK_ITEMS);
+
+	/* A set bit means the index is free already */
+	if ((*word & bit) != 0)
+		return (1);
+
+	/* Mark as free */
+	*word |= bit;
+	bi->items_count--;
+
+	/* Let the next allocation start no later than this block */
+	if (bi->free_off > blk)
+		bi->free_off = blk;
+
+	return (0);
+}
+
diff --git a/sys/net/route/nhop_utils.h b/sys/net/route/nhop_utils.h
new file mode 100644
index 000000000000..a2876178cbb0
--- /dev/null
+++ b/sys/net/route/nhop_utils.h
@@ -0,0 +1,200 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_NET_ROUTE_NHOP_UTILS_H_
+#define	_NET_ROUTE_NHOP_UTILS_H_
+
+/* Chained hash table */
+struct _cht_head {
+	uint32_t	hash_size;
+	uint32_t	items_count;
+	void		**ptr;
+};
+
+/*
+ * Returns the new number of buckets the table should be resized to:
+ * twice the current size if more than a half of the buckets' worth of
+ * items is stored (up to 65536 buckets), half of the current size if
+ * fewer than a quarter is stored (down to 16 buckets).
+ * Returns 0 if no resize is needed.
+ */
+static inline uint32_t
+_cht_get_resize_size(const struct _cht_head *head)
+{
+	uint32_t new_size = 0;
+
+	if ((head->items_count * 2 > head->hash_size) && (head->hash_size < 65536))
+		new_size = head->hash_size * 2;
+	else if ((head->items_count * 4 < head->hash_size) && head->hash_size > 16)
+		new_size = head->hash_size / 2;
+
+	return (new_size);
+}
+
+/* Returns non-zero if the table needs to be resized */
+static inline int
+_cht_need_resize(const struct _cht_head *head)
+{
+
+	return (_cht_get_resize_size(head) > 0);
+}
+
+
+#ifndef typeof
+#define typeof	__typeof
+#endif
+
+#define	CHT_SLIST_NEED_RESIZE(_head)					\
+	_cht_need_resize((const struct _cht_head *)(_head))
+#define	CHT_SLIST_GET_RESIZE_BUCKETS(_head)				\
+	_cht_get_resize_size((const struct _cht_head *)(_head))
+#define	CHT_SLIST_GET_RESIZE_SIZE(_buckets)	((_buckets) * sizeof(void *))
+
+/* Defines the typed table head; layout matches struct _cht_head */
+#define	CHT_SLIST_DEFINE(_HNAME, _ITEM_TYPE)				\
+struct _HNAME##_head {							\
+	uint32_t	hash_size;					\
+	uint32_t	items_count;					\
+	_ITEM_TYPE	**ptr;						\
+}
+
+/* Note: expands to multiple statements, use as a statement of its own */
+#define	CHT_SLIST_INIT(_head, _ptr, _num_buckets)			\
+	(_head)->hash_size = (_num_buckets);				\
+	(_head)->items_count = 0;					\
+	(_head)->ptr = (_ptr);
+
+/*
+ * Default hash method for constant-size keys.
+ * The table size is expected to be a power of 2, as the bucket is
+ * selected by masking the hash value with (hash_size - 1).
+ * The expansion is parenthesized so the macros can be safely used
+ * inside larger expressions.
+ */
+
+#define	CHT_GET_BUCK(_head, _PX, _key)	\
+	(_PX##_hash_key(_key) & ((_head)->hash_size - 1))
+#define	CHT_GET_BUCK_OBJ(_head, _PX, _obj)	\
+	(_PX##_hash_obj(_obj) & ((_head)->hash_size - 1))
+
+#define	CHT_FIRST(_head, idx)	_CHT_FIRST((_head)->ptr, idx)
+#define	_CHT_FIRST(_ptr, idx)	(_ptr)[idx]
+
+/* Finds the item matching @_key; @_ret is set to NULL if not found */
+#define	CHT_SLIST_FIND(_head, _PX, _key, _ret) do {			\
+	uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key);		\
+	_ret = CHT_FIRST(_head, _buck);					\
+	for ( ; _ret != NULL; _ret = _PX##_next(_ret)) {		\
+		if (_PX##_cmp(_key, (_ret)))				\
+			break;						\
+	}								\
+} while(0)
+
+/*
+ * Finds the item equal to @_obj using the <prefix>_hash_obj() and
+ * <prefix>_cmp() methods; @_ret is set to NULL if not found.
+ */
+#define	CHT_SLIST_FIND_BYOBJ(_head, _PX, _obj, _ret) do {		\
+	uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj);		\
+	_ret = CHT_FIRST(_head, _buck);					\
+	for ( ; _ret != NULL; _ret = _PX##_next(_ret)) {		\
+		if (_PX##_cmp(_obj, _ret))				\
+			break;						\
+	}								\
+} while(0)
+
+/* Inserts @_obj at the head of its bucket chain */
+#define	CHT_SLIST_INSERT_HEAD(_head, _PX, _obj) do {			\
+	uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj);		\
+	_PX##_next(_obj) = CHT_FIRST(_head, _buck);			\
+	CHT_FIRST(_head, _buck) = _obj;					\
+	(_head)->items_count++;						\
+} while(0)
+
+/* Unlinks the item matching @_key; @_ret is the item or NULL */
+#define	CHT_SLIST_REMOVE(_head, _PX, _key, _ret) do {			\
+	typeof(*(_head)->ptr) _tmp;					\
+	uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key);		\
+	_ret = CHT_FIRST(_head, _buck);					\
+	_tmp = NULL;							\
+	for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) {	\
+		if (_PX##_cmp(_key, _ret))				\
+			break;						\
+	}								\
+	if (_ret != NULL) {						\
+		if (_tmp == NULL)					\
+			CHT_FIRST(_head, _buck) = _PX##_next(_ret);	\
+		else							\
+			_PX##_next(_tmp) = _PX##_next(_ret);		\
+		(_head)->items_count--;					\
+	}								\
+} while(0)
+
+/* Unlinks the item equal to @_obj; @_ret is the item or NULL */
+#define	CHT_SLIST_REMOVE_BYOBJ(_head, _PX, _obj, _ret) do {		\
+	typeof(*(_head)->ptr) _tmp;					\
+	uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj);		\
+	_ret = CHT_FIRST(_head, _buck);					\
+	_tmp = NULL;							\
+	for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) {	\
+		if (_PX##_cmp(_obj, _ret))				\
+			break;						\
+	}								\
+	if (_ret != NULL) {						\
+		if (_tmp == NULL)					\
+			CHT_FIRST(_head, _buck) = _PX##_next(_ret);	\
+		else							\
+			_PX##_next(_tmp) = _PX##_next(_ret);		\
+		(_head)->items_count--;					\
+	}								\
+} while(0)
+
+
+/*
+ * Iterates over all items. Opens two nested loops: the body follows
+ * the macro and the outer loop is closed by CHT_SLIST_FOREACH_END.
+ */
+#define	CHT_SLIST_FOREACH(_head, _PX, _x)				\
+	for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) {		\
+		for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x))
+
+#define	CHT_SLIST_FOREACH_END	}
+
+/*
+ * Rehashes all items into the new bucket array @_new_void_ptr of
+ * @_new_hsize buckets. On completion @_new_void_ptr holds the old
+ * bucket array. Declares local variables, so it can be used only once
+ * per scope.
+ */
+#define	CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize)		\
+	uint32_t _new_idx;						\
+	typeof((_head)->ptr) _new_ptr = (void *)_new_void_ptr;		\
+	typeof(*(_head)->ptr) _x, _y;					\
+	for (uint32_t _old_idx = 0; _old_idx < (_head)->hash_size; _old_idx++) {\
+		_x = CHT_FIRST(_head, _old_idx);			\
+		_y = _x;						\
+		while (_y != NULL) {					\
+			_y = _PX##_next(_x);				\
+			_new_idx = _PX##_hash_obj(_x) & ((_new_hsize) - 1);\
+			_PX##_next(_x) = _CHT_FIRST(_new_ptr, _new_idx);\
+			_CHT_FIRST(_new_ptr, _new_idx) = _x;		\
+			_x = _y;					\
+		}							\
+	}								\
+	(_head)->hash_size = (_new_hsize);				\
+	_new_void_ptr = (void *)(_head)->ptr;				\
+	(_head)->ptr = _new_ptr;
+
+/* bitmasks */
+
+struct bitmask_head {
+	uint16_t	free_off;	/* index of the first potentially free block */
+	uint16_t	blocks;		/* number of 4/8-byte blocks in the index */
+	uint32_t	items_count;	/* total number of items */
+	u_long		*idx;
+};
+
+size_t bitmask_get_size(uint32_t items);
+uint32_t bitmask_get_resize_items(const struct bitmask_head *bh);
+int bitmask_should_resize(const struct bitmask_head *bh);
+void bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx);
+void bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items);
+int bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items);
+int bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx);
+int bitmask_free_idx(struct bitmask_head *bi, uint16_t idx);
+
+#endif
+
diff --git a/sys/net/route/nhop_var.h b/sys/net/route/nhop_var.h
new file mode 100644
index 000000000000..4bf26ff54269
--- /dev/null
+++ b/sys/net/route/nhop_var.h
@@ -0,0 +1,96 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V.
Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains private definitions for nexthop routing. + * + * Header is not intended to be included by the code external to the + * routing subsystem. 
+ */
+
+#ifndef	_NET_ROUTE_NHOP_VAR_H_
+#define	_NET_ROUTE_NHOP_VAR_H_
+
+/* define nhop hash table */
+struct nhop_priv;
+CHT_SLIST_DEFINE(nhops, struct nhop_priv);
+/* produce hash value for an object */
+#define	nhops_hash_obj(_obj)	hash_priv(_obj)
+/* compare two objects */
+#define	nhops_cmp(_one, _two)	cmp_priv(_one, _two)
+/* next object accessor (used both as a value and as an assignment target) */
+#define	nhops_next(_obj)	(_obj)->nh_next
+
+
+/* Per-rib nexthop control structure */
+struct nh_control {
+	struct nhops_head	nh_head;	/* hash table head */
+	struct bitmask_head	nh_idx_head;	/* nhop index head */
+	struct rwlock		ctl_lock;	/* overall ctl lock */
+	struct rib_head		*ctl_rh;	/* pointer back to rnh */
+	struct epoch_context	ctl_epoch_ctx;	/* epoch ctl helper */
+};
+
+/* Wrappers around the rwlock protecting struct nh_control */
+#define	NHOPS_WLOCK(ctl)	rw_wlock(&(ctl)->ctl_lock)
+#define	NHOPS_RLOCK(ctl)	rw_rlock(&(ctl)->ctl_lock)
+#define	NHOPS_WUNLOCK(ctl)	rw_wunlock(&(ctl)->ctl_lock)
+#define	NHOPS_RUNLOCK(ctl)	rw_runlock(&(ctl)->ctl_lock)
+#define	NHOPS_LOCK_INIT(ctl)	rw_init(&(ctl)->ctl_lock, "nhop_ctl")
+#define	NHOPS_LOCK_DESTROY(ctl)	rw_destroy(&(ctl)->ctl_lock)
+#define	NHOPS_WLOCK_ASSERT(ctl)	rw_assert(&(ctl)->ctl_lock, RA_WLOCKED)
+
+
+/* Control plane-only nhop data */
+struct nhop_object;
+struct nhop_priv {
+	uint32_t		nh_idx;		/* nexthop index */
+	uint8_t			nh_family;	/* address family of the lookup */
+	uint16_t		nh_type;	/* nexthop type */
+	void			*cb_func;	/* function handling additional rewrite caps */
+	u_int			nh_refcnt;	/* number of references, refcount(9) */
+	u_int			nh_linked;	/* refcount(9), == 2 if linked to the list */
+	int			rt_flags;	/* routing flags for the control plane */
+	struct nhop_object	*nh;		/* backreference to the dataplane nhop */
+	struct nh_control	*nh_control;	/* backreference to the rnh */
+	struct nhop_priv	*nh_next;	/* hash table membership */
+	struct epoch_context	nh_epoch_ctx;	/* epoch data for nhop */
+};
+
+/* Non-zero if the nexthop has RTF_PINNED set in its control plane flags */
+#define	NH_IS_PINNED(_nh)	((_nh)->nh_priv->rt_flags & RTF_PINNED)
+
+/* nhop.c */
+struct nhop_priv *find_nhop(struct nh_control *ctl,
+    const struct nhop_priv *nh_priv);
+int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+
+/* nhop_ctl.c */
+int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two);
+
+#endif
+
diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c
new file mode 100644
index 000000000000..09c2ded80796
--- /dev/null
+++ b/sys/net/route/route_ctl.c
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * This file contains control plane routing table functions. + * + * All functions assume they are called in net epoch. + */ + + diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c new file mode 100644 index 000000000000..c124a52b0b77 --- /dev/null +++ b/sys/net/route/route_helpers.c @@ -0,0 +1,83 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * RIB helper functions. + */ + +/* + * Calls @wa_f with @arg for each entry in the table specified by + * @af and @fibnum. + * + * Table is traversed under read lock. + */ +void +rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rnh; + + if ((rnh = rt_tables_get_rnh(fibnum, af)) == NULL) + return; + + RIB_RLOCK(rnh); + rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); + RIB_RUNLOCK(rnh); +} + diff --git a/sys/net/route/shared.h b/sys/net/route/shared.h new file mode 100644 index 000000000000..a4476373dd97 --- /dev/null +++ b/sys/net/route/shared.h @@ -0,0 +1,68 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Contains various definitions shared between the parts of a routing subsystem. + * + * Header is not intended to be included by the code external to the + * routing subsystem. + */ + +#ifndef _NET_ROUTE_SHARED_H_ +#define _NET_ROUTE_SHARED_H_ + +#ifdef RTDEBUG +#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__) +#else +#define DPRINTF(_fmt, ...) 
+#endif + +struct rib_head; + +/* Nexthops */ +void nhops_init(void); +int nhops_init_rib(struct rib_head *rh); +void nhops_destroy_rib(struct rib_head *rh); +int nhop_ref_object(struct nhop_object *nh); +int nhop_ref_any(struct nhop_object *nh); +void nhop_free_any(struct nhop_object *nh); + +void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type); +void nhop_set_rtflags(struct nhop_object *nh, int rt_flags); + +int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object **nh_ret); +int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig, + struct rt_addrinfo *info, struct nhop_object **pnh_priv); + +void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); +int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +#endif + diff --git a/sys/net/route_var.h b/sys/net/route_var.h index db0aa07f60a9..92b5e433a972 100644 --- a/sys/net/route_var.h +++ b/sys/net/route_var.h @@ -32,6 +32,10 @@ #ifndef _NET_ROUTE_VAR_H_ #define _NET_ROUTE_VAR_H_ +struct nh_control; +typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr, + const struct sockaddr *mask, struct nhop_object *nh); + struct rib_head { struct radix_head head; rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ @@ -41,6 +45,7 @@ struct rib_head { rn_walktree_t *rnh_walktree; /* traverse tree */ rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ rn_close_t *rnh_close; /*do something when the last ref drops*/ + rnh_preadd_entry_f_t *rnh_preadd; /* hook to alter record prior to insertion */ rt_gen_t rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ?
*/ struct radix_node rnh_nodes[3]; /* empty tree for common case */ @@ -51,6 +56,7 @@ struct rib_head { u_int rib_fibnum; /* fib number */ struct callout expire_callout; /* Callout for expiring dynamic routes */ time_t next_expire; /* Next expire run ts */ + struct nh_control *nh_control; /* nexthop subsystem data */ }; #define RIB_RLOCK_TRACKER struct rm_priotracker _rib_tracker @@ -90,6 +96,44 @@ _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new) struct rib_head *rt_tables_get_rnh(int fib, int family); void rt_mpath_init_rnh(struct rib_head *rnh); +VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat); +#define RTSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val)) +#define RTSTAT_INC(name) RTSTAT_ADD(name, 1) + +/* + * With the split between the routing entry and the nexthop, + * rt_flags has to be split between these 2 entries. As rtentry + * mostly contains prefix data and is thought to be generic enough + * so one can transparently change the nexthop pointer w/o requiring + * any other rtentry changes, most of rt_flags shifts to the particular nexthop. + * + * + * RTF_UP: rtentry, as an indication that it is linked. + * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath + * RTF_DYNAMIC: nhop, to make rtentry generic. + * RTF_MODIFIED: nhop, to make rtentry generic.
(legacy) + * -- "native" path (nhop) properties: + * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU, + * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST + */ + +/* Nexthop rt flags mask */ +#define NHOP_RT_FLAG_MASK (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \ + RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \ + RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST) + +/* rtentry rt flag mask */ +#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST) + +/* Nexthop selection */ +#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh)) +#define _SELECT_NHOP(_nh, _flowid) \ + (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size] +#define _RT_SELECT_NHOP(_nh, _flowid) \ + ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid)) +#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid) + /* rte<>nhop translation */ static inline uint16_t fib_rte_to_nh_flags(int rt_flags) diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 521d2fdc3d99..0ba071459ca6 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -77,6 +77,7 @@ #include #include #endif +#include #ifdef COMPAT_FREEBSD32 #include @@ -1076,6 +1077,7 @@ rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); + out->rmx_nhidx = nhop_get_idx(rt->rt_nhop); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? 
rt->rt_expire - time_uptime + time_second : 0; @@ -2025,7 +2027,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) namelen--; if (req->newptr) return (EPERM); - if (name[1] == NET_RT_DUMP) { + if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) @@ -2092,7 +2094,25 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) error = EAFNOSUPPORT; } break; - + case NET_RT_NHOP: + /* Allow dumping one specific af/fib at a time; valid fibs are 0..rt_numfibs-1 */ + if (namelen < 4) { + error = EINVAL; + break; + } + fib = name[3]; + if (fib < 0 || fib >= rt_numfibs) { + error = EINVAL; + break; + } + rnh = rt_tables_get_rnh(fib, af); + if (rnh == NULL) { + error = EAFNOSUPPORT; + break; + } + if (w.w_op == NET_RT_NHOP) + error = nhops_dump_sysctl(rnh, w.w_req); + break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); diff --git a/sys/netinet/in_fib.c b/sys/netinet/in_fib.c index f7a02e36a40b..4456856c426e 100644 --- a/sys/netinet/in_fib.c +++ b/sys/netinet/in_fib.c @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #ifdef RADIX_MPATH @@ -60,59 +62,49 @@ __FBSDID("$FreeBSD$"); #include #ifdef INET -static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, +static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_basic *pnh4); -static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, +static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4); #define RNTORT(p) ((struct rtentry *)(p)) static void -fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst, +fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_basic *pnh4) { - struct sockaddr_in *gw; if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; + pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; + else + pnh4->nh_ifp = nh->nh_ifp; +
pnh4->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) + pnh4->nh_addr = nh->gw4_sa.sin_addr; else - pnh4->nh_ifp = rte->rt_ifp; - pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); - if (rte->rt_flags & RTF_GATEWAY) { - gw = (struct sockaddr_in *)rte->rt_gateway; - pnh4->nh_addr = gw->sin_addr; - } else pnh4->nh_addr = dst; /* Set flags */ - pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in *)rt_key(rte); - if (gw->sin_addr.s_addr == 0) - pnh4->nh_flags |= NHF_DEFAULT; + pnh4->nh_flags = nh->nh_flags; /* TODO: Handle RTF_BROADCAST here */ } static void -fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, +fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4) { - struct sockaddr_in *gw; if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; + pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; + else + pnh4->nh_ifp = nh->nh_ifp; + pnh4->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) + pnh4->nh_addr = nh->gw4_sa.sin_addr; else - pnh4->nh_ifp = rte->rt_ifp; - pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu); - if (rte->rt_flags & RTF_GATEWAY) { - gw = (struct sockaddr_in *)rte->rt_gateway; - pnh4->nh_addr = gw->sin_addr; - } else pnh4->nh_addr = dst; /* Set flags */ - pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in *)rt_key(rte); - if (gw->sin_addr.s_addr == 0) - pnh4->nh_flags |= NHF_DEFAULT; - pnh4->nh_ia = ifatoia(rte->rt_ifa); + pnh4->nh_flags = nh->nh_flags; + pnh4->nh_ia = ifatoia(nh->nh_ifa); pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr; } @@ -135,7 +127,7 @@ fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags, struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); @@ -150,10 +142,10 @@ fib4_lookup_nh_basic(uint32_t 
fibnum, struct in_addr dst, uint32_t flags, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); + nh = RNTORT(rn)->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib4_rte_to_nh_basic(rte, dst, flags, pnh4); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib4_rte_to_nh_basic(nh, dst, flags, pnh4); RIB_RUNLOCK(rh); return (0); @@ -185,6 +177,7 @@ fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, struct radix_node *rn; struct sockaddr_in sin; struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); @@ -207,9 +200,10 @@ fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, return (ENOENT); } #endif + nh = rte->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib4_rte_to_nh_extended(rte, dst, flags, pnh4); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib4_rte_to_nh_extended(nh, dst, flags, pnh4); if ((flags & NHR_REF) != 0) { /* TODO: lwref on egress ifp's ? */ } @@ -229,4 +223,138 @@ fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4) } +/* + * Looks up path in fib @fibnum specified by @dst. + * Returns path nexthop on success. Nexthop is safe to use + * within the current network epoch. If longer lifetime is required, + * one needs to pass NHR_REF as a flag. This will return referenced + * nexthop. 
+ */ +struct nhop_object * +fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct rtentry *rt; + struct nhop_object *nh; + + KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (NULL); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_family = AF_INET; + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = dst; + + nh = NULL; + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = RNTORT(rn); +#ifdef RADIX_MPATH + if (rt_mpath_next(rt) != NULL) + rt = rt_mpath_selectrte(rt, flowid); +#endif + nh = rt->rt_nhop; + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + RIB_RUNLOCK(rh); + return (nh); + } + } + RIB_RUNLOCK(rh); + + RTSTAT_INC(rts_unreach); + return (NULL); +} + +inline static int +check_urpf(const struct nhop_object *nh, uint32_t flags, + const struct ifnet *src_if) +{ + + if (src_if != NULL && nh->nh_aifp == src_if) { + return (1); + } + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if ((nh->nh_flags & NHF_DEFAULT) == 0) + return (1); + } + + return (0); +} + +#ifdef RADIX_MPATH +inline static int +check_urpf_mpath(struct rtentry *rt, uint32_t flags, + const struct ifnet *src_if) +{ + + while (rt != NULL) { + if (check_urpf(rt->rt_nhop, flags, src_if) != 0) + return (1); + rt = rt_mpath_next(rt); + } + + return (0); +} +#endif + +/* + * Performs reverse path forwarding lookup. + * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that a route exists. + * If @flags contains NHR_NODEFAULT, do not consider default route.
+ * + * Returns 1 if route matching conditions is found, 0 otherwise. + */ +int +fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct rtentry *rt; + int ret; + + KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET); + if (rh == NULL) + return (0); + + /* Prepare lookup key */ + struct sockaddr_in sin4; + memset(&sin4, 0, sizeof(sin4)); + sin4.sin_len = sizeof(struct sockaddr_in); + sin4.sin_addr = dst; + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = RNTORT(rn); +#ifdef RADIX_MPATH + ret = check_urpf_mpath(rt, flags, src_if); +#else + ret = check_urpf(rt->rt_nhop, flags, src_if); +#endif + RIB_RUNLOCK(rh); + return (ret); + } + RIB_RUNLOCK(rh); + + return (0); +} + #endif diff --git a/sys/netinet/in_fib.h b/sys/netinet/in_fib.h index f0b4d159d5e1..ff78967061e4 100644 --- a/sys/netinet/in_fib.h +++ b/sys/netinet/in_fib.h @@ -58,5 +58,9 @@ int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, uint32_t flowid, struct nhop4_extended *pnh4); void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4); +struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst, + uint32_t scopeid, uint32_t flags, uint32_t flowid); +int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, + uint32_t flags, const struct ifnet *src_if); #endif diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index 8ea777b90158..eeb7760c5ccb 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -43,6 +43,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include @@ -56,6 +58,67 @@ extern int in_inithead(void **head, int off, u_int fibnum); extern int in_detachhead(void **head, int off); #endif +static int 
+rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, + struct nhop_object *nh) +{ + const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr; + uint16_t nh_type; + int rt_flags; + + /* XXX: RTF_LOCAL && RTF_MULTICAST */ + + rt_flags = nhop_get_rtflags(nh); + + if (rt_flags & RTF_HOST) { + + /* + * Backward compatibility: + * if the destination is broadcast, + * mark route as broadcast. + * This behavior was useful when route cloning + * was in place, so there was an explicit cloned + * route for every broadcasted address. + * Currently (2020-04) there is no kernel machinery + * to do route cloning, though someone might explicitly + * add these routes to support some cases with active-active + * load balancing. Given that, retain this support. + */ + if (in_broadcast(addr4->sin_addr, nh->nh_ifp)) { + rt_flags |= RTF_BROADCAST; + nhop_set_rtflags(nh, rt_flags); + nh->nh_flags |= NHF_BROADCAST; + } + } + + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. + */ + if (nh->nh_mtu == 0) { + nh->nh_mtu = nh->nh_ifp->if_mtu; + } else if (nh->nh_mtu > nh->nh_ifp->if_mtu) + nh->nh_mtu = nh->nh_ifp->if_mtu; + + /* Ensure that default route nhop has special flag */ + const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask; + if ((rt_flags & RTF_HOST) == 0 && mask4->sin_addr.s_addr == 0) + nh->nh_flags |= NHF_DEFAULT; + + /* Set nhop type to basic per-AF nhop */ + if (nhop_get_type(nh) == 0) { + if (nh->nh_flags & NHF_GATEWAY) + nh_type = NH_TYPE_IPV4_ETHER_NHOP; + else + nh_type = NH_TYPE_IPV4_ETHER_RSLV; + + nhop_set_type(nh, nh_type); + } + + return (0); +} + /* * Do what we need to do when inserting a route. 
*/ @@ -126,6 +189,7 @@ in_inithead(void **head, int off, u_int fibnum) if (rh == NULL) return (0); + rh->rnh_preadd = rib4_preadd; rh->rnh_addaddr = in_addroute; #ifdef RADIX_MPATH rt_mpath_init_rnh(rh); diff --git a/sys/netinet6/in6_fib.c b/sys/netinet6/in6_fib.c index ae4beab3b5ce..b3effb2b422e 100644 --- a/sys/netinet6/in6_fib.c +++ b/sys/netinet6/in6_fib.c @@ -50,6 +50,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #ifdef RADIX_MPATH @@ -68,94 +70,63 @@ __FBSDID("$FreeBSD$"); #include #ifdef INET6 -static void fib6_rte_to_nh_extended(struct rtentry *rte, +static void fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6); -static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst, +static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_basic *pnh6); -static struct ifnet *fib6_get_ifaifp(struct rtentry *rte); #define RNTORT(p) ((struct rtentry *)(p)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst); -/* - * Gets real interface for the @rte. - * Returns rt_ifp for !IFF_LOOPBACK routers. - * Extracts "real" address interface from interface address - * loopback routes. 
- */ -static struct ifnet * -fib6_get_ifaifp(struct rtentry *rte) -{ - struct ifnet *ifp; - struct sockaddr_dl *sdl; - ifp = rte->rt_ifp; - if ((ifp->if_flags & IFF_LOOPBACK) && - rte->rt_gateway->sa_family == AF_LINK) { - sdl = (struct sockaddr_dl *)rte->rt_gateway; - return (ifnet_byindex(sdl->sdl_index)); - } - - return (ifp); -} static void -fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst, +fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_basic *pnh6) { - struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = fib6_get_ifaifp(rte); + pnh6->nh_ifp = nh->nh_aifp; else - pnh6->nh_ifp = rte->rt_ifp; + pnh6->nh_ifp = nh->nh_ifp; - pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); - if (rte->rt_flags & RTF_GATEWAY) { + pnh6->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) { /* Return address with embedded scope. 
*/ - gw = (struct sockaddr_in6 *)rte->rt_gateway; - pnh6->nh_addr = gw->sin6_addr; + pnh6->nh_addr = nh->gw6_sa.sin6_addr; } else pnh6->nh_addr = *dst; /* Set flags */ - pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in6 *)rt_key(rte); - if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) - pnh6->nh_flags |= NHF_DEFAULT; + pnh6->nh_flags = nh->nh_flags; } static void -fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst, +fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6) { - struct sockaddr_in6 *gw; /* Do explicit nexthop zero unless we're copying it */ memset(pnh6, 0, sizeof(*pnh6)); if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = fib6_get_ifaifp(rte); + pnh6->nh_ifp = nh->nh_aifp; else - pnh6->nh_ifp = rte->rt_ifp; + pnh6->nh_ifp = nh->nh_ifp; - pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp)); - if (rte->rt_flags & RTF_GATEWAY) { + pnh6->nh_mtu = nh->nh_mtu; + if (nh->nh_flags & NHF_GATEWAY) { /* Return address with embedded scope. 
*/ - gw = (struct sockaddr_in6 *)rte->rt_gateway; - pnh6->nh_addr = gw->sin6_addr; + pnh6->nh_addr = nh->gw6_sa.sin6_addr; } else pnh6->nh_addr = *dst; /* Set flags */ - pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags); - gw = (struct sockaddr_in6 *)rt_key(rte); - if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr)) - pnh6->nh_flags |= NHF_DEFAULT; - pnh6->nh_ia = ifatoia6(rte->rt_ifa); + pnh6->nh_flags = nh->nh_flags; + pnh6->nh_ia = ifatoia6(nh->nh_ifa); } /* @@ -180,7 +151,7 @@ fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, uint32_t scope struct rib_head *rh; struct radix_node *rn; struct sockaddr_in6 sin6; - struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -198,10 +169,10 @@ fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, uint32_t scope RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); + nh = RNTORT(rn)->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6); + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6); RIB_RUNLOCK(rh); return (0); } @@ -231,6 +202,7 @@ fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,uint32_t scopeid, struct radix_node *rn; struct sockaddr_in6 sin6; struct rtentry *rte; + struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); @@ -256,9 +228,10 @@ fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,uint32_t scopeid, return (ENOENT); } #endif + nh = rte->rt_nhop; /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(rte->rt_ifp)) { - fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags, + if (RT_LINK_IS_UP(nh->nh_ifp)) { + fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags, pnh6); if ((flags & 
NHR_REF) != 0) { /* TODO: Do lwref on egress ifp's */ @@ -279,5 +252,145 @@ fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6) } +/* + * Looks up path in fib @fibnum specified by @dst. + * Assumes scope is deembedded and provided in @scopeid. + * + * Returns path nexthop on success. Nexthop is safe to use + * within the current network epoch. If longer lifetime is required, + * one needs to pass NHR_REF as a flag. This will return referenced + * nexthop. + */ +struct nhop_object * +fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, uint32_t flowid) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct rtentry *rt; + struct nhop_object *nh; + struct sockaddr_in6 sin6; + + KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET6); + if (rh == NULL) + return (NULL); + + /* TODO: radix changes */ + //addr = *dst6; + /* Prepare lookup key */ + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = *dst6; + + /* Assume scopeid is valid and embed it directly */ + if (IN6_IS_SCOPE_LINKLOCAL(dst6)) + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = RNTORT(rn); +#ifdef RADIX_MPATH + if (rt_mpath_next(rt) != NULL) + rt = rt_mpath_selectrte(rt, flowid); +#endif + nh = rt->rt_nhop; + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(nh->nh_ifp)) { + if (flags & NHR_REF) + nhop_ref_object(nh); + RIB_RUNLOCK(rh); + return (nh); + } + } + RIB_RUNLOCK(rh); + + RTSTAT_INC(rts_unreach); + return (NULL); +} + +inline static int +check_urpf(const struct nhop_object *nh, uint32_t flags, + const struct ifnet *src_if) +{ + + if (src_if != NULL && nh->nh_aifp == src_if) { + return (1); + } + if (src_if == NULL) { + if ((flags & NHR_NODEFAULT) == 0) + return (1); + else if 
((nh->nh_flags & NHF_DEFAULT) == 0) + return (1); + } + + return (0); +} + +#ifdef RADIX_MPATH +inline static int +check_urpf_mpath(struct rtentry *rt, uint32_t flags, + const struct ifnet *src_if) +{ + + while (rt != NULL) { + if (check_urpf(rt->rt_nhop, flags, src_if) != 0) + return (1); + rt = rt_mpath_next(rt); + } + + return (0); +} +#endif + +/* + * Performs reverse path forwarding lookup. + * If @src_if is non-zero, verifies that at least 1 path goes via + * this interface. + * If @src_if is zero, verifies that a route exists. + * If @flags contains NHR_NODEFAULT, do not consider default route. + * + * Returns 1 if route matching conditions is found, 0 otherwise. + */ +int +fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rh; + struct radix_node *rn; + struct rtentry *rt; + struct sockaddr_in6 sin6 = { .sin6_len = sizeof(sin6) }; /* lookup key, built as in fib6_lookup() */ + int ret; + + KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, AF_INET6); + if (rh == NULL) + return (0); + + sin6.sin6_addr = *dst6; + /* Assume scopeid is valid and embed it directly */ + if (IN6_IS_SCOPE_LINKLOCAL(dst6)) + sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = RNTORT(rn); +#ifdef RADIX_MPATH + ret = check_urpf_mpath(rt, flags, src_if); +#else + ret = check_urpf(rt->rt_nhop, flags, src_if); +#endif + RIB_RUNLOCK(rh); + return (ret); + } + RIB_RUNLOCK(rh); + + return (0); +} + #endif diff --git a/sys/netinet6/in6_fib.h b/sys/netinet6/in6_fib.h index fa07a5ce9a3e..bf8d367309cc 100644 --- a/sys/netinet6/in6_fib.h +++ b/sys/netinet6/in6_fib.h @@ -58,5 +58,11 @@ int fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst, uint32_t scopeid, uint32_t flags, uint32_t flowid, struct nhop6_extended *pnh6); void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6);
+ +struct nhop_object *fib6_lookup(uint32_t fibnum, + const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, + uint32_t flowid); +int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, + uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); #endif diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c index 35756cf95868..7f10b290309b 100644 --- a/sys/netinet6/in6_rmx.c +++ b/sys/netinet6/in6_rmx.c @@ -82,6 +82,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include @@ -103,6 +105,43 @@ extern int in6_inithead(void **head, int off, u_int fibnum); extern int in6_detachhead(void **head, int off); #endif +static int +rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, + struct nhop_object *nh) +{ + uint16_t nh_type; + + /* XXX: RTF_LOCAL */ + + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. + */ + if (nh->nh_mtu == 0) { + nh->nh_mtu = IN6_LINKMTU(nh->nh_ifp); + } else if (nh->nh_mtu > IN6_LINKMTU(nh->nh_ifp)) + nh->nh_mtu = IN6_LINKMTU(nh->nh_ifp); + + /* Ensure that default route nhop has special flag */ + const struct sockaddr_in6 *mask6 = (const struct sockaddr_in6 *)mask; + if ((nhop_get_rtflags(nh) & RTF_HOST) == 0 && + IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr)) + nh->nh_flags |= NHF_DEFAULT; + + /* Set nexthop type */ + if (nhop_get_type(nh) == 0) { + if (nh->nh_flags & NHF_GATEWAY) + nh_type = NH_TYPE_IPV6_ETHER_NHOP; + else + nh_type = NH_TYPE_IPV6_ETHER_RSLV; + + nhop_set_type(nh, nh_type); + } + + return (0); +} + /* * Do what we need to do when inserting a route. 
*/ @@ -169,6 +208,7 @@ in6_inithead(void **head, int off, u_int fibnum) return (0); rh->rnh_addaddr = in6_addroute; + rh->rnh_preadd = rib6_preadd; #ifdef RADIX_MPATH rt_mpath_init_rnh(rh); #endif diff --git a/sys/sys/socket.h b/sys/sys/socket.h index eaad9b1bacdb..1768480cc8c8 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -416,6 +416,7 @@ struct sockproto { #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ +#define NET_RT_NHOP 6 /* dump routing nexthops */ #endif /* __BSD_VISIBLE */ /* diff --git a/usr.bin/netstat/Makefile b/usr.bin/netstat/Makefile index 0e60b0b40359..b61afdc410b0 100644 --- a/usr.bin/netstat/Makefile +++ b/usr.bin/netstat/Makefile @@ -5,7 +5,7 @@ PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \ - unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \ + unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \ nl_defs.h nl_symbols.c: nlist_symbols diff --git a/usr.bin/netstat/common.c b/usr.bin/netstat/common.c new file mode 100644 index 000000000000..ac721b3e9ab0 --- /dev/null +++ b/usr.bin/netstat/common.c @@ -0,0 +1,140 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1983, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "netstat.h"
+#include "common.h"
+
+/* Returns a static string with the b_val letter of each flag set in f. */
+const char *
+fmt_flags(const struct bits *p, int f)
+{
+	static char name[33];
+	char *flags;
+
+	for (flags = name; p->b_mask && flags < &name[sizeof(name) - 1]; p++)
+		if (p->b_mask & f)
+			*flags++ = p->b_val;
+	*flags = '\0';
+	return (name);
+}
+
+/* Emits flag letters via format, then each set flag's name as tag_name. */
+void
+print_flags_generic(int flags, const struct bits *pbits, const char *format,
+    const char *tag_name)
+{
+	const struct bits *p;
+	char tag_fmt[64];
+
+	xo_emit(format, fmt_flags(pbits, flags));
+
+	snprintf(tag_fmt, sizeof(tag_fmt), "{le:%s/%%s}", tag_name);
+	xo_open_list(tag_name);
+	for (p = pbits; p->b_mask; p++)
+		if (p->b_mask & flags)
+			xo_emit(tag_fmt, p->b_name);
+	xo_close_list(tag_name);
+}
+
+/*
+ * Builds an ifindex -> interface name table from getifaddrs(3).
+ * Stores the slot count in *pifmap_size; returns the table (NULL if empty).
+ */
+struct ifmap_entry *
+prepare_ifmap(size_t *pifmap_size)
+{
+	int ifindex = 0, size;
+	struct ifaddrs *ifap, *ifa;
+	struct sockaddr_dl *sdl;
+	struct ifmap_entry *ifmap = NULL;
+	int ifmap_size = 0;
+
+	if (getifaddrs(&ifap) != 0)
+		err(EX_OSERR, "getifaddrs");
+
+	for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
+		/* getifaddrs(3) may return entries with no address. */
+		if (ifa->ifa_addr == NULL ||
+		    ifa->ifa_addr->sa_family != AF_LINK)
+			continue;
+
+		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+		ifindex = sdl->sdl_index;
+
+		if (ifindex >= ifmap_size) {
+			size = roundup(ifindex + 1, 32) *
+			    sizeof(struct ifmap_entry);
+			if ((ifmap = realloc(ifmap, size)) == NULL)
+				errx(2, "realloc(%d) failed", size);
+			memset(&ifmap[ifmap_size], 0,
+			    size - ifmap_size *
+			    sizeof(struct ifmap_entry));
+			ifmap_size = roundup(ifindex + 1, 32);
+		}
+
+		if (*ifmap[ifindex].ifname != '\0')
+			continue;
+
+		strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
+	}
+
+	freeifaddrs(ifap);
+	*pifmap_size = ifmap_size;
+
+	return (ifmap);
+}
+
diff --git a/usr.bin/netstat/common.h b/usr.bin/netstat/common.h
new file mode 100644
index 000000000000..aafa45df8936
--- /dev/null
+++ b/usr.bin/netstat/common.h
@@ -0,0 +1,58 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1992, 1993
+ *	Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)netstat.h 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#ifndef _NETSTAT_COMMON_H_ +#define _NETSTAT_COMMON_H_ + +struct bits { + u_long b_mask; + char b_val; + const char *b_name; +}; +extern struct bits rt_bits[]; + +const char *fmt_flags(const struct bits *p, int f); +void print_flags_generic(int flags, const struct bits *pbits, + const char *format, const char *tag_name); +int print_sockaddr(const char *name, struct sockaddr *sa, + struct sockaddr *mask, int flags, int width); + +struct ifmap_entry { + char ifname[IFNAMSIZ]; +}; + +struct ifmap_entry *prepare_ifmap(size_t *ifmap_size); + +#endif + diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c index 03dceab993cf..329c551cfc5d 100644 --- a/usr.bin/netstat/main.c +++ b/usr.bin/netstat/main.c @@ -214,6 +214,7 @@ int mflag; /* show memory stats */ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ +int oflag; /* show nexthop 
objects*/ int Pflag; /* show TCP log ID */ static int pflag; /* show given protocol */ static int Qflag; /* show netisr information */ @@ -248,7 +249,7 @@ main(int argc, char *argv[]) if (argc < 0) exit(EXIT_FAILURE); - while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz")) + while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz")) != -1) switch(ch) { case '4': @@ -345,6 +346,9 @@ main(int argc, char *argv[]) case 'n': numeric_addr = numeric_port = 1; break; + case 'o': + oflag = 1; + break; case 'P': Pflag = 1; break; @@ -494,6 +498,14 @@ main(int argc, char *argv[]) xo_finish(); exit(0); } + if (oflag) { + xo_open_container("statistics"); + nhops_print(fib, af); + xo_close_container("statistics"); + xo_finish(); + exit(0); + } + if (gflag) { xo_open_container("statistics"); diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h index 5f35ff097851..713608431a12 100644 --- a/usr.bin/netstat/netstat.h +++ b/usr.bin/netstat/netstat.h @@ -147,6 +147,10 @@ void rt_stats(void); char *routename(struct sockaddr *, int); const char *netname(struct sockaddr *, struct sockaddr *); void routepr(int, int); +int p_sockaddr(const char *name, struct sockaddr *sa, + struct sockaddr *mask, int flags, int width); +const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, + int flags); #ifdef NETGRAPH void netgraphprotopr(u_long, const char *, int, int); @@ -157,3 +161,4 @@ void unixpr(u_long, u_long, u_long, u_long, u_long, bool *); void mroutepr(void); void mrt_stats(void); void bpf_stats(char *); +void nhops_print(int fibnum, int af); diff --git a/usr.bin/netstat/nhops.c b/usr.bin/netstat/nhops.c new file mode 100644 index 000000000000..d62eb7290f5c --- /dev/null +++ b/usr.bin/netstat/nhops.c @@ -0,0 +1,472 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1983, 1988, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "netstat.h"
+#include "common.h"
+
+/* column widths; each followed by one space */
+#ifndef INET6
+#define	WID_DST_DEFAULT(af)	18	/* width of destination column */
+#define	WID_GW_DEFAULT(af)	18	/* width of gateway column */
+#define	WID_IF_DEFAULT(af)	(Wflag ? 10 : 8) /* width of netif column */
+#else
+#define	WID_DST_DEFAULT(af) \
+	((af) == AF_INET6 ? (numeric_addr ? 33: 18) : 18)
+#define	WID_GW_DEFAULT(af) \
+	((af) == AF_INET6 ? (numeric_addr ? 29 : 18) : 18)
+#define	WID_IF_DEFAULT(af)	((af) == AF_INET6 ? 8 : (Wflag ? 10 : 8))
+#endif /*INET6*/
+static int wid_dst;
+static int wid_gw;
+static int wid_flags;
+static int wid_pksent;
+static int wid_mtu;
+static int wid_if;
+static int wid_nhidx;
+static int wid_nhtype;
+static int wid_refcnt;
+static int wid_prepend;
+
+static struct bits nh_bits[] = {
+	{ NHF_REJECT,	'R', "reject" },
+	{ NHF_BLACKHOLE,'B', "blackhole" },
+	{ NHF_REDIRECT,	'r', "redirect" },
+	{ NHF_GATEWAY,	'G', "gateway" },
+	{ NHF_DEFAULT,	'd', "default" },
+	{ NHF_BROADCAST,'b', "broadcast" },
+	{ 0 , 0, NULL }
+};
+
+static const char *nh_types[] = {
+	"empty",		/* 0 */
+	"v4/resolve",		/* 1 */
+	"v4/gw",
+	"v6/resolve",
+	"v6/gw"
+};
+
+struct nhop_entry {
+	char	gw[64];
+	char	ifname[IFNAMSIZ];
+};
+
+struct nhop_map {
+	struct nhop_entry	*ptr;
+	size_t			size;
+};
+static struct nhop_map global_nhop_map;
+
+static void nhop_map_update(struct nhop_map *map, uint32_t idx,
+    char *gw, char *ifname);
+static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
+
+
+static struct ifmap_entry *ifmap;
+static size_t ifmap_size;
+
+static void
+print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr *sa)
+{
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		inet_ntop(AF_INET, &((struct sockaddr_in *)sa)->sin_addr,
+		    buf, bufsize);
+		break;
+	case AF_INET6:
+		inet_ntop(AF_INET6, &((struct sockaddr_in6 *)sa)->sin6_addr,
+		    buf, bufsize);
+		break;
+	default:
+		snprintf(buf, bufsize, "unknown:%d", sa->sa_family);
+		break;
+	}
+}
+
+static int
+print_addr(const char *name, const char *addr, int width)
+{
+	char buf[128];
+	int protrusion;
+
+	if (width < 0) {
+		snprintf(buf, sizeof(buf), "{:%s/%%s} ", name);
+		xo_emit(buf, addr);
+		protrusion = 0;
+	} else {
+		if (Wflag != 0 || numeric_addr) {
+			snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%s}{]:} ",
+			    -width, name);
+			xo_emit(buf, addr);
+			protrusion = strlen(addr) - width;
+			if (protrusion < 0)
+				protrusion = 0;
+		} else {
+			snprintf(buf, sizeof(buf), "{[:%d}{:%s/%%-.*s}{]:} ",
+			    -width, name);
+			xo_emit(buf, width, addr);
+			protrusion = 0;
+		}
+	}
+	return (protrusion);
+}
+
+/* Prints the column header; the wider column set is used when Wflag is set. */
+static void
+print_nhop_header(int af1 __unused)
+{
+
+	if (Wflag) {
+		xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+		    "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n",
+			wid_nhidx,	wid_nhidx,	"Idx",
+			wid_nhtype,	wid_nhtype,	"Type",
+			wid_dst,	wid_dst,	"IFA",
+			wid_gw,		wid_gw,		"Gateway",
+			wid_flags,	wid_flags,	"Flags",
+			wid_pksent,	wid_pksent,	"Use",
+			wid_mtu,	wid_mtu,	"Mtu",
+			wid_if,		wid_if,		"Netif",
+			wid_if,		wid_if,		"Addrif",
+			wid_refcnt,	wid_refcnt,	"Refcnt",
+			wid_prepend,	"Prepend");
+	} else {
+		xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+		    " {T:/%*s}\n",
+			wid_nhidx,	wid_nhidx,	"Idx",
+			wid_dst,	wid_dst,	"IFA",
+			wid_gw,		wid_gw,		"Gateway",
+			wid_flags,	wid_flags,	"Flags",
+			wid_if,		wid_if,		"Netif",
+			wid_refcnt,	"Refcnt");
+	}
+}
+
+/* Stores gw/ifname for nexthop idx, growing the zero-filled map as needed. */
+static void
+nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
+{
+	if (idx >= map->size) {
+		uint32_t new_size;
+		size_t sz;
+		if (map->size == 0)
+			new_size = 32;
+		else
+			new_size = map->size * 2;
+		if (new_size <= idx)
+			new_size = roundup(idx + 1, 32);
+
+		sz = new_size * (sizeof(struct nhop_entry));
+		if ((map->ptr = realloc(map->ptr, sz)) == NULL)
+			errx(2, "realloc(%zu) failed", sz);
+		memset(&map->ptr[map->size], 0, (new_size - map->size) * sizeof(struct nhop_entry));
+		map->size = new_size;
+	}
+
+	strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname));
+	strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw));
+}
+
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+
+	if (idx >= map->size)
+		return (NULL);
+	if (*map->ptr[idx].ifname == '\0')
+		return (NULL);
+	return &map->ptr[idx];
+}
+
+/* Prints one nexthop row and records its gateway/interface in the nhop map. */
+static void
+print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh)
+{
+	char buffer[128];
+	char iface_name[128];
+	int protrusion;
+	char gw_addr[64];
+	struct nhop_addrs *na;
+	struct sockaddr *sa_gw, *sa_ifa;
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx);
+	xo_emit(buffer, nh->nh_idx);
+
+	if (Wflag) {
+		const char *cp = (size_t)nh->nh_type < nitems(nh_types) ?
+		    nh_types[nh->nh_type] : "unknown";
+		xo_emit("{t:type_str/%*s} ", wid_nhtype, cp);
+	}
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+
+	na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+	sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
+	sa_ifa = (struct sockaddr *)((char *)na + na->src_sa_off);
+	protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst);
+
+	if (nh->nh_flags & NHF_GATEWAY) {
+		const char *cp;
+		cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+		strlcpy(gw_addr, cp, sizeof(gw_addr));
+	} else
+		snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+	protrusion = print_addr("gateway", gw_addr, wid_gw - protrusion);
+
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ",
+	    wid_flags - protrusion);
+
+	/* Shows the route flags from rtm; nh->nh_flags is not printed here. */
+	print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty");
+
+	if (Wflag) {
+		/* NOTE(review): %lu assumes unsigned long fields - confirm. */
+		xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent);
+		xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu);
+	}
+
+	if (Wflag)
+		xo_emit("{t:interface-name/%*s}", wid_if, iface_name);
+	else
+		xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name);
+
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) {
+		strlcpy(iface_name, ifmap[nh->aifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+	if (Wflag)
+		xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name);
+
+	xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount);
+	if (Wflag && nh->prepend_len) {
+		/* NOTE(review): fixed text, not the real prepend - confirm. */
+		const char *prepend_hex = "AABBCCDDEE";
+		xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex);
+	}
+
+	xo_emit("\n");
+	xo_close_instance(name);
+}
+
+struct nhops_map {
+	uint32_t		idx;
+	struct rt_msghdr	*rtm;
+};
+
+static int
+cmp_nh_idx(const void *_a, const void *_b)
+{
+	const struct nhops_map *a, *b;
+
+	a = _a;
+	b = _b;
+
+	if (a->idx > b->idx)
+		return (1);
+	else if (a->idx < b->idx)
+		return (-1);
+	return (0);
+}
+
+/* Fetches the nexthop dump for fibnum/af via sysctl and prints it by index. */
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhop_external *nh;
+	int fam;
+	struct nhops_map *nh_map;
+	size_t nh_count, nh_size;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHOP;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af,
+		    fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
+	lim = buf + needed;
+	xo_open_container("nhop-table");
+	xo_open_list("rt-family");
+
+	/* Nexthops arrive unsorted: collect them all, sort by index, print. */
+	nh_count = 0;
+	nh_size = 16;
+	if ((nh_map = calloc(nh_size, sizeof(struct nhops_map))) == NULL)
+		errx(2, "calloc failed");
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+		if (nh_count >= nh_size) {
+			nh_size *= 2;
+			if ((nh_map = realloc(nh_map,
+			    nh_size * sizeof(struct nhops_map))) == NULL)
+				errx(2, "realloc failed");
+		}
+
+		nh = (struct nhop_external *)(rtm + 1);
+		nh_map[nh_count].idx = nh->nh_idx;
+		nh_map[nh_count].rtm = rtm;
+		nh_count++;
+	}
+
+	if (nh_count > 0) {
+		qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
+		nh = (struct nhop_external *)(nh_map[0].rtm + 1);
+		fam = nh->nh_family;
+
+		wid_dst = WID_GW_DEFAULT(fam);
+		wid_gw = WID_GW_DEFAULT(fam);
+		wid_nhidx = 5;
+		wid_nhtype = 12;
+		wid_refcnt = 6;
+		wid_flags = 6;
+		wid_pksent = 8;
+		wid_mtu = 6;
+		wid_if = WID_IF_DEFAULT(fam);
+		xo_open_instance("rt-family");
+		pr_family(fam);
+		xo_open_list("nh-entry");
+
+		print_nhop_header(fam);
+		for (size_t i = 0; i < nh_count; i++) {
+			rtm = nh_map[i].rtm;
+			nh = (struct nhop_external *)(rtm + 1);
+			print_nhop_entry_sysctl("nh-entry", rtm, nh);
+		}
+
+		xo_close_list("nh-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhop-table");
+	free(nh_map);
+	free(buf);
+}
+
+static void
+p_nhflags(int f, const char *format)
+{
+	struct bits *p;
+	const char *pretty_name = "nh_flags_pretty";
+
+	xo_emit(format, fmt_flags(nh_bits, f));
+
+	xo_open_list(pretty_name);
+	for (p = nh_bits; p->b_mask; p++)
+		if (p->b_mask & f)
+			xo_emit("{le:nh_flags_pretty/%s}", p->b_name);
+	xo_close_list(pretty_name);
+}
+
+void
+nhops_print(int fibnum, int af)
+{
+	size_t intsize;
+	int numfibs;
+
+	intsize = sizeof(int);
+	if (fibnum == -1 &&
+	    sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1)
+		fibnum = 0;
+	if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+		numfibs = 1;
+	if (fibnum < 0 || fibnum > numfibs - 1)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	ifmap = prepare_ifmap(&ifmap_size);
+
+	xo_open_container("route-nhop-information");
+	xo_emit("{T:Nexthop data}");
+	if (fibnum)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhops_sysctl(fibnum, af);
+	xo_close_container("route-nhop-information");
+}
+
diff --git a/usr.bin/netstat/route.c b/usr.bin/netstat/route.c
index e15cf1578029..ba47a4b56ac5 100644
--- a/usr.bin/netstat/route.c
+++ b/usr.bin/netstat/route.c
@@ -69,16 +69,13 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include "netstat.h"
+#include "common.h"
 #include "nl_defs.h"
 
 /*
  * Definitions for showing gateway flags.
*/ -static struct bits { - u_long b_mask; - char b_val; - const char *b_name; -} bits[] = { +struct bits rt_bits[] = { { RTF_UP, 'U', "up" }, { RTF_GATEWAY, 'G', "gateway" }, { RTF_HOST, 'H', "host" }, @@ -99,11 +96,8 @@ static struct bits { { 0 , 0, NULL } }; -struct ifmap_entry { - char ifname[IFNAMSIZ]; -}; static struct ifmap_entry *ifmap; -static int ifmap_size; +static size_t ifmap_size; static struct timespec uptime; static const char *netname4(in_addr_t, in_addr_t); @@ -112,12 +106,7 @@ static const char *netname6(struct sockaddr_in6 *, struct sockaddr_in6 *); #endif static void p_rtable_sysctl(int, int); static void p_rtentry_sysctl(const char *name, struct rt_msghdr *); -static int p_sockaddr(const char *name, struct sockaddr *, struct sockaddr *, - int, int); -static const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, - int flags); static void p_flags(int, const char *); -static const char *fmt_flags(int f); static void domask(char *, size_t, u_long); @@ -229,7 +218,7 @@ pr_rthdr(int af1 __unused) wid_dst, wid_dst, "Destination", wid_gw, wid_gw, "Gateway", wid_flags, wid_flags, "Flags", - wid_pksent, wid_pksent, "Use", + wid_mtu, wid_mtu, "Nhop#", wid_mtu, wid_mtu, "Mtu", wid_if, wid_if, "Netif", wid_expire, "Expire"); @@ -252,46 +241,10 @@ p_rtable_sysctl(int fibnum, int af) char *buf, *next, *lim; struct rt_msghdr *rtm; struct sockaddr *sa; - int fam = AF_UNSPEC, ifindex = 0, size; + int fam = AF_UNSPEC; int need_table_close = false; - struct ifaddrs *ifap, *ifa; - struct sockaddr_dl *sdl; - - /* - * Retrieve interface list at first - * since we need #ifindex -> if_xname match - */ - if (getifaddrs(&ifap) != 0) - err(EX_OSERR, "getifaddrs"); - - for (ifa = ifap; ifa; ifa = ifa->ifa_next) { - - if (ifa->ifa_addr->sa_family != AF_LINK) - continue; - - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - ifindex = sdl->sdl_index; - - if (ifindex >= ifmap_size) { - size = roundup(ifindex + 1, 32) * - sizeof(struct ifmap_entry); - if ((ifmap = 
realloc(ifmap, size)) == NULL) - errx(2, "realloc(%d) failed", size); - memset(&ifmap[ifmap_size], 0, - size - ifmap_size * - sizeof(struct ifmap_entry)); - - ifmap_size = roundup(ifindex + 1, 32); - } - - if (*ifmap[ifindex].ifname != '\0') - continue; - - strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ); - } - - freeifaddrs(ifap); + ifmap = prepare_ifmap(&ifmap_size); mib[0] = CTL_NET; mib[1] = PF_ROUTE; @@ -377,7 +330,8 @@ p_rtentry_sysctl(const char *name, struct rt_msghdr *rtm) wid_flags - protrusion); p_flags(rtm->rtm_flags, buffer); if (Wflag) { - xo_emit("{t:use/%*lu} ", wid_pksent, rtm->rtm_rmx.rmx_pksent); + /* XXX: use=0? */ + xo_emit("{t:nhop/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_nhidx); if (rtm->rtm_rmx.rmx_mtu != 0) xo_emit("{t:mtu/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_mtu); @@ -410,7 +364,7 @@ p_rtentry_sysctl(const char *name, struct rt_msghdr *rtm) xo_close_instance(name); } -static int +int p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask, int flags, int width) { @@ -442,7 +396,7 @@ p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask, return (protrusion); } -static const char * +const char * fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags) { static char buf[128]; @@ -519,30 +473,10 @@ fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags) static void p_flags(int f, const char *format) { - struct bits *p; - xo_emit(format, fmt_flags(f)); - - xo_open_list("flags_pretty"); - for (p = bits; p->b_mask; p++) - if (p->b_mask & f) - xo_emit("{le:flags_pretty/%s}", p->b_name); - xo_close_list("flags_pretty"); + print_flags_generic(f, rt_bits, format, "flags_pretty"); } -static const char * -fmt_flags(int f) -{ - static char name[33]; - char *flags; - struct bits *p = bits; - - for (flags = name; p->b_mask; p++) - if (p->b_mask & f) - *flags++ = p->b_val; - *flags = '\0'; - return (name); -} char * routename(struct sockaddr *sa, int flags)