From 1a7f90755fb06c8dc4e880e0706c5b05a6c47c00 Mon Sep 17 00:00:00 2001 From: melifaro Date: Sat, 3 Oct 2020 10:47:17 +0000 Subject: [PATCH] Introduce scalable route multipath. This change is based on the nexthop objects landed in D24232. The change introduces the concept of nexthop groups. Each group contains the collection of nexthops with their relative weights and a dataplane-optimized structure to enable efficient nexthop selection. Similar to the nexthops, nexthop groups are immutable. Dataplane part gets compiled during group creation and is basically an array of nexthop pointers, compiled w.r.t their weights. With this change, `rt_nhop` field of `struct rtentry` contains either nexthop or nexthop group. They are distinguished by the presence of NHF_MULTIPATH flag. All dataplane lookup functions return a pointer to the nexthop object, leaving nexthop group details inside the routing subsystem. User-visible changes: The change is intended to be backward-compatible: all non-mpath operations should work as before with ROUTE_MPATH and net.route.multipath=1. All routes now come with a weight; the default weight is 1, the maximum is 2^24-1. Current maximum multipath group width is statically set to 64. This will become sysctl-tunable in the followup changes. Using functionality: * Recompile kernel with ROUTE_MPATH * set net.route.multipath to 1 route add -6 2001:db8::/32 2001:db8::2 -weight 10 route add -6 2001:db8::/32 2001:db8::3 -weight 20 netstat -6On Nexthop groups data Internet6: GrpIdx NhIdx Weight Slots Gateway Netif Refcnt 1 ------- ------- ------- --------------------------------------- --------- 1 13 10 1 2001:db8::2 vlan2 14 20 2 2001:db8::3 vlan2 Next steps: * Land outbound hashing for locally-originated routes ( D26523 ). 
* Fix net/bird multipath (net/frr seems to work fine) * Add ROUTE_MPATH to GENERIC * Set net.route.multipath=1 by default Tested by: olivier Reviewed by: glebius Relnotes: yes Differential Revision: https://reviews.freebsd.org/D26449 --- sys/conf/NOTES | 4 +- sys/conf/files | 4 +- sys/conf/options | 1 + sys/net/radix.c | 4 - sys/net/route.c | 1 - sys/net/route.h | 5 + sys/net/route/mpath_ctl.c | 165 +++++++ sys/net/route/nhgrp.c | 344 +++++++++++++++ sys/net/route/nhgrp_ctl.c | 788 ++++++++++++++++++++++++++++++++++ sys/net/route/nhgrp_var.h | 72 ++++ sys/net/route/nhop.c | 8 +- sys/net/route/nhop.h | 37 +- sys/net/route/nhop_ctl.c | 7 + sys/net/route/nhop_var.h | 11 +- sys/net/route/route_ctl.c | 275 ++++++++---- sys/net/route/route_ctl.h | 18 + sys/net/route/route_helpers.c | 164 +++++++ sys/net/route/route_var.h | 80 +++- sys/net/rtsock.c | 111 ++++- sys/netinet/in.c | 10 - sys/netinet/in_fib.c | 59 +-- sys/netinet/in_rmx.c | 5 - sys/netinet/ip_output.c | 5 - sys/netinet6/in6_fib.c | 55 +-- sys/netinet6/in6_rmx.c | 5 - sys/netinet6/nd6.c | 5 + sys/sys/socket.h | 1 + usr.bin/netstat/Makefile | 2 +- usr.bin/netstat/common.h | 17 + usr.bin/netstat/main.c | 14 +- usr.bin/netstat/netstat.h | 1 + usr.bin/netstat/nhgrp.c | 355 +++++++++++++++ usr.bin/netstat/nhops.c | 44 +- 33 files changed, 2433 insertions(+), 244 deletions(-) create mode 100644 sys/net/route/mpath_ctl.c create mode 100644 sys/net/route/nhgrp.c create mode 100644 sys/net/route/nhgrp_ctl.c create mode 100644 sys/net/route/nhgrp_var.h create mode 100644 usr.bin/netstat/nhgrp.c diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 7aa957efa271..0d9fac844365 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1002,7 +1002,7 @@ device lagg # # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack. # -# RADIX_MPATH provides support for equal-cost multi-path routing. +# ROUTE_MPATH provides support for multipath routing. 
# options MROUTING # Multicast routing options IPFIREWALL #firewall @@ -1023,7 +1023,7 @@ options TCPDEBUG options TCPPCAP options TCP_BLACKBOX options TCP_HHOOK -options RADIX_MPATH +options ROUTE_MPATH # The MBUF_STRESS_TEST option enables options which create # various random failures / extreme cases related to mbuf diff --git a/sys/conf/files b/sys/conf/files index e3c142441653..8ec5eacd053e 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4143,10 +4143,12 @@ net/debugnet.c optional inet debugnet net/debugnet_inet.c optional inet debugnet net/pfil.c optional ether | inet net/radix.c standard -net/radix_mpath.c standard net/raw_cb.c standard net/raw_usrreq.c standard net/route.c standard +net/route/mpath_ctl.c optional route_mpath +net/route/nhgrp.c optional route_mpath +net/route/nhgrp_ctl.c optional route_mpath net/route/nhop.c standard net/route/nhop_ctl.c standard net/route/nhop_utils.c standard diff --git a/sys/conf/options b/sys/conf/options index e22197093f58..e68621d61a37 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -454,6 +454,7 @@ NFSLOCKD PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h +ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h diff --git a/sys/net/radix.c b/sys/net/radix.c index 3d9ed0a69538..f65153393d74 100644 --- a/sys/net/radix.c +++ b/sys/net/radix.c @@ -44,10 +44,6 @@ #include #include #include -#include "opt_mpath.h" -#ifdef RADIX_MPATH -#include -#endif #else /* !_KERNEL */ #include #include diff --git a/sys/net/route.c b/sys/net/route.c index d19a4cfc0afe..dac3211bc1f5 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -39,7 +39,6 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" -#include "opt_mpath.h" #include "opt_route.h" #include diff --git a/sys/net/route.h b/sys/net/route.h index 19c9ce0eb51b..021b4621692b 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -178,6 +178,7 @@ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* 
Announce interfaces to all fibs */ */ /* Consumer-visible nexthop info flags */ +#define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ @@ -208,6 +209,10 @@ struct rtstat { uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ + uint64_t rts_add_failure; /* # of route addition failures */ + uint64_t rts_add_retry; /* # of route addition retries */ + uint64_t rts_del_failure; /* # of route deletion failure */ + uint64_t rts_del_retry; /* # of route deletion retries */ }; /* diff --git a/sys/net/route/mpath_ctl.c b/sys/net/route/mpath_ctl.c new file mode 100644 index 000000000000..1ac7c191ed05 --- /dev/null +++ b/sys/net/route/mpath_ctl.c @@ -0,0 +1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * This file contains the supporting functions for adding/deleting/updating + * multipath routes to the routing table. + */ + +SYSCTL_DECL(_net_route); + +/* + * Tries to add @rnd_add nhop to the existing set of nhops (@rnd_orig) for the + * prefix specified by @rt. + * + * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated + * with the operation result. + * Otherwise errno is returned. + * + * Caller responsibility is to unlock/free rt and + * rt->rt_nhop. + */ +int +add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rt, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + RIB_RLOCK_TRACKER; + struct route_nhop_data rnd_new; + int error = 0; + + /* + * It is possible that multiple rtsock speakers will try to update + * the same route simultaneously. Reduce the chance of failing the + * request by retrying the cycle multiple times. 
+ */ + for (int i = 0; i < RIB_MAX_RETRIES; i++) { + error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, + &rnd_new); + if (error != 0) { + if (error != EAGAIN) + break; + + /* + * Group creation failed, most probably because + * @rnd_orig data got scheduled for deletion. + * Refresh @rnd_orig data and retry. + */ + RIB_RLOCK(rnh); + lookup_prefix(rnh, info, rnd_orig); + RIB_RUNLOCK(rnh); + continue; + } + + error = change_route_conditional(rnh, rt, info, rnd_orig, + &rnd_new, rc); + /* Only EAGAIN is worth another attempt; anything else is final */ + if (error != EAGAIN) + break; + RTSTAT_INC(rts_add_retry); + } + + return (error); +} + +/* + * Context handed to gw_filter_func() through its opaque argument: + * the request and the route entry being matched. + */ +struct rt_match_info { + struct rt_addrinfo *info; + struct rtentry *rt; +}; + +/* + * Filter callback for nhgrp_get_filtered_group(): returns true when + * check_info_match_nhop() returns 0 for @nh and the request in @_data. + */ +static bool +gw_filter_func(const struct nhop_object *nh, void *_data) +{ + struct rt_match_info *ri = (struct rt_match_info *)_data; + + return (check_info_match_nhop(ri->info, ri->rt, nh) == 0); +} + +/* + * Tries to delete matching paths from @nhg. + * Returns 0 on success and updates operation result in @rc. + */ +int +del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info, + struct rtentry *rt, struct nhgrp_object *nhg, + struct rib_cmd_info *rc) +{ + struct route_nhop_data rnd; + struct rt_match_info ri = { .info = info, .rt = rt }; + int error; + + RIB_WLOCK_ASSERT(rh); + + /* + * Require gateway to delete multipath routes, to forbid + * deleting all paths at once. + * If the filter function is provided, skip gateway check to + * allow rib_walk_del() delete routes for any criteria based + * on provided callback. 
+ */ + if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL)) + return (ESRCH); + + /* + * Build the replacement nexthop data with every path accepted by + * gw_filter_func() left out, then pass it to change_route_nhop(). + */ + error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri, + &rnd); + if (error == 0) + error = change_route_nhop(rh, rt, info, &rnd, rc); + return (error); +} + diff --git a/sys/net/route/nhgrp.c b/sys/net/route/nhgrp.c new file mode 100644 index 000000000000..c25f4f09865b --- /dev/null +++ b/sys/net/route/nhgrp.c @@ -0,0 +1,344 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * This file contains data structures management logic for the nexthop + * groups ("nhgrp") route subsystem. + * + * Nexthop groups are used to store multiple routes available for the specific + * prefix. Nexthop groups are immutable and can be shared across multiple + * prefixes. + * + * Each group consists of a control plane part and a dataplane part. + * Control plane is basically a collection of nexthop objects with + * weights and refcount. + * + * Datapath consists of an array of nexthop pointers, compiled from control + * plane data to support O(1) nexthop selection. + * + * For example, consider the following group: + * [(nh1, weight=100), (nh2, weight=200)] + * It will compile to the following array: + * [nh1, nh2, nh2] + * + */ + +static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, + uint32_t new_idx_items); + +static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b); +static unsigned int hash_nhgrp(const struct nhgrp_priv *obj); + +/* + * Hashes @len bytes starting at @h: for every byte the running value + * is multiplied by 33 and XOR-ed with that byte, starting from 0. + */ +static unsigned +djb_hash(const unsigned char *h, const int len) +{ + unsigned int result = 0; + int i; + + for (i = 0; i < len; i++) + result = 33 * result ^ h[i]; + + return (result); +} + +/* + * Compare callback: returns non-zero when @a and @b carry the same + * number of nexthops and byte-identical weightened_nhop arrays, + * 0 otherwise. 
+ */ +static int +cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b) +{ + + /* + * In case of consistent hashing, there can be multiple nexthop groups + * with the same "control plane" list of nexthops with weights and a + * different set of "data plane" nexthops. + * For now, ignore the data plane and focus on the control plane list. 
+ */ + if (a->nhg_nh_count != b->nhg_nh_count) + return (0); + return !memcmp(a->nhg_nh_weights, b->nhg_nh_weights, + sizeof(struct weightened_nhop) * a->nhg_nh_count); +} + +/* + * Hash callback: calculate hash of an object. + * Only the weightened_nhop array (the control plane list) is hashed. + */ +static unsigned int +hash_nhgrp(const struct nhgrp_priv *obj) +{ + const unsigned char *key; + + key = (const unsigned char *)obj->nhg_nh_weights; + + return (djb_hash(key, sizeof(struct weightened_nhop) * obj->nhg_nh_count)); +} + +/* + * Looks up a group equal to @key in the group hash under the read lock. + * + * Returns the group with an additional reference and with the lock + * released, or NULL if there is no such group or its refcount has + * already dropped to 0. + */ +struct nhgrp_priv * +find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key) +{ + struct nhgrp_priv *priv_ret; + + NHOPS_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret); + if (priv_ret != NULL) { + if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) { + /* refcount is 0 -> group is being deleted */ + priv_ret = NULL; + } + } + NHOPS_RUNLOCK(ctl); + + return (priv_ret); +} + +/* + * Allocates a group index for @grp_priv and inserts the group into the + * hash of @ctl. + * + * Returns 1 on success and 0 if no index could be allocated. + * In both cases consider_resize() is called after the lock is dropped, + * with the resize sizes sampled before the allocation attempt. 
+ */ +int +link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv) +{ + uint16_t idx; + uint32_t new_num_buckets, new_num_items; + + NHOPS_WLOCK(ctl); + /* Check if we need to resize hash and index */ + new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head); + new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head); + + if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) { + NHOPS_WUNLOCK(ctl); + DPRINTF("Unable to allocate mpath index"); + consider_resize(ctl, new_num_buckets, new_num_items); + return (0); + } + + grp_priv->nhg_idx = idx; + grp_priv->nh_control = ctl; + CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv); + + NHOPS_WUNLOCK(ctl); + + consider_resize(ctl, new_num_buckets, new_num_items); + + return (1); +} + +/* + * Removes the group matching @key from the hash of @ctl, releases its + * index and clears its nhg_idx / nh_control fields. + * + * Returns the unlinked group or NULL if it was not found in the hash. + */ +struct nhgrp_priv * +unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key) +{ + struct nhgrp_priv *nhg_priv_ret; + int ret, idx; + + NHOPS_WLOCK(ctl); + + CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret); + + if (nhg_priv_ret == NULL) { + DPRINTF("Unable to find nhop group!"); + 
NHOPS_WUNLOCK(ctl); + return (NULL); + } + + idx = nhg_priv_ret->nhg_idx; + ret = bitmask_free_idx(&ctl->gr_idx_head, idx); + nhg_priv_ret->nhg_idx = 0; + nhg_priv_ret->nh_control = NULL; + + NHOPS_WUNLOCK(ctl); + + return (nhg_priv_ret); +} + +/* + * Checks if the group hash and/or the group index bitmask need resizing + * and performs the resize if necessary. + * + * @new_nh_buckets and @new_idx_items are the requested new sizes; 0 means + * the corresponding structure does not have to grow. New storage is + * allocated with M_NOWAIT before the nhops write lock is taken and is + * installed under that lock. Allocation failures are not reported: the + * resize is simply skipped. + */ +__noinline static void +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items) +{ + void *nh_ptr, *nh_idx_ptr; + void *old_idx_ptr; + size_t alloc_size; + + nh_ptr = NULL ; + if (new_nh_buckets != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets); + nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + nh_idx_ptr = NULL; + if (new_idx_items != 0) { + alloc_size = bitmask_get_size(new_idx_items); + nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO); + } + + if (nh_ptr == NULL && nh_idx_ptr == NULL) { + /* Either resize is not required or allocations have failed. */ + return; + } + + DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", + nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items); + + old_idx_ptr = NULL; + + NHOPS_WLOCK(ctl); + if (nh_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets); + } + if (nh_idx_ptr != NULL) { + /* + * Group indexes are allocated from and released to gr_idx_head + * (see link_nhgrp() / unlink_nhgrp()), so this is the bitmask + * that has to receive the new storage, not the nexthop one. + * NOTE(review): when bitmask_copy() declines the copy, + * nh_idx_ptr is not released on this path - confirm. 
+ */ + if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items)) + bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr); + } + NHOPS_WUNLOCK(ctl); + + if (nh_ptr != NULL) + free(nh_ptr, M_NHOP); + if (old_idx_ptr != NULL) + free(old_idx_ptr, M_NHOP); +} + +/* + * Function allocating the necessary group data structures. 
+ * + * Only the M_NOWAIT / M_WAITOK bits of @malloc_flags are honoured; M_ZERO + * is always added. + * Returns false if either allocation fails and true otherwise, including + * the case when another thread has initialized the structures first. + */ +bool +nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags) +{ + size_t alloc_size; + uint32_t num_buckets, num_items; + void *cht_ptr, *mask_ptr; + + malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO; + + num_buckets = 8; + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags); + + if (cht_ptr == NULL) { + DPRINTF("mpath init failed"); + return (false); + } + + /* + * Allocate nexthop group index bitmask. + */ + num_items = 128; + mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags); + if (mask_ptr == NULL) { + DPRINTF("mpath bitmask init failed"); + free(cht_ptr, M_NHOP); + return (false); + } + + NHOPS_WLOCK(ctl); + + if (ctl->gr_head.hash_size == 0) { + /* Init hash and bitmask */ + CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets); + bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items); + NHOPS_WUNLOCK(ctl); + } else { + /* Other thread has already initialized hash/bitmask */ + NHOPS_WUNLOCK(ctl); + free(cht_ptr, M_NHOP); + free(mask_ptr, M_NHOP); + } + + DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum, + ctl->rh->rib_family); + + return (true); +} + +int +nhgrp_ctl_init(struct nh_control *ctl) +{ + + /* + * By default, do not allocate datastructures as multipath + * routes will not be necessarily used. 
+ */ + CHT_SLIST_INIT(&ctl->gr_head, NULL, 0); + bitmask_init(&ctl->gr_idx_head, NULL, 0); + return (0); +} + +/* + * Releases the group hash and group index bitmask storage of @ctl, + * if any was allocated. + */ +void +nhgrp_ctl_free(struct nh_control *ctl) +{ + + if (ctl->gr_head.ptr != NULL) + free(ctl->gr_head.ptr, M_NHOP); + if (ctl->gr_idx_head.idx != NULL) + free(ctl->gr_idx_head.idx, M_NHOP); +} + +/* + * Drops one nhg_linked count on every group still present in the hash + * of @ctl. The nhops write lock must be held by the caller. + */ +void +nhgrp_ctl_unlink_all(struct nh_control *ctl) +{ + struct nhgrp_priv *nhg_priv; + + NHOPS_WLOCK_ASSERT(ctl); + + CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) { + DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx); + refcount_release(&nhg_priv->nhg_linked); + } CHT_SLIST_FOREACH_END; +} + diff --git a/sys/net/route/nhgrp_ctl.c b/sys/net/route/nhgrp_ctl.c new file mode 100644 index 000000000000..a3a824992e08 --- /dev/null +++ b/sys/net/route/nhgrp_ctl.c @@ -0,0 +1,788 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +#define RTDEBUG +#include "opt_inet.h" +#include "opt_route.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * This file contains the supporting functions for creating multipath groups + * and compiling their dataplane parts. 
+ */ + +/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */ +_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH, + "MPF_MULTIPATH must be the same as NHF_MULTIPATH"); +/* Offset and size of flags field has to be the same for nhop/nhop groups */ +CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags); +/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */ +CTASSERT(RIB_MAX_MPATH_WIDTH <= 64); + +static int wn_cmp(const void *a, const void *b); +static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops); + +static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl, + struct weightened_nhop *wn, int num_nhops, int *perror); +static void destroy_nhgrp(struct nhgrp_priv *nhg_priv); +static void destroy_nhgrp_epoch(epoch_context_t ctx); +static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv); + +/* + * qsort() comparator for struct weightened_nhop: orders by weight + * ascending, ties are broken by the nexthop pointer value. + */ +static int +wn_cmp(const void *a, const void *b) +{ + const struct weightened_nhop *wa = a; + const struct weightened_nhop *wb = b; + + if (wa->weight > wb->weight) + return (1); + else if (wa->weight < wb->weight) + return (-1); + + /* Compare nexthops by pointer */ + if (wa->nh > wb->nh) + return (1); + else if (wa->nh < wb->nh) + return (-1); + else + return (0); +} + +/* + * Perform in-place sorting for array of nexthops in @wn. + * + * To avoid nh groups duplication, nexthops/weights in the + * @wn need to be ordered deterministically. + * As this sorting is needed only for the control plane functionality, + * there are no specific external requirements. + * + * Sort by weight first, to ease calculation of the slot sizes. 
+ */ +static void +sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops) +{ + + qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights in the common use case where weights are "easily" + * comparable. 
+ * Assumes @wn is sorted by weight ascending and each weight is > 0. + * Returns number of slots or 0 if precise calculation failed. + * + * Some examples: + * note: (i, X) pair means (nhop=i, weight=X): + * (1, 1) (2, 2) -> 3 slots [1, 2, 2] + * (1, 100), (2, 200) -> 3 slots [1, 2, 2] + * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3] + */ +static uint32_t +calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t i, last, xmin; + uint64_t total = 0; + + last = 0; + xmin = wn[0].weight; + /* + * Candidate unit: the smallest of the first weight and the non-zero + * differences between consecutive weights. + */ + for (i = 0; i < num_items; i++) { + total += wn[i].weight; + if ((wn[i].weight - last < xmin) && (wn[i].weight != last)) + xmin = wn[i].weight - last; + last = wn[i].weight; + } + /* + * xmin is the minimum unit of desired capacity. + * NOTE(review): a zero wn[0].weight would leave xmin at 0 and divide + * by zero below - confirm callers never pass a zero weight. + */ + if ((total % xmin) != 0) + return (0); + for (i = 0; i < num_items; i++) { + if ((wn[i].weight % xmin) != 0) + return (0); + } + + return ((uint32_t)(total / xmin)); +} + +/* + * Calculate minimum number of slots required to fit the existing + * set of weights while maintaining weight coefficients. + * + * Assume @wn is sorted by weight ascending and each weight is > 0. + * + * Tries to find simple precise solution first and falls back to + * RIB_MAX_MPATH_WIDTH in case of any failure. + */ +static uint32_t +calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items) +{ + uint32_t v; + + v = calc_min_mpath_slots_fast(wn, num_items); + if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH)) + v = RIB_MAX_MPATH_WIDTH; + + return (v); +} + +/* + * Nexthop group data consists of + * 1) dataplane part, with nhgrp_object as a header followed by an + * arbitrary number of nexthop pointers. 
+ * 2) control plane part, with nhgrp_priv as a header, followed by + * an arbitrary number of 'struct weightened_nhop' objects. + * + * Given nexthop groups are (mostly) immutable, allocate all data + * in one go. 
+ * + */ +__noinline static size_t +get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops) +{ + size_t sz; + + sz = sizeof(struct nhgrp_object); + sz += nhg_size * sizeof(struct nhop_object *); + sz += sizeof(struct nhgrp_priv); + sz += num_nhops * sizeof(struct weightened_nhop); + return (sz); +} + +/* + * Compile actual list of nexthops to be used by datapath from + * the weighted nexthop list @x into the @num_slots slots of the + * group object owned by @dst_priv. + * + * For example, compiling control plane list of 2 nexthops + * [(200, A), (100, B)] would result in the datapath array + * [A, A, B] + */ +static void +compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x, + uint32_t num_slots) +{ + struct nhgrp_object *dst; + int i, slot_idx, remaining_slots; + uint64_t remaining_sum, nh_weight, nh_slots; + + slot_idx = 0; + dst = dst_priv->nhg; + /* Calculate sum of all weights */ + remaining_sum = 0; + for (i = 0; i < dst_priv->nhg_nh_count; i++) + remaining_sum += x[i].weight; + remaining_slots = num_slots; + DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots); + for (i = 0; i < dst_priv->nhg_nh_count; i++) { + /* + * Calculate number of slots for the current nexthop: + * its share of the slots still unassigned, proportional to + * its weight within the weights still unprocessed. For the + * last nexthop both are equal, so it takes all leftovers. 
+ */ + if (remaining_sum > 0) { + nh_weight = (uint64_t)x[i].weight; + nh_slots = (nh_weight * remaining_slots / remaining_sum); + } else + nh_slots = 0; + + remaining_sum -= x[i].weight; + remaining_slots -= nh_slots; + + DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i, + (uint32_t)remaining_sum, remaining_slots, + (int)nh_slots, slot_idx); + + KASSERT((slot_idx + nh_slots <= num_slots), + ("index overflow during nhg compilation")); + while (nh_slots-- > 0) + dst->nhops[slot_idx++] = x[i].nh; + } +} + +/* + * Allocates new nexthop group for the list of weightened nexthops. + * Assume sorted list. + * Does NOT reference any nexthops in the group. + * Returns group with refcount=1 or NULL. 
+ */ +static struct nhgrp_priv * +alloc_nhgrp(struct weightened_nhop *wn, int num_nhops) +{ + uint32_t nhgrp_size; + int flags = M_NOWAIT; + struct nhgrp_object *nhg; + struct nhgrp_priv *nhg_priv; + + nhgrp_size = calc_min_mpath_slots(wn, num_nhops); + if (nhgrp_size == 0) { + /* + * Zero weights, abort. + * NOTE(review): calc_min_mpath_slots() falls back to + * RIB_MAX_MPATH_WIDTH instead of returning 0, so this + * branch looks unreachable - confirm. + */ + return (NULL); + } + + /* Dataplane and control plane parts live in a single allocation */ + size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops); + nhg = malloc(sz, M_NHOP, flags | M_ZERO); + if (nhg == NULL) { + return (NULL); + } + + /* Has to be the first to make NHGRP_PRIV() work */ + nhg->nhg_size = nhgrp_size; + DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size); + nhg->nhg_flags = MPF_MULTIPATH; + + nhg_priv = NHGRP_PRIV(nhg); + nhg_priv->nhg_nh_count = num_nhops; + refcount_init(&nhg_priv->nhg_refcount, 1); + + /* Please see nhgrp_free() comments on the initial value */ + refcount_init(&nhg_priv->nhg_linked, 2); + + nhg_priv->nhg = nhg; + memcpy(&nhg_priv->nhg_nh_weights[0], wn, + num_nhops * sizeof(struct weightened_nhop)); + + compile_nhgrp(nhg_priv, wn, nhg->nhg_size); + + return (nhg_priv); +} + +/* + * Drops one reference on @nhg. When the last reference goes away, the + * group is unlinked from its nh_control (if still linked) and its + * destruction is deferred to an epoch callback. + */ +void +nhgrp_free(struct nhgrp_object *nhg) +{ + struct nhgrp_priv *nhg_priv; + struct nh_control *ctl; + struct epoch_tracker et; + + nhg_priv = NHGRP_PRIV(nhg); + + if (!refcount_release(&nhg_priv->nhg_refcount)) + return; + + /* + * Group objects don't have an explicit lock attached to them. + * As groups are reclaimed based on reference count, it is possible + * that some groups will persist after vnet destruction callback + * called. 
Given that, handle scenario with nhgrp_free() being + * called either after or simultaneously with nhgrp_ctl_unlink_all() + * by using another reference counter: nhg_linked. + * + * There are only 2 places, where nhg_linked can be decreased: + * rib destroy (nhgrp_ctl_unlink_all) and this function. + * nhg_linked can never be increased. + * + * Hence, use initial value of 2 to make use of + * refcount_release_if_not_last(). 
+ * + * There can be two scenarios when calling this function: + * + * 1) nhg_linked value is 2. This means that either + * nhgrp_ctl_unlink_all() has not been called OR it is running, + * but we are guaranteed that nh_control won't be freed in + * this epoch. Hence, nexthop group can be safely unlinked. + * + * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all() + * has been called and nhgrp unlink can be skipped. + */ + + NET_EPOCH_ENTER(et); + if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) { + ctl = nhg_priv->nh_control; + if (unlink_nhgrp(ctl, nhg_priv) == NULL) { + /* Do not try to reclaim */ + DPRINTF("Failed to unlink nexhop group %p", nhg_priv); + NET_EPOCH_EXIT(et); + return; + } + } + NET_EPOCH_EXIT(et); + + epoch_call(net_epoch_preempt, destroy_nhgrp_epoch, + &nhg_priv->nhg_epoch_ctx); +} + +/* + * Destroys all local resources belonging to @nhg_priv. + * Does not touch the nexthop references held by the group. + */ +__noinline static void +destroy_nhgrp_int(struct nhgrp_priv *nhg_priv) +{ + + free(nhg_priv->nhg, M_NHOP); +} + +/* + * Final destructor for an unreferenced and unlinked group: releases the + * nexthop references held by the group and frees its memory. + */ +__noinline static void +destroy_nhgrp(struct nhgrp_priv *nhg_priv) +{ + + KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0")); + + DPRINTF("DEL MPATH %p", nhg_priv); + + KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0")); + + free_nhgrp_nhops(nhg_priv); + + destroy_nhgrp_int(nhg_priv); +} + +/* + * Epoch callback indicating group is safe to destroy + */ +static void +destroy_nhgrp_epoch(epoch_context_t ctx) +{ + struct nhgrp_priv *nhg_priv; + + nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx); + + destroy_nhgrp(nhg_priv); +} + +/* + * Takes a reference on every nexthop listed in @nhg_priv. + * Returns true on success. 
If any nexthop cannot be referenced, the + * references taken so far are released and false is returned. + */ +static bool +ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv) +{ + + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) { + if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0) + continue; + + /* + * Failed to ref the nexthop, b/c it's deleted. + * Need to rollback references back. 
+ */ + for (int j = 0; j < i; j++) + nhop_free(nhg_priv->nhg_nh_weights[j].nh); + return (false); + } + + return (true); +} + +/* + * Releases one reference on every nexthop listed in @nhg_priv. + */ +static void +free_nhgrp_nhops(struct nhgrp_priv *nhg_priv) +{ + + for (int i = 0; i < nhg_priv->nhg_nh_count; i++) + nhop_free(nhg_priv->nhg_nh_weights[i].nh); +} + +/* + * Creates or looks up an existing nexthop group based on @wn and @num_nhops. + * @wn is sorted in place. + * + * Returns referenced nhop group or NULL, passing error code in @perror. + */ +struct nhgrp_priv * +get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops, + int *perror) +{ + struct nhgrp_priv *key, *nhg_priv; + + if (num_nhops > RIB_MAX_MPATH_WIDTH) { + *perror = E2BIG; + return (NULL); + } + + if (ctl->gr_head.hash_size == 0) { + /* First multipath request. Bootstrap mpath datastructures. */ + if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) { + *perror = ENOMEM; + return (NULL); + } + } + + /* + * Sort nexthops & check there are no duplicates. + * NOTE(review): only neighbouring entries are compared and the sort + * key is (weight, pointer), so the same nexthop listed with two + * different weights may not end up adjacent - confirm. + */ + sort_weightened_nhops(wn, num_nhops); + uint32_t last_id = 0; + for (int i = 0; i < num_nhops; i++) { + if (wn[i].nh->nh_priv->nh_idx == last_id) { + *perror = EEXIST; + return (NULL); + } + last_id = wn[i].nh->nh_priv->nh_idx; + } + + if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) { + *perror = ENOMEM; + return (NULL); + } + + nhg_priv = find_nhgrp(ctl, key); + if (nhg_priv != NULL) { + /* + * Free originally-created group. As it hasn't been linked + * and the dependent nexthops haven't been referenced, just free + * the group. 
+ */ + destroy_nhgrp_int(key); + *perror = 0; + return (nhg_priv); + } else { + /* No existing group, try to link the new one */ + if (!ref_nhgrp_nhops(key)) { + /* + * Some of the nexthops have been scheduled for deletion. + * As the group hasn't been linked / no nexthops have been + * referenced, call the final destructor immediately. + */ + destroy_nhgrp_int(key); + *perror = EAGAIN; + return (NULL); + } + if (link_nhgrp(ctl, key) == 0) { + /* Unable to allocate index? 
*/ + *perror = EAGAIN; + destroy_nhgrp(key); + } + *perror = 0; + return (key); + } + + /* NOTREACHED */ +} + +/* + * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig. + * + * Returns referenced nexthop group or NULL. In the latter case, @perror is + * filled with an error code. + * Note that function does NOT care if the next nexthops already exists + * in the @gr_orig. As a result, they will be added, resulting in the + * same nexthop being present multiple times in the new group. + */ +static struct nhgrp_priv * +append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig, + struct weightened_nhop *wn, int num_nhops, int *perror) +{ + char storage[64]; + struct weightened_nhop *pnhops; + struct nhgrp_priv *nhg_priv; + const struct nhgrp_priv *src_priv; + size_t sz; + int curr_nhops; + + src_priv = NHGRP_PRIV_CONST(gr_orig); + curr_nhops = src_priv->nhg_nh_count; + + *perror = 0; + + sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop)); + /* optimize for <= 4 paths, each path=16 bytes */ + if (sz <= sizeof(storage)) + pnhops = (struct weightened_nhop *)&storage[0]; + else { + pnhops = malloc(sz, M_TEMP, M_NOWAIT); + if (pnhops == NULL) { + *perror = ENOMEM; + return (NULL); + } + } + + /* Copy nhops from original group first */ + memcpy(pnhops, src_priv->nhg_nh_weights, + curr_nhops * sizeof(struct weightened_nhop)); + memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop)); + curr_nhops += num_nhops; + + nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror); + + if (pnhops != (struct weightened_nhop *)&storage[0]) + free(pnhops, M_TEMP); + + if (nhg_priv == NULL) + return (NULL); + + return (nhg_priv); +} + + +/* + * Creates/finds nexthop group based on @wn and @num_nhops. + * Returns 0 on success with referenced group in @rnd, or + * errno. + * + * If the error is EAGAIN, then the operation can be retried. 
+ */
+int
+nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
+    struct route_nhop_data *rnd)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhgrp_priv *nhg_priv;
+	int error;
+
+	nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error);
+	/* On failure only the weight is written, the group pointer is left as is */
+	if (nhg_priv != NULL)
+		rnd->rnd_nhgrp = nhg_priv->nhg;
+	rnd->rnd_weight = 0;
+
+	return (error);
+}
+
+/*
+ * Builds the result of filtering the nexthops of the @src group: every
+ * nexthop for which @flt_func(nh, @flt_data) returns non-zero is removed.
+ *
+ * The outcome is stored in @rnd:
+ *  - no nexthops left: NULL group pointer, weight 0;
+ *  - single nexthop left: that nexthop and its weight (a reference is taken);
+ *  - otherwise: the group returned by get_nhgrp(), weight 0.
+ *
+ * Returns 0 on success or errno: ENOMEM if the temporary array cannot be
+ * allocated, EAGAIN if the single remaining nexthop cannot be referenced,
+ * or the error reported by get_nhgrp().
+ */
+int
+nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
+    nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
+{
+	char storage[64];
+	struct nh_control *ctl = rh->nh_control;
+	struct weightened_nhop *pnhops;
+	const struct nhgrp_priv *mp_priv, *src_priv;
+	size_t sz;
+	int error, i, num_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(src);
+
+	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
+	/* optimize for <= 4 paths, each path=16 bytes */
+	if (sz <= sizeof(storage))
+		pnhops = (struct weightened_nhop *)&storage[0];
+	else {
+		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
+			return (ENOMEM);
+	}
+
+	/* Filter nexthops */
+	error = 0;
+	num_nhops = 0;
+	for (i = 0; i < src_priv->nhg_nh_count; i++) {
+		if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
+			continue;
+		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
+		    sizeof(struct weightened_nhop));
+	}
+
+	if (num_nhops == 0) {
+		rnd->rnd_nhgrp = NULL;
+		rnd->rnd_weight = 0;
+	} else if (num_nhops == 1) {
+		/* Single path left: return plain nexthop instead of a group */
+		rnd->rnd_nhop = pnhops[0].nh;
+		rnd->rnd_weight = pnhops[0].weight;
+		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
+			error = EAGAIN;
+	} else {
+		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
+		if (mp_priv != NULL)
+			rnd->rnd_nhgrp = mp_priv->nhg;
+		rnd->rnd_weight = 0;
+	}
+
+	if (pnhops != (struct weightened_nhop *)&storage[0])
+		free(pnhops, M_TEMP);
+
+	return (error);
+}
+
+/*
+ * Creates new multipath group based on existing group/nhop in @rnd_orig and
+ * to-be-added nhop @rnd_add.
+ * Returns 0 on success and stores result in @rnd_new.
+ */
+int
+nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
+    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhgrp_priv *nhg_priv;
+	struct weightened_nhop wn[2];
+	int error;
+
+	if (rnd_orig->rnd_nhop == NULL) {
+		/* No paths to add to, just reference current nhop */
+		*rnd_new = *rnd_add;
+		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
+			return (EAGAIN);
+		return (0);
+	}
+
+	wn[0].nh = rnd_add->rnd_nhop;
+	wn[0].weight = rnd_add->rnd_weight;
+
+	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
+		/* Simple merge of 2 non-multipath nexthops */
+		wn[1].nh = rnd_orig->rnd_nhop;
+		wn[1].weight = rnd_orig->rnd_weight;
+		nhg_priv = get_nhgrp(ctl, wn, 2, &error);
+	} else {
+		/* Get new nhop group with the @rnd_add nexthop as an additional nhop */
+		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
+		    &error);
+	}
+
+	if (nhg_priv == NULL)
+		return (error);
+	rnd_new->rnd_nhgrp = nhg_priv->nhg;
+	rnd_new->rnd_weight = 0;
+
+	return (0);
+}
+
+/*
+ * Returns pointer to array of nexthops with weights for
+ * given @nhg. Stores number of items in the array into @pnum_nhops.
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
+{
+	struct nhgrp_priv *nhg_priv;
+
+	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+	nhg_priv = NHGRP_PRIV(nhg);
+	*pnum_nhops = nhg_priv->nhg_nh_count;
+
+	/* This is the group's own array, not a copy */
+	return (nhg_priv->nhg_nh_weights);
+}
+
+/*
+ * Serializes the nexthop group @nhg_priv into @buffer and writes it to
+ * the sysctl request @w.
+ *
+ * The record consists of struct rt_msghdr, struct nhgrp_external,
+ * a container with the control plane nexthops (index + weight) and
+ * a container with the dataplane nexthops (index only, weight is 0).
+ *
+ * @buffer_size is only checked by KASSERT.
+ * Returns the result of SYSCTL_OUT().
+ */
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
+    char *buffer, size_t buffer_size, struct sysctl_req *w)
+{
+	struct rt_msghdr *rtm;
+	struct nhgrp_external *nhge;
+	struct nhgrp_container *nhgc;
+	const struct nhgrp_object *nhg;
+	struct nhgrp_nhop_external *ext;
+	int error;
+	size_t sz;
+
+	nhg = nhg_priv->nhg;
+
+	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+	/* controlplane nexthops */
+	sz += sizeof(struct nhgrp_container);
+	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+	/* dataplane nexthops */
+	sz += sizeof(struct nhgrp_container);
+	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+
+	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
+
+	bzero(buffer, sz);
+
+	rtm = (struct rt_msghdr *)buffer;
+	rtm->rtm_msglen = sz;
+	rtm->rtm_version = RTM_VERSION;
+	rtm->rtm_type = RTM_GET;
+
+	nhge = (struct nhgrp_external *)(rtm + 1);
+
+	nhge->nhg_idx = nhg_priv->nhg_idx;
+	nhge->nhg_refcount = nhg_priv->nhg_refcount;
+
+	/* fill in control plane nexthops first */
+	nhgc = (struct nhgrp_container *)(nhge + 1);
+	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
+	nhgc->nhgc_subtype = 0;
+	nhgc->nhgc_len = sizeof(struct nhgrp_container);
+	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
+	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
+
+	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
+		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
+		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
+	}
+
+	/* fill in dataplane nexthops, placed right after the control plane ones */
+	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
+	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
+	nhgc->nhgc_subtype = 0;
+	nhgc->nhgc_len = sizeof(struct nhgrp_container);
+	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
+	nhgc->nhgc_count = nhg->nhg_size;
+
+	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
+	for (int i = 0; i < nhg->nhg_size; i++) {
+		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
+		ext[i].nh_weight = 0;
+	}
+
+	error = SYSCTL_OUT(w, buffer, sz);
+
+	return (error);
+}
+
+/*
+ * Dumps all nexthop groups of @rh to the sysctl request @w, one record
+ * per group (see dump_nhgrp_entry()).
+ *
+ * Returns 0 if there are no groups or all of them have been written,
+ * otherwise the error of the first failed write.
+ *
+ * NOTE(review): the temporary buffer is sized for RIB_MAX_MPATH_WIDTH
+ * entries in each of the two containers; this assumes nhg_size never
+ * exceeds RIB_MAX_MPATH_WIDTH - confirm.
+ */
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct epoch_tracker et;
+	struct nhgrp_priv *nhg_priv;
+	char *buffer;
+	size_t sz;
+	int error = 0;
+
+	if (ctl->gr_head.items_count == 0)
+		return (0);
+
+	/* Calculate the maximum nhop group size in bytes */
+	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
+	sz += 2 * sizeof(struct nhgrp_container);
+	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
+	buffer = malloc(sz, M_TEMP, M_WAITOK);
+
+	NET_EPOCH_ENTER(et);
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
+		if (error != 0)
+			break;
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+	NET_EPOCH_EXIT(et);
+
+	free(buffer, M_TEMP);
+
+	return (error);
+}
diff --git a/sys/net/route/nhgrp_var.h b/sys/net/route/nhgrp_var.h
new file mode 100644
index 000000000000..ba90a3feedc8
--- /dev/null
+++ b/sys/net/route/nhgrp_var.h
@@ -0,0 +1,72 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file contains private definitions for the nexthop groups. + * + * Header is not intended to be included by the code external to the + * routing subsystem. 
+ */ + +#ifndef _NET_ROUTE_NHGRP_VAR_H_ +#define _NET_ROUTE_NHGRP_VAR_H_ + +/* nhgrp hash definition */ +/* produce hash value for an object */ +#define mpath_hash_obj(_obj) (hash_nhgrp(_obj)) +/* compare two objects */ +#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two)) +/* next object accessor */ +#define mpath_next(_obj) (_obj)->nhg_priv_next + +struct nhgrp_priv { + uint32_t nhg_idx; + uint8_t nhg_nh_count; /* number of items in nh_weights */ + uint8_t nhg_spare[3]; + u_int nhg_refcount; /* use refcount */ + u_int nhg_linked; /* refcount(9), == 2 if linked to the list */ + struct nh_control *nh_control; /* parent control structure */ + struct nhgrp_priv *nhg_priv_next; + struct nhgrp_object *nhg; + struct epoch_context nhg_epoch_ctx; /* epoch data for nhop */ + struct weightened_nhop nhg_nh_weights[0]; +}; + +#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->nhg_size]) +#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src)) +#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src)) + +/* nhgrp.c */ +bool nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags); +struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key); +int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv); +struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key); + +#endif + diff --git a/sys/net/route/nhop.c b/sys/net/route/nhop.c index 4b9a79ffbf20..0db47db9916e 100644 --- a/sys/net/route/nhop.c +++ b/sys/net/route/nhop.c @@ -64,7 +64,7 @@ __FBSDID("$FreeBSD$"); * is backed by the bitmask array. 
*/ -static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); +MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data"); /* Hash management functions */ @@ -112,6 +112,9 @@ destroy_ctl(struct nh_control *ctl) NHOPS_LOCK_DESTROY(ctl); free(ctl->nh_head.ptr, M_NHOP); free(ctl->nh_idx_head.idx, M_NHOP); +#ifdef ROUTE_MPATH + nhgrp_ctl_free(ctl); +#endif free(ctl, M_NHOP); } @@ -154,6 +157,9 @@ nhops_destroy_rib(struct rib_head *rh) DPRINTF("Marking nhop %u unlinked", nh_priv->nh_idx); refcount_release(&nh_priv->nh_linked); } CHT_SLIST_FOREACH_END; +#ifdef ROUTE_MPATH + nhgrp_ctl_unlink_all(ctl); +#endif NHOPS_WUNLOCK(ctl); /* diff --git a/sys/net/route/nhop.h b/sys/net/route/nhop.h index 1f6aff134c2d..3944d8946b07 100644 --- a/sys/net/route/nhop.h +++ b/sys/net/route/nhop.h @@ -155,7 +155,7 @@ struct nhop_object { */ #define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp) -#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) +#define NH_IS_NHGRP(_nh) ((_nh)->nh_flags & NHF_MULTIPATH) #define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) #define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa) @@ -166,6 +166,11 @@ struct nhop_object { _nh = NULL; \ } while (0) +struct weightened_nhop { + struct nhop_object *nh; + uint32_t weight; +}; + void nhop_free(struct nhop_object *nh); struct sysctl_req; @@ -209,16 +214,34 @@ struct nhop_addrs { uint16_t src_sa_off; /* offset of src address SA */ }; -struct mpath_nhop_external { +#define NHG_C_TYPE_CNHOPS 0x1 /* Control plane nhops list */ +#define NHG_C_TYPE_DNHOPS 0x2 /* Dataplane nhops list */ +struct nhgrp_container { + uint32_t nhgc_len; /* container length */ + uint16_t nhgc_count; /* number of items */ + uint8_t nhgc_type; /* container type */ + uint8_t nhgc_subtype; /* container subtype */ +}; + +struct nhgrp_nhop_external { uint32_t nh_idx; uint32_t nh_weight; }; -struct mpath_external { - uint32_t mp_idx; - uint32_t mp_refcount; - uint32_t mp_nh_count; - uint32_t mp_group_size; +/* + 
* Layout: + * - nhgrp_external + * - nhgrp_container (control plane nhops list) + * - nhgrp_nhop_external + * - nhgrp_nhop_external + * .. + * - nhgrp_container (dataplane nhops list) + * - nhgrp_nhop_external + * - nhgrp_nhop_external + */ +struct nhgrp_external { + uint32_t nhg_idx; /* Nexthop group index */ + uint32_t nhg_refcount; /* number of references */ }; #endif diff --git a/sys/net/route/nhop_ctl.c b/sys/net/route/nhop_ctl.c index b9ac4d63218d..150ae5c4be58 100644 --- a/sys/net/route/nhop_ctl.c +++ b/sys/net/route/nhop_ctl.c @@ -695,7 +695,14 @@ void nhop_free_any(struct nhop_object *nh) { +#ifdef ROUTE_MPATH + if (!NH_IS_NHGRP(nh)) + nhop_free(nh); + else + nhgrp_free((struct nhgrp_object *)nh); +#else nhop_free(nh); +#endif } /* Helper functions */ diff --git a/sys/net/route/nhop_var.h b/sys/net/route/nhop_var.h index 220b6c9a7634..6e1aba670e3c 100644 --- a/sys/net/route/nhop_var.h +++ b/sys/net/route/nhop_var.h @@ -37,6 +37,8 @@ #ifndef _NET_ROUTE_NHOP_VAR_H_ #define _NET_ROUTE_NHOP_VAR_H_ +MALLOC_DECLARE(M_NHOP); + /* define nhop hash table */ struct nhop_priv; CHT_SLIST_DEFINE(nhops, struct nhop_priv); @@ -47,9 +49,15 @@ CHT_SLIST_DEFINE(nhops, struct nhop_priv); /* next object accessor */ #define nhops_next(_obj) (_obj)->nh_next +/* define multipath hash table */ +struct nhgrp_priv; +CHT_SLIST_DEFINE(nhgroups, struct nhgrp_priv); + struct nh_control { struct nhops_head nh_head; /* hash table head */ struct bitmask_head nh_idx_head; /* nhop index head */ + struct nhgroups_head gr_head; /* nhgrp hash table head */ + struct bitmask_head gr_idx_head; /* nhgrp index head */ struct rwlock ctl_lock; /* overall ctl lock */ struct rib_head *ctl_rh; /* pointer back to rnh */ struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */ @@ -80,7 +88,8 @@ struct nhop_priv { struct epoch_context nh_epoch_ctx; /* epoch data for nhop */ }; -#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED) +#define NH_IS_PINNED(_nh) ((!NH_IS_NHGRP(_nh)) && \ + 
((_nh)->nh_priv->rt_flags & RTF_PINNED)) /* nhop.c */ struct nhop_priv *find_nhop(struct nh_control *ctl, diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c index 37c23e2cb1cb..f720d08f1f52 100644 --- a/sys/net/route/route_ctl.c +++ b/sys/net/route/route_ctl.c @@ -29,7 +29,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" -#include "opt_mpath.h" +#include "opt_route.h" #include #include @@ -83,9 +83,6 @@ static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); -static int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, - struct rt_addrinfo *info, struct route_nhop_data *rnd, - struct rib_cmd_info *rc); static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); @@ -94,6 +91,20 @@ static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); static void destroy_subscription_epoch(epoch_context_t ctx); +static bool rib_can_multipath(struct rib_head *rh); + +/* Per-vnet multipath routing configuration */ +SYSCTL_DECL(_net_route); +#define V_rib_route_multipath VNET(rib_route_multipath) +#ifdef ROUTE_MPATH +#define _MP_FLAGS CTLFLAG_RW +#else +#define _MP_FLAGS CTLFLAG_RD +#endif +VNET_DEFINE(u_int, rib_route_multipath) = 0; +SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, + &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); +#undef _MP_FLAGS /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); @@ -128,7 +139,7 @@ destroy_rtentry(struct rtentry *rt) CURVNET_SET(nhop_get_vnet(rt->rt_nhop)); /* Unreference nexthop */ - nhop_free(rt->rt_nhop); + nhop_free_any(rt->rt_nhop); uma_zfree(V_rtzone, rt); @@ -175,6 +186,41 @@ get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) return (rnh); } +#ifdef ROUTE_MPATH +static bool 
+rib_can_multipath(struct rib_head *rh) +{ + int result; + + CURVNET_SET(rh->rib_vnet); + result = !!V_rib_route_multipath; + CURVNET_RESTORE(); + + return (result); +} + +/* + * Check is nhop is multipath-eligible. + * Avoid nhops without gateways and redirects. + * + * Returns 1 for multipath-eligible nexthop, + * 0 otherwise. + */ +bool +nhop_can_multipath(const struct nhop_object *nh) +{ + + if ((nh->nh_flags & NHF_MULTIPATH) != 0) + return (1); + if ((nh->nh_flags & NHF_GATEWAY) == 0) + return (0); + if ((nh->nh_flags & NHF_REDIRECT) != 0) + return (0); + + return (1); +} +#endif + static int get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) { @@ -206,7 +252,7 @@ rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) * * Returns true if matches, false otherwise. */ -static bool +bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) { @@ -461,7 +507,7 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct nhop_object *nh_orig; - struct route_nhop_data rnd; + struct route_nhop_data rnd_orig, rnd_add; struct nhop_object *nh; struct rtentry *rt, *rt_orig; int error; @@ -470,32 +516,19 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, if (error != 0) return (error); - rnd.rnd_nhop = rt->rt_nhop; - rnd.rnd_weight = rt->rt_weight; + rnd_add.rnd_nhop = rt->rt_nhop; + rnd_add.rnd_weight = rt->rt_weight; nh = rt->rt_nhop; RIB_WLOCK(rnh); -#ifdef RADIX_MPATH - struct sockaddr *netmask; - netmask = info->rti_info[RTAX_NETMASK]; - /* do not permit exactly the same dst/mask/gw pair */ - if (rt_mpath_capable(rnh) && - rt_mpath_conflict(rnh, rt, netmask)) { - RIB_WUNLOCK(rnh); - - nhop_free(nh); - uma_zfree(V_rtzone, rt); - return (EEXIST); - } -#endif - error = add_route_nhop(rnh, rt, info, &rnd, rc); + error = add_route_nhop(rnh, rt, info, &rnd_add, rc); if (error == 0) { RIB_WUNLOCK(rnh); return (0); } /* addition failed. 
Lookup prefix in the rib to determine the cause */ - rt_orig = lookup_prefix(rnh, info, &rnd); + rt_orig = lookup_prefix(rnh, info, &rnd_orig); if (rt_orig == NULL) { /* No prefix -> rnh_addaddr() failed to allocate memory */ RIB_WUNLOCK(rnh); @@ -505,11 +538,11 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, } /* We have existing route in the RIB. */ - nh_orig = rnd.rnd_nhop; + nh_orig = rnd_orig.rnd_nhop; /* Check if new route has higher preference */ if (can_override_nhop(info, nh_orig) > 0) { /* Update nexthop to the new route */ - change_route_nhop(rnh, rt_orig, info, &rnd, rc); + change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); RIB_WUNLOCK(rnh); uma_zfree(V_rtzone, rt); nhop_free(nh_orig); @@ -518,11 +551,26 @@ add_route(struct rib_head *rnh, struct rt_addrinfo *info, RIB_WUNLOCK(rnh); +#ifdef ROUTE_MPATH + if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && + nhop_can_multipath(rnd_orig.rnd_nhop)) + error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); + else +#endif /* Unable to add - another route with the same preference exists */ error = EEXIST; + /* + * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. + * ROUTE_MPATH enabled: original nhop reference is unused in any case, + * free rt only if not _adding_ new route to rib (e.g. the case + * when initial lookup returned existing route, but then it got + * deleted prior to multipath group insertion, leading to a simple + * non-multipath add as a result). 
+ */ nhop_free(nh); - uma_zfree(V_rtzone, rt); + if ((error != 0) || rc->rc_cmd != RTM_ADD) + uma_zfree(V_rtzone, rt); return (error); } @@ -588,7 +636,13 @@ rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info return (ESRCH); nh = rt->rt_nhop; - +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + error = del_route_mpath(rnh, info, rt, + (struct nhgrp_object *)nh, rc); + return (error); + } +#endif error = check_info_match_nhop(info, rt, nh); if (error != 0) return (error); @@ -600,14 +654,6 @@ rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ -#ifdef RADIX_MPATH - info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; - if (rt_mpath_capable(rnh)) { - rn = rt_mpath_unlink(rnh, info, rt, &error); - if (error != 0) - return (error); - } else -#endif rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rn == NULL) @@ -648,7 +694,18 @@ del_route(struct rib_head *rnh, struct rt_addrinfo *info, * If the caller wants it, then it can have it, * the entry will be deleted after the end of the current epoch. */ - rtfree(rc->rc_rt); + if (rc->rc_cmd == RTM_DELETE) + rtfree(rc->rc_rt); +#ifdef ROUTE_MPATH + else { + /* + * Deleting 1 path may result in RTM_CHANGE to + * a different mpath group/nhop. + * Free old mpath group. + */ + nhop_free_any(rc->rc_nh_old); + } +#endif return (0); } @@ -694,19 +751,6 @@ rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, return (ESRCH); } -#ifdef RADIX_MPATH - /* - * If we got multipath routes, - * we require users to specify a matching RTAX_GATEWAY. 
- */ - if (rt_mpath_capable(rnh)) { - rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); - if (rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } - } -#endif rnd_orig.rnd_nhop = rt->rt_nhop; rnd_orig.rnd_weight = rt->rt_weight; @@ -722,18 +766,11 @@ rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, } static int -change_route(struct rib_head *rnh, struct rt_addrinfo *info, - struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, + struct nhop_object *nh_orig, struct nhop_object **nh_new) { - int error = 0; int free_ifa = 0; - struct nhop_object *nh, *nh_orig; - struct route_nhop_data rnd_new; - - nh = NULL; - nh_orig = rnd_orig->rnd_nhop; - if (nh_orig == NULL) - return (ESRCH); + int error; /* * New gateway could require new ifaddr, ifp; @@ -759,20 +796,97 @@ change_route(struct rib_head *rnh, struct rt_addrinfo *info, } } - error = nhop_create_from_nhop(rnh, nh_orig, info, &nh); + error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); if (free_ifa) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; } + + return (error); +} + +#ifdef ROUTE_MPATH +static int +change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + int error = 0; + struct nhop_object *nh, *nh_orig, *nh_new; + struct route_nhop_data rnd_new; + + nh = NULL; + nh_orig = rnd_orig->rnd_nhop; + + struct weightened_nhop *wn = NULL, *wn_new; + uint32_t num_nhops; + + wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); + nh_orig = NULL; + for (int i = 0; i < num_nhops; i++) { + if (check_info_match_nhop(info, NULL, wn[i].nh)) { + nh_orig = wn[i].nh; + break; + } + } + + if (nh_orig == NULL) + return (ESRCH); + + error = change_nhop(rnh, info, nh_orig, &nh_new); if (error != 0) return (error); - rnd_new.rnd_nhop = nh; - if (info->rti_mflags & RTV_WEIGHT) - rnd_new.rnd_weight = info->rti_rmx->rmx_weight; - else - rnd_new.rnd_weight = 
rnd_orig->rnd_weight; + wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), + M_TEMP, M_NOWAIT | M_ZERO); + if (wn_new == NULL) { + nhop_free(nh_new); + return (EAGAIN); + } + memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); + for (int i = 0; i < num_nhops; i++) { + if (wn[i].nh == nh_orig) { + wn[i].nh = nh_new; + wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight); + break; + } + } + + error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); + nhop_free(nh_new); + free(wn_new, M_TEMP); + + if (error != 0) + return (error); + + error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); + + return (error); +} +#endif + +static int +change_route(struct rib_head *rnh, struct rt_addrinfo *info, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) +{ + int error = 0; + struct nhop_object *nh, *nh_orig; + struct route_nhop_data rnd_new; + + nh = NULL; + nh_orig = rnd_orig->rnd_nhop; + if (nh_orig == NULL) + return (ESRCH); + +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh_orig)) + return (change_mpath_route(rnh, info, rnd_orig, rc)); +#endif + + rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight); + error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); + if (error != 0) + return (error); error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); return (error); @@ -827,7 +941,7 @@ add_route_nhop(struct rib_head *rnh, struct rtentry *rt, * Conditionally set rt_expire if set in @info. * Returns 0 on success. 
*/ -static int +int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc) @@ -855,6 +969,8 @@ change_route_nhop(struct rib_head *rnh, struct rtentry *rt, rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); if (rn == NULL) return (ESRCH); + rt = RNTORT(rn); + rt->rte_flags &= ~RTF_UP; } /* Finalize notification */ @@ -989,7 +1105,6 @@ rt_checkdelroute(struct radix_node *rn, void *arg) info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); - info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa; error = rt_unlinkrte(di->rnh, info, &di->rc); @@ -1000,7 +1115,7 @@ rt_checkdelroute(struct radix_node *rn, void *arg) * XXX: Delayed notifications not implemented * for nexthop updates. */ - if (error == 0) { + if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) { /* Add to the list and return */ rt->rt_chain = di->head; di->head = rt; @@ -1024,6 +1139,7 @@ rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; + struct nhop_object *nh; struct epoch_tracker et; rnh = rt_tables_get_rnh(fibnum, family); @@ -1049,18 +1165,31 @@ rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; + nh = rt->rt_nhop; di.rc.rc_rt = rt; - di.rc.rc_nh_old = rt->rt_nhop; + di.rc.rc_nh_old = nh; rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); - if (report) - rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0, - fibnum); + if (report) { +#ifdef ROUTE_MPATH + struct nhgrp_object *nhg; + struct weightened_nhop *wn; + uint32_t num_nhops; + if (NH_IS_NHGRP(nh)) { + nhg = (struct nhgrp_object *)nh; + wn = nhgrp_get_nhops(nhg, &num_nhops); + for (int i = 0; i < num_nhops; i++) + rt_routemsg(RTM_DELETE, rt, + 
wn[i].nh->nh_ifp, 0, fibnum); + } else +#endif + rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum); + } rtfree(rt); } diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h index fb6dda47b3ba..151771146e65 100644 --- a/sys/net/route/route_ctl.h +++ b/sys/net/route/route_ctl.h @@ -53,6 +53,10 @@ int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc); +typedef void route_notification_t(struct rib_cmd_info *rc, void *); +void rib_decompose_notification(struct rib_cmd_info *rc, + route_notification_t *cb, void *cbdata); + int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); @@ -66,6 +70,20 @@ typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *); void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *); void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg); +struct route_nhop_data; +const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family, + const struct sockaddr *dst, const struct sockaddr *netmask, + struct route_nhop_data *rnd); +const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family, + const struct sockaddr *dst, struct route_nhop_data *rnd); + +/* Multipath */ +struct nhgrp_object; +struct weightened_nhop; + +struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *nhg, + uint32_t *pnum_nhops); + enum rib_subscription_type { RIB_NOTIFY_IMMEDIATE, RIB_NOTIFY_DELAYED diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c index b5b45ef662cc..dfa573d23a66 100644 --- a/sys/net/route/route_helpers.c +++ b/sys/net/route/route_helpers.c @@ -131,3 +131,167 @@ rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, return (nh); } + +#ifdef ROUTE_MPATH +static void +decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, + 
void *cbdata) +{ + uint32_t num_old, num_new; + uint32_t nh_idx_old, nh_idx_new; + struct weightened_nhop *wn_old, *wn_new; + struct weightened_nhop tmp = { NULL, 0 }; + uint32_t idx_old = 0, idx_new = 0; + + struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; + struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; + + if (NH_IS_NHGRP(rc->rc_nh_old)) { + wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old); + } else { + tmp.nh = rc->rc_nh_old; + tmp.weight = rc->rc_nh_weight; + wn_old = &tmp; + num_old = 1; + } + if (NH_IS_NHGRP(rc->rc_nh_new)) { + wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); + } else { + tmp.nh = rc->rc_nh_new; + tmp.weight = rc->rc_nh_weight; + wn_new = &tmp; + num_new = 1; + } + + /* Use the fact that each @wn array is sorted */ + /* + * Want to convert into a set of add (A) and delete (D) operations: + * [1] -> [1, 2] = A{2} + * [2] -> [1, 2] = A{1} + * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3} + * [1, 2, 4] -> [1, 4] = D{2} + * [1, 2, 4] -> [3, 4] = D{1}, D{2}, A{3} + * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4} + * A{n}/D{n}: RTM_ADD/RTM_DELETE callback for nexthop index n. + */ + idx_old = 0; + while ((idx_old < num_old) && (idx_new < num_new)) { + nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; + nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; + + if (nh_idx_old == nh_idx_new) { + if (wn_old[idx_old].weight != wn_new[idx_new].weight) { + /* Update weight by providing del/add notifications */ + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + + rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + } + idx_old++; + idx_new++; + } else if (nh_idx_old < nh_idx_new) { + /* + * [1, ~2~, 4], [1, ~3~, 4] + * [1, ~2~, 5], [1, ~3~, 4] + * [1, ~2~], [1, ~3~, 4] + */ + if ((idx_old + 1 >= num_old) || + (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) { + /* Add new unless the next old item is still <= new */
+ rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + idx_new++; + } + /* In any case, delete current old */ + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + idx_old++; + } else { + /* + * nh_idx_old > nh_idx_new + * + * [1, ~3~, 4], [1, ~2~, 4] + * [1, ~3~, 5], [1, ~2~, 4] + * [1, ~3~, 4], [1, ~2~] + */ + if ((idx_new + 1 >= num_new) || + (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) { + /* No next item or next item is > current one */ + rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + idx_new++; + } + /* In any case, delete current old */ + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + idx_old++; + } + } + + while (idx_old < num_old) { + rc_del.rc_nh_old = wn_old[idx_old].nh; + rc_del.rc_nh_weight = wn_old[idx_old].weight; + cb(&rc_del, cbdata); + idx_old++; + } + + while (idx_new < num_new) { + rc_add.rc_nh_new = wn_new[idx_new].nh; + rc_add.rc_nh_weight = wn_new[idx_new].weight; + cb(&rc_add, cbdata); + idx_new++; + } +} + +/* + * Decompose multipath cmd info @rc into a list of add/del/change + * single-path operations, calling @cb callback for each operation. + * Assumes at least one of the nexthops in @rc is multipath. 
+ */ +void +rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, + void *cbdata) +{ + struct weightened_nhop *wn; + uint32_t num_nhops; + struct rib_cmd_info rc_new; + + rc_new = *rc; + DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", + cb, rc->cmd, rc->nh_old, rc->nh_new); + switch (rc->rc_cmd) { + case RTM_ADD: + if (!NH_IS_NHGRP(rc->rc_nh_new)) + return; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + rc_new.rc_nh_new = wn[i].nh; + rc_new.rc_nh_weight = wn[i].weight; + cb(&rc_new, cbdata); + } + break; + case RTM_DELETE: + if (!NH_IS_NHGRP(rc->rc_nh_old)) + return; + wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); + for (uint32_t i = 0; i < num_nhops; i++) { + rc_new.rc_nh_old = wn[i].nh; + rc_new.rc_nh_weight = wn[i].weight; + cb(&rc_new, cbdata); + } + break; + case RTM_CHANGE: + if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) + return; + decompose_change_notification(rc, cb, cbdata); + break; + } +} +#endif diff --git a/sys/net/route/route_var.h b/sys/net/route/route_var.h index 6164ec08850c..12d081d410a2 100644 --- a/sys/net/route/route_var.h +++ b/sys/net/route/route_var.h @@ -87,6 +87,7 @@ struct rib_head { /* Constants */ #define RIB_MAX_RETRIES 3 #define RT_MAXFIBS UINT16_MAX +#define RIB_MAX_MPATH_WIDTH 64 /* Macro for verifying fields in af-specific 'struct route' structures */ #define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2) \ @@ -113,12 +114,7 @@ _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new) "ro_dst and " #_dst_new " are at different offset") struct rib_head *rt_tables_get_rnh(uint32_t table, sa_family_t family); -void rt_mpath_init_rnh(struct rib_head *rnh); int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum); -#ifdef RADIX_MPATH -struct radix_node *rt_mpath_unlink(struct rib_head *rnh, - struct rt_addrinfo *info, struct rtentry *rto, int *perror); -#endif struct 
rib_cmd_info; VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat); @@ -202,14 +198,6 @@ struct rtentry { /* rtentry rt flag mask */ #define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST) -/* Nexthop selection */ -#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh)) -#define _SELECT_NHOP(_nh, _flowid) \ - (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size] -#define _RT_SELECT_NHOP(_nh, _flowid) \ - ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid)) -#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid) - /* route_temporal.c */ void tmproutes_update(struct rib_head *rnh, struct rtentry *rt); void tmproutes_init(struct rib_head *rh); @@ -217,14 +205,24 @@ void tmproutes_destroy(struct rib_head *rh); /* route_ctl.c */ struct route_nhop_data { - struct nhop_object *rnd_nhop; - uint32_t rnd_weight; + union { + struct nhop_object *rnd_nhop; + struct nhgrp_object *rnd_nhgrp; + }; + uint32_t rnd_weight; }; + +int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, + struct rt_addrinfo *info, struct route_nhop_data *rnd, + struct rib_cmd_info *rc); int change_route_conditional(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct route_nhop_data *nhd_new, struct rib_cmd_info *rc); struct rtentry *lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, struct route_nhop_data *rnd); + +bool nhop_can_multipath(const struct nhop_object *nh); +bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw); int check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, const struct nhop_object *nh); int can_override_nhop(const struct rt_addrinfo *info, @@ -256,5 +254,57 @@ int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_ori void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu); int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); +/* MULTIPATH */ +#define MPF_MULTIPATH 0x08 /* need to be 
consistent with NHF_MULTIPATH */ + +struct nhgrp_object { + uint16_t nhg_flags; /* nexthop group flags */ + uint8_t nhg_size; /* dataplane group size */ + uint8_t spare; + struct nhop_object *nhops[0]; /* nhops */ +}; + +static inline struct nhop_object * +nhop_select(struct nhop_object *nh, uint32_t flowid) +{ + +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct nhgrp_object *nhg = (struct nhgrp_object *)nh; + nh = nhg->nhops[flowid % nhg->nhg_size]; + } +#endif + return (nh); +} + + +struct weightened_nhop; + +/* mpath_ctl.c */ +int add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rt, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc); +int del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info, + struct rtentry *rt, struct nhgrp_object *nhg, struct rib_cmd_info *rc); + +/* nhgrp.c */ +int nhgrp_ctl_init(struct nh_control *ctl); +void nhgrp_ctl_free(struct nh_control *ctl); +void nhgrp_ctl_unlink_all(struct nh_control *ctl); + + +/* nhgrp_ctl.c */ +int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w); + +int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, + int num_nhops, struct route_nhop_data *rnd); +typedef bool nhgrp_filter_cb_t(const struct nhop_object *nh, void *data); +int nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src, + nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd); +int nhgrp_get_addition_group(struct rib_head *rnh, + struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_add, + struct route_nhop_data *rnd_new); + +void nhgrp_free(struct nhgrp_object *nhg); #endif diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index f3b0ecec2430..c2e2273d0d31 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -32,7 +32,7 @@ * $FreeBSD$ */ #include "opt_ddb.h" -#include "opt_mpath.h" +#include "opt_route.h" #include "opt_inet.h" #include "opt_inet6.h" @@ -158,8 +158,7 @@
MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) -static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - ""); +SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); struct walkarg { int w_tmemsize; @@ -650,6 +649,25 @@ fill_addrinfo(struct rt_msghdr *rtm, int len, u_int fibnum, struct rt_addrinfo * return (0); } +static struct nhop_object * +select_nhop(struct nhop_object *nh, const struct sockaddr *gw) +{ + if (!NH_IS_NHGRP(nh)) + return (nh); +#ifdef ROUTE_MPATH + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + if (gw == NULL) + return (wn[0].nh); + for (int i = 0; i < num_nhops; i++) { + if (match_nhop_gw(wn[i].nh, gw)) + return (wn[i].nh); + } +#endif + return (NULL); +} + /* * Handles RTM_GET message from routing socket, returning matching rt. * @@ -663,6 +681,7 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, { RIB_RLOCK_TRACKER; struct rib_head *rnh; + struct nhop_object *nh; sa_family_t saf; saf = info->rti_info[RTAX_DST]->sa_family; @@ -690,21 +709,12 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, RIB_RUNLOCK(rnh); return (ESRCH); } -#ifdef RADIX_MPATH - /* - * for RTM_GET, gate is optional even with multipath. - * if gate == NULL the first match is returned. 
- * (no need to call rt_mpath_matchgate if gate == NULL) - */ - if (rt_mpath_capable(rnh) && info->rti_info[RTAX_GATEWAY]) { - rc->rc_rt = rt_mpath_matchgate(rc->rc_rt, - info->rti_info[RTAX_GATEWAY]); - if (rc->rc_rt == NULL) { - RIB_RUNLOCK(rnh); - return (ESRCH); - } + + nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]); + if (nh == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); } -#endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform @@ -740,8 +750,13 @@ handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, RIB_RUNLOCK(rnh); return (ESRCH); } + nh = select_nhop(rc->rc_rt->rt_nhop, info->rti_info[RTAX_GATEWAY]); + if (nh == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); + } } - rc->rc_nh_new = rc->rc_rt->rt_nhop; + rc->rc_nh_new = nh; rc->rc_nh_weight = rc->rc_rt->rt_weight; RIB_RUNLOCK(rnh); @@ -832,6 +847,24 @@ update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm, return (0); } +static void +save_del_notification(struct rib_cmd_info *rc, void *_cbdata) +{ + struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; + + if (rc->rc_cmd == RTM_DELETE) + *rc_new = *rc; +} + +static void +save_add_notification(struct rib_cmd_info *rc, void *_cbdata) +{ + struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; + + if (rc->rc_cmd == RTM_ADD) + *rc_new = *rc; +} + /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) @@ -918,6 +951,15 @@ route_output(struct mbuf *m, struct socket *so, ...) if (error == 0) { #ifdef INET6 rti_need_deembed = 1; +#endif +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rc.rc_nh_new) || + (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { + struct rib_cmd_info rc_simple = {}; + rib_decompose_notification(&rc, + save_add_notification, (void *)&rc_simple); + rc = rc_simple; + } #endif nh = rc.rc_nh_new; rtm->rtm_index = nh->nh_ifp->if_index; @@ -927,6 +969,15 @@ route_output(struct mbuf *m, struct socket *so, ...) 
case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(rc.rc_nh_old) || + (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { + struct rib_cmd_info rc_simple = {}; + rib_decompose_notification(&rc, + save_del_notification, (void *)&rc_simple); + rc = rc_simple; + } +#endif nh = rc.rc_nh_old; goto report; } @@ -1708,7 +1759,19 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) if (!can_export_rte(w->w_req->td->td_ucred, rt)) return (0); nh = rt->rt_nhop; - error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w); + if (error != 0) + return (error); + } + } else +#endif + error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); return (0); } @@ -1748,6 +1811,7 @@ sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight, rtm->rtm_flags = rt->rte_flags; rtm->rtm_flags |= nhop_get_rtflags(nh); rt_getmetrics(rt, nh, &rtm->rtm_rmx); + rtm->rtm_rmx.rmx_weight = weight; rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); @@ -2028,7 +2092,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) namelen--; if (req->newptr) return (EPERM); - if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) { + if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) @@ -2096,6 +2160,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) } break; case NET_RT_NHOP: + case NET_RT_NHGRP: /* Allow dumping one specific af/fib at a time */ if (namelen < 4) { error = EINVAL; @@ -2113,6 +2178,12 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) } if (w.w_op == NET_RT_NHOP) error = nhops_dump_sysctl(rnh, w.w_req); + else +#ifdef ROUTE_MPATH + error 
= nhgrp_dump_sysctl(rnh, w.w_req); +#else + error = ENOTSUP; +#endif break; case NET_RT_IFLIST: case NET_RT_IFLISTL: diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 5553530628bf..0bc02b5f20d3 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -35,8 +35,6 @@ #include __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include #include #include @@ -699,14 +697,6 @@ in_addprefix(struct in_ifaddr *target, int flags) * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { -#ifdef RADIX_MPATH - if (ia->ia_addr.sin_addr.s_addr == - target->ia_addr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(&in_ifa_tracker); - return (EEXIST); - } else - break; -#endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); diff --git a/sys/netinet/in_fib.c b/sys/netinet/in_fib.c index c46c55bd7d00..4c84de2c7281 100644 --- a/sys/netinet/in_fib.c +++ b/sys/netinet/in_fib.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_route.h" -#include "opt_mpath.h" #include #include @@ -48,14 +47,11 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include -#ifdef RADIX_MPATH -#include -#endif - #include #include #include @@ -80,7 +76,6 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum")); @@ -99,12 +94,7 @@ fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, flowid); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) @@ -120,7 +110,7 @@ fib4_lookup(uint32_t fibnum, struct 
in_addr dst, uint32_t scopeid, } inline static int -check_urpf(const struct nhop_object *nh, uint32_t flags, +check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { @@ -137,21 +127,24 @@ check_urpf(const struct nhop_object *nh, uint32_t flags, return (0); } -#ifdef RADIX_MPATH -inline static int -check_urpf_mpath(struct rtentry *rt, uint32_t flags, +static int +check_urpf(struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { - - while (rt != NULL) { - if (check_urpf(rt->rt_nhop, flags, src_if) != 0) - return (1); - rt = rt_mpath_next(rt); - } - - return (0); -} +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0) + return (1); + } + return (0); + } else #endif + return (check_urpf_nhop(nh, flags, src_if)); +} /* * Performs reverse path forwarding lookup. 
@@ -169,7 +162,6 @@ fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; int ret; KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum")); @@ -186,12 +178,7 @@ fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - ret = check_urpf_mpath(rt, flags, src_if); -#else - ret = check_urpf(rt->rt_nhop, flags, src_if); -#endif + ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); RIB_RUNLOCK(rh); return (ret); } @@ -206,7 +193,6 @@ fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, { struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum")); @@ -225,12 +211,7 @@ fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, /* unlocked lookup */ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, 0); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, 0); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index ef40fdc6af6c..6dfa1e56eff1 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -30,8 +30,6 @@ #include __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include #include #include @@ -127,9 +125,6 @@ in_inithead(uint32_t fibnum) return (NULL); rh->rnh_preadd = rib4_preadd; -#ifdef RADIX_MPATH - rt_mpath_init_rnh(rh); -#endif return (rh); } diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index a26722c97f88..a7e72f4ec407 100644 --- a/sys/netinet/ip_output.c +++ 
b/sys/netinet/ip_output.c @@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_mbuf_stress_test.h" -#include "opt_mpath.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" @@ -470,11 +469,7 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, * for correct operation (as it is for ARP). */ uint32_t flowid; -#ifdef RADIX_MPATH - flowid = ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr); -#else flowid = m->m_pkthdr.flowid; -#endif ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_REF, flowid); diff --git a/sys/netinet6/in6_fib.c b/sys/netinet6/in6_fib.c index a0e4dacc86e1..9fd869b2730b 100644 --- a/sys/netinet6/in6_fib.c +++ b/sys/netinet6/in6_fib.c @@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" -#include "opt_mpath.h" #include #include @@ -49,14 +48,11 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include -#ifdef RADIX_MPATH -#include -#endif - #include #include #include @@ -88,7 +84,6 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; @@ -111,12 +106,7 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - if (rt_mpath_next(rt) != NULL) - rt = rt_mpath_selectrte(rt, flowid); -#endif - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) @@ -132,7 +122,7 @@ fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, } inline static int -check_urpf(const struct nhop_object *nh, uint32_t flags, +check_urpf_nhop(const struct nhop_object *nh, uint32_t flags, const struct ifnet 
*src_if) { @@ -149,21 +139,24 @@ check_urpf(const struct nhop_object *nh, uint32_t flags, return (0); } -#ifdef RADIX_MPATH -inline static int -check_urpf_mpath(struct rtentry *rt, uint32_t flags, +static int +check_urpf(struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { - - while (rt != NULL) { - if (check_urpf(rt->rt_nhop, flags, src_if) != 0) - return (1); - rt = rt_mpath_next(rt); - } - - return (0); -} +#ifdef ROUTE_MPATH + if (NH_IS_NHGRP(nh)) { + struct weightened_nhop *wn; + uint32_t num_nhops; + wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); + for (int i = 0; i < num_nhops; i++) { + if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0) + return (1); + } + return (0); + } else #endif + return (check_urpf_nhop(nh, flags, src_if)); +} /* * Performs reverse path forwarding lookup. @@ -181,7 +174,6 @@ fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct sockaddr_in6 sin6; int ret; @@ -203,12 +195,7 @@ fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); -#ifdef RADIX_MPATH - ret = check_urpf_mpath(rt, flags, src_if); -#else - ret = check_urpf(rt->rt_nhop, flags, src_if); -#endif + ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if); RIB_RUNLOCK(rh); return (ret); } @@ -223,7 +210,6 @@ fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, { struct rib_head *rh; struct radix_node *rn; - struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; @@ -245,8 +231,7 @@ fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rt = RNTORT(rn); - nh = rt->rt_nhop; + nh = nhop_select((RNTORT(rn))->rt_nhop, 0); /* Ensure route & ifp is UP */ if 
(RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c index 5f2e2fe3ae6e..54136f9983b2 100644 --- a/sys/netinet6/in6_rmx.c +++ b/sys/netinet6/in6_rmx.c @@ -64,8 +64,6 @@ #include __FBSDID("$FreeBSD$"); -#include "opt_mpath.h" - #include #include #include @@ -153,9 +151,6 @@ in6_inithead(uint32_t fibnum) return (NULL); rh->rnh_preadd = rib6_preadd; -#ifdef RADIX_MPATH - rt_mpath_init_rnh(rh); -#endif rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL, RIB_NOTIFY_IMMEDIATE, true); diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index ad31d750eb70..1597a4cb6b93 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_route.h" #include #include @@ -1591,7 +1592,11 @@ void nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg) { +#ifdef ROUTE_MPATH + rib_decompose_notification(rc, check_release_defrouter, NULL); +#else check_release_defrouter(rc, NULL); +#endif } int diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 038c4d3ef8b9..311d65671051 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -417,6 +417,7 @@ struct sockproto { #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. 
*/ #define NET_RT_NHOP 6 /* dump routing nexthops */ +#define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* diff --git a/usr.bin/netstat/Makefile b/usr.bin/netstat/Makefile index b61afdc410b0..febab940be4a 100644 --- a/usr.bin/netstat/Makefile +++ b/usr.bin/netstat/Makefile @@ -5,7 +5,7 @@ PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \ - unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \ + unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c nhgrp.c \ nl_defs.h nl_symbols.c: nlist_symbols diff --git a/usr.bin/netstat/common.h b/usr.bin/netstat/common.h index aafa45df8936..da058c97d910 100644 --- a/usr.bin/netstat/common.h +++ b/usr.bin/netstat/common.h @@ -54,5 +54,22 @@ struct ifmap_entry { struct ifmap_entry *prepare_ifmap(size_t *ifmap_size); +struct rt_msghdr; +struct nhops_map { + uint32_t idx; + struct rt_msghdr *rtm; +}; + +struct nhops_dump { + void *nh_buf; + struct nhops_map *nh_map; + size_t nh_count; +}; + +void dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd); +struct nhop_map; +void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname); + + #endif diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c index 5ba9dcdcd5ee..68fa97ea9f3b 100644 --- a/usr.bin/netstat/main.c +++ b/usr.bin/netstat/main.c @@ -215,6 +215,7 @@ int mflag; /* show memory stats */ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ +int Oflag; /* show nhgrp objects*/ int oflag; /* show nexthop objects*/ int Pflag; /* show TCP log ID */ static int pflag; /* show given protocol */ @@ -250,7 +251,7 @@ main(int argc, char *argv[]) if (argc < 0) exit(EXIT_FAILURE); - while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz")) + while ((ch = getopt(argc, argv, "46AaBbCdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz")) != -1) switch(ch) { case 
'4': @@ -353,6 +354,9 @@ main(int argc, char *argv[]) case 'o': oflag = 1; break; + case 'O': + Oflag = 1; + break; case 'P': Pflag = 1; break; @@ -509,6 +513,14 @@ main(int argc, char *argv[]) xo_finish(); exit(0); } + if (Oflag) { + xo_open_container("statistics"); + nhgrp_print(fib, af); + xo_close_container("statistics"); + xo_finish(); + exit(0); + } + if (gflag) { diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h index 9f9e716a31b3..0c2cc6eee553 100644 --- a/usr.bin/netstat/netstat.h +++ b/usr.bin/netstat/netstat.h @@ -163,3 +163,4 @@ void mroutepr(void); void mrt_stats(void); void bpf_stats(char *); void nhops_print(int fibnum, int af); +void nhgrp_print(int fibnum, int af); diff --git a/usr.bin/netstat/nhgrp.c b/usr.bin/netstat/nhgrp.c new file mode 100644 index 000000000000..557580e6cd2e --- /dev/null +++ b/usr.bin/netstat/nhgrp.c @@ -0,0 +1,355 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netstat.h" +#include "common.h" + +#define WID_GW_DEFAULT(af) (((af) == AF_INET6) ? 40 : 18) + +static int wid_gw; +static int wid_if = 10; +static int wid_nhidx = 8; +static int wid_refcnt = 8; + +struct nhop_entry { + char gw[64]; + char ifname[IFNAMSIZ]; +}; + +struct nhop_map { + struct nhop_entry *ptr; + size_t size; +}; +static struct nhop_map global_nhop_map; + +static struct ifmap_entry *ifmap; +static size_t ifmap_size; + +static struct nhop_entry * +nhop_get(struct nhop_map *map, uint32_t idx) +{ + + if (idx >= map->size) + return (NULL); + if (*map->ptr[idx].ifname == '\0') + return (NULL); + return &map->ptr[idx]; +} + +static void +print_nhgroup_header(int af1 __unused) +{ + + xo_emit("{T:/%-*.*s}{T:/%-*.*s}{T:/%*.*s}{T:/%*.*s}{T:/%*.*s}" + "{T:/%*.*s}{T:/%*s}\n", + wid_nhidx, wid_nhidx, "GrpIdx", + wid_nhidx, wid_nhidx, "NhIdx", + wid_nhidx, wid_nhidx, "Weight", + wid_nhidx, wid_nhidx, "Slots", + wid_gw, wid_gw, "Gateway", + wid_if, wid_if, "Netif", + wid_refcnt, "Refcnt"); +} + +static void +print_padding(char sym, int len) +{ + char buffer[56]; + + memset(buffer, sym, sizeof(buffer)); + buffer[0] = '{'; + buffer[1] = 'P'; + buffer[2] = ':'; + buffer[3] = ' '; + buffer[len + 3] = '}'; 
+ buffer[len + 4] = '\0'; + xo_emit(buffer); +} + + +static void +print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm, + struct nhgrp_external *nhge) +{ + char buffer[128]; + struct nhop_entry *ne; + struct nhgrp_nhop_external *ext_cp, *ext_dp; + struct nhgrp_container *nhg_cp, *nhg_dp; + + nhg_cp = (struct nhgrp_container *)(nhge + 1); + if (nhg_cp->nhgc_type != NHG_C_TYPE_CNHOPS || nhg_cp->nhgc_subtype != 0) + return; + ext_cp = (struct nhgrp_nhop_external *)(nhg_cp + 1); + + nhg_dp = (struct nhgrp_container *)((char *)nhg_cp + nhg_cp->nhgc_len); + if (nhg_dp->nhgc_type != NHG_C_TYPE_DNHOPS || nhg_dp->nhgc_subtype != 0) + return; + ext_dp = (struct nhgrp_nhop_external *)(nhg_dp + 1); + + xo_open_instance(name); + + snprintf(buffer, sizeof(buffer), "{[:-%d}{:nhgrp-index/%%lu}{]:} ", wid_nhidx); + + xo_emit(buffer, nhge->nhg_idx); + + /* nhidx */ + print_padding('-', wid_nhidx); + /* weight */ + print_padding('-', wid_nhidx); + /* slots */ + print_padding('-', wid_nhidx); + print_padding('-', wid_gw); + print_padding('-', wid_if); + xo_emit("{t:nhg-refcnt/%*lu}", wid_refcnt, nhge->nhg_refcount); + xo_emit("\n"); + + xo_open_list("nhop-weights"); + for (uint32_t i = 0; i < nhg_cp->nhgc_count; i++) { + /* TODO: optimize slots calculations */ + uint32_t slots = 0; + for (uint32_t sidx = 0; sidx < nhg_dp->nhgc_count; sidx++) { + if (ext_dp[sidx].nh_idx == ext_cp[i].nh_idx) + slots++; + } + xo_open_instance("nhop-weight"); + print_padding(' ', wid_nhidx); + // nh index + xo_emit("{t:nh-index/%*lu}", wid_nhidx, ext_cp[i].nh_idx); + xo_emit("{t:nh-weight/%*lu}", wid_nhidx, ext_cp[i].nh_weight); + xo_emit("{t:nh-slots/%*lu}", wid_nhidx, slots); + ne = nhop_get(&global_nhop_map, ext_cp[i].nh_idx); + if (ne != NULL) { + xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw); + xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname); + } + xo_emit("\n"); + xo_close_instance("nhop-weight"); + } + xo_close_list("nhop-weights"); + xo_close_instance(name); +} + 
+/*
+ * qsort(3) comparator: orders struct nhops_map entries by ascending idx.
+ */
+static int
+cmp_nhg_idx(const void *_a, const void *_b)
+{
+	const struct nhops_map *a, *b;
+
+	a = _a;
+	b = _b;
+
+	if (a->idx > b->idx)
+		return (1);
+	else if (a->idx < b->idx)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Fetch the nexthop groups of (fibnum, af) through the NET_RT_NHGRP
+ * sysctl and hand them back sorted by group index.
+ *
+ * On return nd->nh_buf is the malloc'ed raw sysctl buffer, nd->nh_map a
+ * heap-allocated array whose first nd->nh_count entries hold the group
+ * index and a pointer to the matching message inside nh_buf.  Messages
+ * whose rtm_version differs from RTM_VERSION are skipped.
+ *
+ * The program exits if a sysctl call or a memory allocation fails.
+ */
+static void
+dump_nhgrp_sysctl(int fibnum, int af, struct nhops_dump *nd)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhgrp_external *nhg;
+	struct nhops_map *nhg_map;
+	size_t nhg_count, nhg_size;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHGRP;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	/* First call only asks the kernel how much space is needed. */
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+		    af, fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+	lim = buf + needed;
+
+	/*
+	 * Nexthop groups are received unsorted.  Collect everything first,
+	 * and sort prior to displaying.
+	 */
+	nhg_count = 0;
+	nhg_size = 16;
+	nhg_map = calloc(nhg_size, sizeof(struct nhops_map));
+	if (nhg_map == NULL)
+		errx(2, "calloc(%lu)",
+		    (unsigned long)(nhg_size * sizeof(struct nhops_map)));
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		/* Grow the index array geometrically when it is full. */
+		if (nhg_count >= nhg_size) {
+			nhg_size *= 2;
+			nhg_map = realloc(nhg_map, nhg_size * sizeof(struct nhops_map));
+			if (nhg_map == NULL)
+				errx(2, "realloc(%lu)", (unsigned long)
+				    (nhg_size * sizeof(struct nhops_map)));
+		}
+
+		nhg = (struct nhgrp_external *)(rtm + 1);
+		nhg_map[nhg_count].idx = nhg->nhg_idx;
+		nhg_map[nhg_count].rtm = rtm;
+		nhg_count++;
+	}
+
+	if (nhg_count > 0)
+		qsort(nhg_map, nhg_count, sizeof(struct nhops_map), cmp_nhg_idx);
+	nd->nh_buf = buf;
+	nd->nh_count = nhg_count;
+	nd->nh_map = nhg_map;
+}
+
+/*
+ * Print the nexthop group table of (fibnum, af).
+ *
+ * The "nhgrp-table" container and the "rt-family" list are always
+ * emitted; the family instance, the header and the group rows only when
+ * at least one group exists.  Every libxo construct opened here is
+ * closed again in reverse order, and both buffers returned by
+ * dump_nhgrp_sysctl() are released.
+ */
+static void
+print_nhgrp_sysctl(int fibnum, int af)
+{
+	struct nhops_dump nd;
+	struct nhgrp_external *nhg;
+	struct rt_msghdr *rtm;
+
+	dump_nhgrp_sysctl(fibnum, af, &nd);
+
+	xo_open_container("nhgrp-table");
+	xo_open_list("rt-family");
+	if (nd.nh_count > 0) {
+		wid_gw = WID_GW_DEFAULT(af);
+		xo_open_instance("rt-family");
+		pr_family(af);
+		xo_open_list("nhgrp-entry");
+
+		print_nhgroup_header(af);
+
+		for (size_t i = 0; i < nd.nh_count; i++) {
+			rtm = nd.nh_map[i].rtm;
+			nhg = (struct nhgrp_external *)(rtm + 1);
+			print_nhgroup_entry_sysctl("nhgrp-entry", rtm, nhg);
+		}
+		/* Balance the list and instance opened above. */
+		xo_close_list("nhgrp-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhgrp-table");
+	/* nh_map only points into nh_buf, so the order does not matter. */
+	free(nd.nh_map);
+	free(nd.nh_buf);
+}
+
+/*
+ * Record the printable gateway and interface name of nexthop nh in
+ * global_nhop_map, keyed by nh->nh_idx.
+ *
+ * The address block is read nh->nh_len bytes after nh and the gateway
+ * sockaddr gw_sa_off bytes into that block.  The interface name comes
+ * from ifmap ("---" when the ifmap slot is empty, empty when ifindex is
+ * out of range).  Nexthops without NHF_GATEWAY get "<ifname>/resolve"
+ * as their gateway string.
+ */
+static void
+update_global_map(struct nhop_external *nh)
+{
+	char iface_name[128];
+	char gw_addr[64];
+	struct nhop_addrs *na;
+	struct sockaddr *sa_gw;
+
+	na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+	sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
+
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+
+	if (nh->nh_flags & NHF_GATEWAY) {
+		const char *cp;
+		cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+		strlcpy(gw_addr, cp, sizeof(gw_addr));
+	} else
+		snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+}
+
+/*
+ * Fill global_nhop_map with every nexthop of (fibnum, af) so that group
+ * rows can show gateway and interface for their nexthop indexes.
+ *
+ * NOTE(review): only nd.nh_buf is released here.  Whether nd.nh_map is
+ * heap-allocated depends on dump_nhops_sysctl(), which lives in nhops.c;
+ * confirm and free it as well if so.
+ */
+static void
+prepare_nh_map(int fibnum, int af)
+{
+	struct nhops_dump nd;
+	struct nhop_external *nh;
+	struct rt_msghdr *rtm;
+
+	dump_nhops_sysctl(fibnum, af, &nd);
+
+	for (size_t i = 0; i < nd.nh_count; i++) {
+		rtm = nd.nh_map[i].rtm;
+		nh = (struct nhop_external *)(rtm + 1);
+		update_global_map(nh);
+	}
+
+	free(nd.nh_buf);
+}
+
+/*
+ * Entry point: print the nexthop groups of one fib and address family.
+ *
+ * fibnum == -1 selects the value of the net.my_fibnum sysctl, or 0 if
+ * that sysctl cannot be read.  The number of fibs is taken from net.fibs
+ * (1 if unreadable); a fibnum outside [0, numfibs - 1] terminates the
+ * program with EX_USAGE.  The interface and nexthop maps are prepared
+ * before the table is printed inside the "route-nhgrp-information"
+ * container.
+ */
+void
+nhgrp_print(int fibnum, int af)
+{
+	size_t intsize;
+	int numfibs;
+
+	intsize = sizeof(int);
+	if (fibnum == -1 &&
+	    sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1)
+		fibnum = 0;
+	if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+		numfibs = 1;
+	if (fibnum < 0 || fibnum > numfibs - 1)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	ifmap = prepare_ifmap(&ifmap_size);
+	prepare_nh_map(fibnum, af);
+
+	xo_open_container("route-nhgrp-information");
+	xo_emit("{T:Nexthop groups data}");
+	if (fibnum)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhgrp_sysctl(fibnum, af);
+	xo_close_container("route-nhgrp-information");
+}
+
diff --git a/usr.bin/netstat/nhops.c b/usr.bin/netstat/nhops.c index 63addb00a685..44006da7f1fe 100644 --- a/usr.bin/netstat/nhops.c +++ b/usr.bin/netstat/nhops.c @@ -118,8 +118,6 @@ struct nhop_map { }; static struct nhop_map global_nhop_map; -static void nhop_map_update(struct nhop_map *map, uint32_t idx, - char *gw, char *ifname); static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx); @@ -204,7 +202,7 @@ print_nhop_header(int af1 __unused) } } -static void +void nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname) { if (idx >= map->size) { @@ -322,11 +320,6 @@ print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_ext xo_close_instance(name); } -struct nhops_map { - uint32_t idx; - struct rt_msghdr *rtm; -}; - static int
cmp_nh_idx(const void *_a, const void *_b) { @@ -342,15 +335,14 @@ cmp_nh_idx(const void *_a, const void *_b) return (0); } -static void -print_nhops_sysctl(int fibnum, int af) +void +dump_nhops_sysctl(int fibnum, int af, struct nhops_dump *nd) { size_t needed; int mib[7]; char *buf, *next, *lim; struct rt_msghdr *rtm; struct nhop_external *nh; - int fam; struct nhops_map *nh_map; size_t nh_count, nh_size; @@ -369,8 +361,6 @@ print_nhops_sysctl(int fibnum, int af) if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum); lim = buf + needed; - xo_open_container("nhop-table"); - xo_open_list("rt-family"); /* * nexhops are received unsorted. Collect everything first, sort and then display @@ -395,9 +385,27 @@ print_nhops_sysctl(int fibnum, int af) nh_count++; } - if (nh_count > 0) { + if (nh_count > 0) qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx); - nh = (struct nhop_external *)(nh_map[0].rtm + 1); + nd->nh_buf = buf; + nd->nh_count = nh_count; + nd->nh_map = nh_map; +} + +static void +print_nhops_sysctl(int fibnum, int af) +{ + struct nhops_dump nd; + struct nhop_external *nh; + int fam; + struct rt_msghdr *rtm; + + dump_nhops_sysctl(fibnum, af, &nd); + + xo_open_container("nhop-table"); + xo_open_list("rt-family"); + if (nd.nh_count > 0) { + nh = (struct nhop_external *)(nd.nh_map[0].rtm + 1); fam = nh->nh_family; wid_dst = WID_GW_DEFAULT(fam); @@ -415,8 +423,8 @@ print_nhops_sysctl(int fibnum, int af) print_nhop_header(fam); - for (size_t i = 0; i < nh_count; i++) { - rtm = nh_map[i].rtm; + for (size_t i = 0; i < nd.nh_count; i++) { + rtm = nd.nh_map[i].rtm; nh = (struct nhop_external *)(rtm + 1); print_nhop_entry_sysctl("nh-entry", rtm, nh); } @@ -426,7 +434,7 @@ print_nhops_sysctl(int fibnum, int af) } xo_close_list("rt-family"); xo_close_container("nhop-table"); - free(buf); + free(nd.nh_buf); } static void