freebsd-dev/sys/netlink/route/nexthop.c
Alexander V. Chernikov 19e43c163c netlink: add netlink KPI to the kernel by default
This change does the following:

Base Netlink KPIs (ability to register the family, parse and/or
 write a Netlink message) are always present in the kernel. Specifically,
* Implementation of genetlink family/group registration/removal,
  some base accessors (netlink_generic_kpi.c, 260 LoC) are compiled in
  unconditionally.
* Basic TLV parser functions (netlink_message_parser.c, 507 LoC) are
  compiled in unconditionally.
* Glue functions (netlink<>rtsock), malloc/core sysctl definitions
 (netlink_glue.c, 259 LoC) are compiled in unconditionally.
* The rest of the KPI _functions_ are defined in netlink_glue.c, but
 each dispatches through a pointer to either a stub or the actual
 implementation, depending on whether the module is loaded or not (see
 the sketch below).

This approach keeps only ~1k LoC of the ~3.7k LoC current sys/netlink
 implementation in the kernel, and that portion will not grow further.
It also allows generic netlink kernel customers to load successfully
 without requiring the Netlink module and to operate correctly once the
 Netlink module is loaded.

Reviewed by:	imp
MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D39269
2023-03-27 13:55:44 +00:00
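
The indirection described in the last bullet above (KPI entry points that are always compiled in but dispatch through a pointer to either a stub or the loaded module's implementation) follows a pattern roughly like the sketch below. The names are hypothetical and do not match the actual symbols in netlink_glue.c; this is only an illustration of the idea.

/*
 * Illustrative sketch of the stub/real-implementation dispatch pattern.
 * Hypothetical names only; see netlink_glue.c for the actual KPI glue.
 */
#include <sys/errno.h>

struct nlx_demo_ops {
	int	(*send_msg)(void *msg, void *receiver);
};

/* Stub used while the netlink module is not loaded. */
static int
nlx_demo_send_stub(void *msg, void *receiver)
{
	(void)msg;
	(void)receiver;
	return (ENOTSUP);
}

/* Dispatch table compiled into the kernel unconditionally. */
static struct nlx_demo_ops nlx_demo_ops = {
	.send_msg = nlx_demo_send_stub,
};

/* KPI entry point kernel consumers can always call, module loaded or not. */
int
nlx_demo_send(void *msg, void *receiver)
{
	return (nlx_demo_ops.send_msg(msg, receiver));
}

/* Called by the module on load (real ops) and on unload (NULL restores the stubs). */
void
nlx_demo_set_ops(const struct nlx_demo_ops *ops)
{
	static const struct nlx_demo_ops stub_ops = {
		.send_msg = nlx_demo_send_stub,
	};

	nlx_demo_ops = (ops != NULL) ? *ops : stub_ops;
}

With this arrangement the consumer-facing KPI symbol set stays stable whether or not the netlink module is present, which is what lets generic netlink customers load before the module does.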


/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_netlink.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"
#include <sys/types.h>
#include <sys/ck.h>
#include <sys/epoch.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/nhop_utils.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <netinet6/scope6_var.h>
#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_route.h>
#include <netlink/route/route_var.h>
#define DEBUG_MOD_NAME nl_nhop
#define DEBUG_MAX_LEVEL LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_DEBUG);
/*
* This file contains the logic to maintain kernel nexthops and
* nexthop groups based on the data provided by the user.
*
* Kernel stores (nearly) all of the routing data in the nexthops,
* including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
*
* The Netlink API provides a higher-level abstraction for the user. Each
* user-created nexthop may map to multiple kernel nexthops.
*
* The following variations require a separate kernel nexthop to be
* created:
* * prefix flags (NHF_HOST, NHF_DEFAULT)
* * using IPv6 gateway for IPv4 routes
* * different fibnum
*
* These kernel nexthops have their lifetime bound to the lifetime of
* the user_nhop object. They are not collected until the user requests
* deletion of the created user_nhop.
*
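* For example, a userland nexthop with index 10 that ends up being used by
* an IPv4 route in fib 0 and by an IPv6 route in fib 1 results in two cloned
* kernel nexthops, each wrapped in its own child user_nhop and chained to
* the same base (un_fibfam == 0) user_nhop via un_nextchild.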
*/
struct user_nhop {
uint32_t un_idx; /* Userland-provided index */
uint32_t un_fibfam; /* fibnum+af(as highest byte) */
uint8_t un_protocol; /* protocol that installed the record */
struct nhop_object *un_nhop; /* "production" nexthop */
struct nhop_object *un_nhop_src; /* nexthop to copy from */
struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
uint32_t un_nhgrp_count; /* number of nexthops */
struct user_nhop *un_next; /* next item in hash chain */
struct user_nhop *un_nextchild; /* master -> children */
struct epoch_context un_epoch_ctx; /* epoch ctl helper */
};
/* produce hash value for an object */
#define unhop_hash_obj(_obj) (hash_unhop(_obj))
/* compare two objects */
#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
/* next object accessor */
#define unhop_next(_obj) (_obj)->un_next
CHT_SLIST_DEFINE(unhop, struct user_nhop);
struct unhop_ctl {
struct unhop_head un_head;
struct rmlock un_lock;
};
#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl")
#define UN_TRACKER struct rm_priotracker un_tracker
#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker)
#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker)
#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock);
#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock);
VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
#define V_un_ctl VNET(un_ctl)
static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
static unsigned int hash_unhop(const struct user_nhop *obj);
static void destroy_unhop(struct user_nhop *unhop);
static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
uint32_t fibnum, int family, int nh_flags);
static int
cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
{
return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
}
/*
* Hash callback: calculate hash of an object
*/
static unsigned int
hash_unhop(const struct user_nhop *obj)
{
return (obj->un_idx ^ obj->un_fibfam);
}
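/*
* A "master" user_nhop (un_fibfam == 0) holds the userland-created template
* nexthop; per-fib/family clones are chained to it via un_nextchild.
*/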
#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
/*
* Factory interface for creating matching kernel nexthops/nexthop groups
*
* @uidx: userland nexthop index used to create the nexthop
* @fibnum: fibnum nexthop will be used in
* @family: upper family nexthop will be used in
* @nh_flags: desired nexthop prefix flags
* @perror: pointer to store error to
*
* Returns referenced nexthop linked to @fibnum/@family rib on success.
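*
* Illustrative caller usage (hypothetical variables, not taken from this file):
*	int error = 0;
*	struct nhop_object *nh;
*
*	nh = nl_find_nhop(fibnum, AF_INET, uidx, 0, &error);
*	if (nh == NULL)
*		return (error != 0 ? error : ESRCH);
*	... use nh, then release the reference with nhop_free_any(nh) ...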
*/
struct nhop_object *
nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
int nh_flags, int *perror)
{
struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
UN_TRACKER;
if (__predict_false(ctl == NULL))
return (NULL);
struct user_nhop key= {
.un_idx = uidx,
.un_fibfam = fibnum | ((uint32_t)family) << 24,
};
struct user_nhop *unhop;
nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
if (__predict_false(family == 0))
return (NULL);
UN_RLOCK(ctl);
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
if (unhop != NULL) {
struct nhop_object *nh = unhop->un_nhop;
UN_RUNLOCK(ctl);
*perror = 0;
nhop_ref_any(nh);
return (nh);
}
/*
* Exact nexthop not found. Search for template nexthop to clone from.
*/
key.un_fibfam = 0;
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
if (unhop == NULL) {
UN_RUNLOCK(ctl);
*perror = ESRCH;
return (NULL);
}
UN_RUNLOCK(ctl);
/* Create entry to insert first */
struct user_nhop *un_new, *un_tmp;
un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
if (un_new == NULL) {
*perror = ENOMEM;
return (NULL);
}
un_new->un_idx = uidx;
un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
/* Relying on epoch to protect unhop here */
un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
if (un_new->un_nhop == NULL) {
free(un_new, M_NETLINK);
*perror = ENOMEM;
return (NULL);
}
/* Insert back and report */
UN_WLOCK(ctl);
/* First, find template record once again */
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
if (unhop == NULL) {
/* Someone deleted the nexthop during the call */
UN_WUNLOCK(ctl);
*perror = ESRCH;
destroy_unhop(un_new);
return (NULL);
}
/* Second, check the direct match */
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
struct nhop_object *nh;
if (un_tmp != NULL) {
/* Another thread already created the desired nexthop, use it */
nh = un_tmp->un_nhop;
} else {
/* Finally, insert the new nexthop and link it to the primary */
nh = un_new->un_nhop;
CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
un_new->un_nextchild = unhop->un_nextchild;
unhop->un_nextchild = un_new;
un_new = NULL;
NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
}
UN_WUNLOCK(ctl);
if (un_new != NULL)
destroy_unhop(un_new);
*perror = 0;
nhop_ref_any(nh);
return (nh);
}
static struct user_nhop *
nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
{
struct user_nhop key= { .un_idx = uidx };
struct user_nhop *unhop = NULL;
UN_TRACKER;
UN_RLOCK(ctl);
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
UN_RUNLOCK(ctl);
return (unhop);
}
#define MAX_STACK_NHOPS 4
static struct nhop_object *
clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
{
#ifdef ROUTE_MPATH
const struct weightened_nhop *wn;
struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
uint32_t num_nhops;
#endif
struct nhop_object *nh = NULL;
int error;
if (unhop->un_nhop_src != NULL) {
IF_DEBUG_LEVEL(LOG_DEBUG2) {
char nhbuf[NHOP_PRINT_BUFSIZE];
nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
"cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
family, nh_flags);
}
struct nhop_object *nh;
nh = nhop_alloc(fibnum, AF_UNSPEC);
if (nh == NULL)
return (NULL);
nhop_copy(nh, unhop->un_nhop_src);
/* Check that nexthop gateway is compatible with the new family */
if (!nhop_set_upper_family(nh, family)) {
nhop_free(nh);
return (NULL);
}
nhop_set_uidx(nh, unhop->un_idx);
nhop_set_pxtype_flag(nh, nh_flags);
return (nhop_get_nhop(nh, &error));
}
#ifdef ROUTE_MPATH
wn = unhop->un_nhgrp_src;
num_nhops = unhop->un_nhgrp_count;
if (num_nhops > MAX_STACK_NHOPS) {
wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
if (wn_new == NULL)
return (NULL);
} else
wn_new = wn_base;
for (int i = 0; i < num_nhops; i++) {
uint32_t uidx = nhop_get_uidx(wn[i].nh);
MPASS(uidx != 0);
wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
if (error != 0)
break;
wn_new[i].weight = wn[i].weight;
}
if (error == 0) {
struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
struct nhgrp_object *nhg;
error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
nh = (struct nhop_object *)nhg;
}
if (wn_new != wn_base)
free(wn_new, M_TEMP);
#endif
return (nh);
}
static void
destroy_unhop(struct user_nhop *unhop)
{
if (unhop->un_nhop != NULL)
nhop_free_any(unhop->un_nhop);
if (unhop->un_nhop_src != NULL)
nhop_free_any(unhop->un_nhop_src);
free(unhop, M_NETLINK);
}
static void
destroy_unhop_epoch(epoch_context_t ctx)
{
struct user_nhop *unhop;
unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
destroy_unhop(unhop);
}
static uint32_t
find_spare_uidx(struct unhop_ctl *ctl)
{
struct user_nhop *unhop, key = {};
uint32_t uidx = 0;
UN_TRACKER;
UN_RLOCK(ctl);
/* With 75% of the 65k-entry range in use, 16 random tries find a spare uidx in ~99/100 cases */
for (int i = 0; i < 16; i++) {
key.un_idx = (arc4random() % 65536) + 65536 * 4;
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
if (unhop == NULL) {
uidx = key.un_idx;
break;
}
}
UN_RUNLOCK(ctl);
return (uidx);
}
/*
* Actual netlink code
*/
struct netlink_walkargs {
struct nl_writer *nw;
struct nlmsghdr hdr;
struct nlpcb *so;
int family;
int error;
int count;
int dumped;
};
#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
static bool
dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
struct nl_writer *nw)
{
if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
goto enomem;
struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
nhm->nh_family = AF_UNSPEC;
nhm->nh_scope = 0;
nhm->nh_protocol = unhop->un_protocol;
nhm->nh_flags = 0;
nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
struct weightened_nhop *wn = unhop->un_nhgrp_src;
uint32_t num_nhops = unhop->un_nhgrp_count;
/* TODO: a better API? */
int nla_len = sizeof(struct nlattr);
nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
if (nla == NULL)
goto enomem;
nla->nla_type = NHA_GROUP;
nla->nla_len = nla_len;
for (int i = 0; i < num_nhops; i++) {
struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
grp->id = nhop_get_uidx(wn[i].nh);
grp->weight = wn[i].weight;
grp->resvd1 = 0;
grp->resvd2 = 0;
}
if (nlmsg_end(nw))
return (true);
enomem:
NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
nlmsg_abort(nw);
return (false);
}
static bool
dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
struct nl_writer *nw)
{
struct nhop_object *nh = unhop->un_nhop_src;
if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
goto enomem;
struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
ENOMEM_IF_NULL(nhm);
nhm->nh_family = nhop_get_neigh_family(nh);
nhm->nh_scope = 0; // XXX: what's that?
nhm->nh_protocol = unhop->un_protocol;
nhm->nh_flags = 0;
nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
if (nh->nh_flags & NHF_BLACKHOLE) {
nlattr_add_flag(nw, NHA_BLACKHOLE);
goto done;
}
nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index);
switch (nh->gw_sa.sa_family) {
#ifdef INET
case AF_INET:
nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
break;
#endif
#ifdef INET6
case AF_INET6:
{
struct in6_addr addr = nh->gw6_sa.sin6_addr;
in6_clearscope(&addr);
nlattr_add(nw, NHA_GATEWAY, 16, &addr);
break;
}
#endif
}
done:
if (nlmsg_end(nw))
return (true);
enomem:
nlmsg_abort(nw);
return (false);
}
static void
dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
struct nl_writer *nw)
{
if (unhop->un_nhop_src != NULL)
dump_nhop(unhop, hdr, nw);
else
dump_nhgrp(unhop, hdr, nw);
}
static int
delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
{
struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
struct user_nhop key = { .un_idx = uidx };
UN_WLOCK(ctl);
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
if (unhop_base != NULL) {
CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
IF_DEBUG_LEVEL(LOG_DEBUG2) {
char nhbuf[NHOP_PRINT_BUFSIZE];
nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
"removed base nhop %u: %s", uidx, nhbuf);
}
/* Unlink all child nexthops as well, keeping the chain intact */
unhop_chain = unhop_base->un_nextchild;
while (unhop_chain != NULL) {
CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
unhop_ret);
MPASS(unhop_chain == unhop_ret);
IF_DEBUG_LEVEL(LOG_DEBUG3) {
char nhbuf[NHOP_PRINT_BUFSIZE];
nhop_print_buf_any(unhop_chain->un_nhop,
nhbuf, sizeof(nhbuf));
FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
"removed child nhop %u: %s", uidx, nhbuf);
}
unhop_chain = unhop_chain->un_nextchild;
}
}
UN_WUNLOCK(ctl);
if (unhop_base == NULL) {
NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
return (ENOENT);
}
/* Report nexthop deletion */
struct netlink_walkargs wa = {
.hdr.nlmsg_pid = hdr->nlmsg_pid,
.hdr.nlmsg_seq = hdr->nlmsg_seq,
.hdr.nlmsg_flags = hdr->nlmsg_flags,
.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
};
struct nl_writer nw = {};
if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
NL_LOG(LOG_DEBUG, "error allocating message writer");
return (ENOMEM);
}
dump_unhop(unhop_base, &wa.hdr, &nw);
nlmsg_flush(&nw);
while (unhop_base != NULL) {
unhop_chain = unhop_base->un_nextchild;
NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
unhop_base = unhop_chain;
}
return (0);
}
static void
consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
{
void *new_ptr = NULL;
size_t alloc_size;
if (new_size == 0)
return;
if (new_size != 0) {
alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
if (new_ptr == NULL)
return;
}
NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
UN_WLOCK(ctl);
if (new_ptr != NULL) {
CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
}
UN_WUNLOCK(ctl);
if (new_ptr != NULL)
free(new_ptr, M_NETLINK);
}
static bool __noinline
vnet_init_unhops(void)
{
uint32_t num_buckets = 16;
size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
M_NOWAIT | M_ZERO);
if (ctl == NULL)
return (false);
void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
if (ptr == NULL) {
free(ctl, M_NETLINK);
return (false);
}
CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
UN_LOCK_INIT(ctl);
if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
free(ptr, M_NETLINK);
free(ctl, M_NETLINK);
}
if (atomic_load_ptr(&V_un_ctl) == NULL)
return (false);
NL_LOG(LOG_NOTICE, "UNHOPS init done");
return (true);
}
static void
vnet_destroy_unhops(const void *unused __unused)
{
struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
struct user_nhop *unhop, *tmp;
if (ctl == NULL)
return;
V_un_ctl = NULL;
/* Wait till all unhop users finish their reads */
NET_EPOCH_WAIT();
UN_WLOCK(ctl);
CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
destroy_unhop(unhop);
} CHT_SLIST_FOREACH_SAFE_END;
UN_WUNLOCK(ctl);
free(ctl->un_head.ptr, M_NETLINK);
free(ctl, M_NETLINK);
}
VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
vnet_destroy_unhops, NULL);
static int
nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
{
int error = 0;
/* Verify attribute correctness */
struct nexthop_grp *grp = NLA_DATA(nla);
int data_len = NLA_DATA_LEN(nla);
int count = data_len / sizeof(*grp);
if (count == 0 || (count * sizeof(*grp) != data_len)) {
NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
return (EINVAL);
}
*((struct nlattr **)target) = nla;
return (error);
}
struct nl_parsed_nhop {
uint32_t nha_id;
uint8_t nha_blackhole;
uint8_t nha_groups;
struct ifnet *nha_oif;
struct sockaddr *nha_gw;
struct nlattr *nha_group;
uint8_t nh_family;
uint8_t nh_protocol;
};
#define _IN(_field) offsetof(struct nhmsg, _field)
#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field)
static const struct nlfield_parser nlf_p_nh[] = {
{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};
static const struct nlattr_parser nla_p_nh[] = {
{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
};
#undef _IN
#undef _OUT
NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh);
static bool
eligible_nhg(const struct nhop_object *nh)
{
return (nh->nh_flags & NHF_GATEWAY);
}
static int
newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
{
struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
struct weightened_nhop *wn;
wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
if (wn == NULL)
return (ENOMEM);
for (int i = 0; i < count; i++) {
struct user_nhop *unhop;
unhop = nl_find_base_unhop(ctl, grp[i].id);
if (unhop == NULL) {
NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
free(wn, M_NETLINK);
return (ESRCH);
} else if (unhop->un_nhop_src == NULL) {
NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
grp[i].id);
free(wn, M_NETLINK);
return (ENOTSUP);
} else if (!eligible_nhg(unhop->un_nhop_src)) {
NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
grp[i].id);
free(wn, M_NETLINK);
return (ENOTSUP);
}
/*
* TODO: consider more rigid eligibility checks:
* restrict nexthops with the same gateway
*/
wn[i].nh = unhop->un_nhop_src;
wn[i].weight = grp[i].weight;
}
unhop->un_nhgrp_src = wn;
unhop->un_nhgrp_count = count;
return (0);
}
/*
* Sets the gateway of nexthop @nh to @gw.
* If the gateway is an IPv6 link-local address, alters @gw to embed a
* scopeid equal to the @ifp ifindex.
* Returns 0 on success or errno.
*/
int
nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp,
struct nl_pstate *npt)
{
#ifdef INET6
if (gw->sa_family == AF_INET6) {
struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
if (ifp == NULL) {
NLMSG_REPORT_ERR_MSG(npt, "interface not set");
return (EINVAL);
}
in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index);
}
}
#endif
nhop_set_gw(nh, gw, true);
return (0);
}
static int
newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
{
struct ifaddr *ifa = NULL;
struct nhop_object *nh;
int error;
if (!attrs->nha_blackhole) {
if (attrs->nha_gw == NULL) {
NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
return (EINVAL);
}
if (attrs->nha_oif == NULL) {
NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
return (EINVAL);
}
if (ifa == NULL)
ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
if (ifa == NULL) {
NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
return (EINVAL);
}
}
int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
nh = nhop_alloc(RT_DEFAULT_FIB, family);
if (nh == NULL) {
NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
return (ENOMEM);
}
nhop_set_uidx(nh, attrs->nha_id);
if (attrs->nha_blackhole)
nhop_set_blackhole(nh, NHF_BLACKHOLE);
else {
error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
if (error != 0) {
nhop_free(nh);
return (error);
}
nhop_set_transmit_ifp(nh, attrs->nha_oif);
nhop_set_src(nh, ifa);
}
error = nhop_get_unlinked(nh);
if (error != 0) {
NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
return (error);
}
IF_DEBUG_LEVEL(LOG_DEBUG2) {
char nhbuf[NHOP_PRINT_BUFSIZE];
nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
}
unhop->un_nhop_src = nh;
return (0);
}
static int
rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
struct nl_pstate *npt)
{
struct user_nhop *unhop;
int error;
if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
return (ENOMEM);
struct unhop_ctl *ctl = V_un_ctl;
struct nl_parsed_nhop attrs = {};
error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
if (error != 0)
return (error);
/*
* Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
* citizen.
*/
if (attrs.nha_id == 0) {
attrs.nha_id = find_spare_uidx(ctl);
if (attrs.nha_id == 0) {
NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
return (ENOSPC);
}
}
NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);
unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
if (unhop == NULL) {
NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
return (ENOMEM);
}
unhop->un_idx = attrs.nha_id;
unhop->un_protocol = attrs.nh_protocol;
if (attrs.nha_group)
error = newnhg(ctl, &attrs, unhop);
else
error = newnhop(&attrs, unhop, npt);
if (error != 0) {
free(unhop, M_NETLINK);
return (error);
}
UN_WLOCK(ctl);
/* Check if uidx already exists */
struct user_nhop *tmp = NULL;
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
if (tmp != NULL) {
UN_WUNLOCK(ctl);
NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
destroy_unhop(unhop);
return (EEXIST);
}
CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
UN_WUNLOCK(ctl);
/* Report addition of the new nexthop */
struct netlink_walkargs wa = {
.hdr.nlmsg_pid = hdr->nlmsg_pid,
.hdr.nlmsg_seq = hdr->nlmsg_seq,
.hdr.nlmsg_flags = hdr->nlmsg_flags,
.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
};
struct nl_writer nw = {};
if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
NL_LOG(LOG_DEBUG, "error allocating message writer");
return (ENOMEM);
}
dump_unhop(unhop, &wa.hdr, &nw);
nlmsg_flush(&nw);
consider_resize(ctl, num_buckets_new);
return (0);
}
static int
rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
struct nl_pstate *npt)
{
struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
int error;
if (__predict_false(ctl == NULL))
return (ESRCH);
struct nl_parsed_nhop attrs = {};
error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
if (error != 0)
return (error);
if (attrs.nha_id == 0) {
NL_LOG(LOG_DEBUG, "NHA_ID not set");
return (EINVAL);
}
error = delete_unhop(ctl, hdr, attrs.nha_id);
return (error);
}
static bool
match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
{
if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
return (false);
if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
return (false);
if (attrs->nha_oif != NULL &&
(unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
return (false);
return (true);
}
static int
rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
struct nl_pstate *npt)
{
struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
struct user_nhop *unhop;
UN_TRACKER;
int error;
if (__predict_false(ctl == NULL))
return (ESRCH);
struct nl_parsed_nhop attrs = {};
error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
if (error != 0)
return (error);
struct netlink_walkargs wa = {
.nw = npt->nw,
.hdr.nlmsg_pid = hdr->nlmsg_pid,
.hdr.nlmsg_seq = hdr->nlmsg_seq,
.hdr.nlmsg_flags = hdr->nlmsg_flags,
.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
};
if (attrs.nha_id != 0) {
NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
struct user_nhop key= { .un_idx = attrs.nha_id };
UN_RLOCK(ctl);
CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
UN_RUNLOCK(ctl);
if (unhop == NULL)
return (ESRCH);
dump_unhop(unhop, &wa.hdr, wa.nw);
return (0);
}
UN_RLOCK(ctl);
wa.hdr.nlmsg_flags |= NLM_F_MULTI;
CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
dump_unhop(unhop, &wa.hdr, wa.nw);
} CHT_SLIST_FOREACH_END;
UN_RUNLOCK(ctl);
if (wa.error == 0) {
if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
return (ENOMEM);
}
return (0);
}
static const struct rtnl_cmd_handler cmd_handlers[] = {
{
.cmd = NL_RTM_NEWNEXTHOP,
.name = "RTM_NEWNEXTHOP",
.cb = &rtnl_handle_newnhop,
.priv = PRIV_NET_ROUTE,
},
{
.cmd = NL_RTM_DELNEXTHOP,
.name = "RTM_DELNEXTHOP",
.cb = &rtnl_handle_delnhop,
.priv = PRIV_NET_ROUTE,
},
{
.cmd = NL_RTM_GETNEXTHOP,
.name = "RTM_GETNEXTHOP",
.cb = &rtnl_handle_getnhop,
}
};
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser };
void
rtnl_nexthops_init(void)
{
NL_VERIFY_PARSERS(all_parsers);
rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
}