freebsd-skq/sys/netinet6/in6_src.c

1221 lines
34 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
2007-12-10 16:03:40 +00:00
*
* $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
2007-12-10 16:03:40 +00:00
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/time.h>
MFp4: Bring in updated jail support from bz_jail branch. This enhances the current jail implementation to permit multiple addresses per jail. In addtion to IPv4, IPv6 is supported as well. Due to updated checks it is even possible to have jails without an IP address at all, which basically gives one a chroot with restricted process view, no networking,.. SCTP support was updated and supports IPv6 in jails as well. Cpuset support permits jails to be bound to specific processor sets after creation. Jails can have an unrestricted (no duplicate protection, etc.) name in addition to the hostname. The jail name cannot be changed from within a jail and is considered to be used for management purposes or as audit-token in the future. DDB 'show jails' command was added to aid debugging. Proper compat support permits 32bit jail binaries to be used on 64bit systems to manage jails. Also backward compatibility was preserved where possible: for jail v1 syscalls, as well as with user space management utilities. Both jail as well as prison version were updated for the new features. A gap was intentionally left as the intermediate versions had been used by various patches floating around the last years. Bump __FreeBSD_version for the afore mentioned and in kernel changes. Special thanks to: - Pawel Jakub Dawidek (pjd) for his multi-IPv4 patches and Olivier Houchard (cognet) for initial single-IPv6 patches. - Jeff Roberson (jeff) and Randall Stewart (rrs) for their help, ideas and review on cpuset and SCTP support. - Robert Watson (rwatson) for lots and lots of help, discussions, suggestions and review of most of the patch at various stages. - John Baldwin (jhb) for his help. - Simon L. Nielsen (simon) as early adopter testing changes on cluster machines as well as all the testers and people who provided feedback the last months on freebsd-jail and other channels. - My employer, CK Software GmbH, for the support so I could work on this. Reviewed by: (see above) MFC after: 3 months (this is just so that I get the mail) X-MFC Before: 7.2-RELEASE if possible
2008-11-29 14:32:14 +00:00
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/rmlock.h>
#include <sys/sx.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/if_llatbl.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
static struct mtx addrsel_lock;
#define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF)
#define ADDRSEL_LOCK() mtx_lock(&addrsel_lock)
#define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock)
#define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED)
static struct sx addrsel_sxlock;
#define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock")
#define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock)
#define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock)
#define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock)
#define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock)
#define ADDR_LABEL_NOTAPP (-1)
VNET_DEFINE_STATIC(struct in6_addrpolicy, defaultaddrpolicy);
#define V_defaultaddrpolicy VNET(defaultaddrpolicy)
VNET_DEFINE(int, ip6_prefer_tempaddr) = 0;
static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
struct nhop_object **, int, u_int, uint32_t);
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct ifnet **,
struct ifnet *, u_int);
static int in6_selectsrc(uint32_t, struct sockaddr_in6 *,
struct ip6_pktopts *, struct inpcb *, struct ucred *,
struct ifnet **, struct in6_addr *);
2008-01-08 19:08:58 +00:00
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
2008-01-08 19:08:58 +00:00
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
2012-11-13 01:48:00 +00:00
void *);
2008-01-08 19:08:58 +00:00
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function lookups the routing table and returns
* an entry to the caller for later use.
*/
#define REPLACE(r) do {\
IP6STAT_INC(ip6s_sources_rule[(r)]); \
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto replace; \
} while(0)
#define NEXT(r) do {\
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto next; /* XXX: we can't use 'continue' here */ \
} while(0)
#define BREAK(r) do { \
IP6STAT_INC(ip6s_sources_rule[(r)]); \
goto out; /* XXX: we can't use 'break' here */ \
} while(0)
static int
in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred,
struct ifnet **ifpp, struct in6_addr *srcp)
{
struct rm_priotracker in6_ifa_tracker;
struct in6_addr dst, tmp;
struct ifnet *ifp = NULL, *oifp = NULL;
struct in6_ifaddr *ia = NULL, *ia_best = NULL;
struct in6_pktinfo *pi = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
u_int32_t odstzone;
int prefer_tempaddr;
int error;
struct ip6_moptions *mopts;
NET_EPOCH_ASSERT();
KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__));
dst = dstsock->sin6_addr; /* make a copy for local operation */
if (ifpp) {
/*
* Save a possibly passed in ifp for in6_selectsrc. Only
* neighbor discovery code should use this feature, where
* we may know the interface but not the FIB number holding
* the connected subnet in case someone deleted it from the
* default FIB and we need to check the interface.
*/
if (*ifpp != NULL)
oifp = *ifpp;
*ifpp = NULL;
}
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
mopts = inp->in6p_moptions;
} else {
mopts = NULL;
}
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
/* get the outgoing interface */
if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
fibnum))
!= 0)
return (error);
2004-02-04 12:55:45 +00:00
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
tmp = pi->ipi6_addr;
if (ifp) {
error = in6_setscope(&tmp, ifp, &odstzone);
if (error)
return (error);
}
if (cred != NULL && (error = prison_local_ip6(cred,
&tmp, (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0)
return (error);
/*
* If IPV6_BINDANY socket option is set, we allow to specify
* non local addresses as source address in IPV6_PKTINFO
* ancillary data.
*/
if ((inp->inp_flags & INP_BINDANY) == 0) {
ia = in6ifa_ifwithaddr(&tmp, 0 /* XXX */, false);
if (ia == NULL || (ia->ia6_flags & (IN6_IFF_ANYCAST |
IN6_IFF_NOTREADY)))
return (EADDRNOTAVAIL);
bcopy(&ia->ia_addr.sin6_addr, srcp, sizeof(*srcp));
} else
bcopy(&tmp, srcp, sizeof(*srcp));
pi->ipi6_addr = tmp; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
return (0);
}
/*
* Otherwise, if the socket has already bound the source, just use it.
*/
if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (cred != NULL &&
(error = prison_local_ip6(cred, &inp->in6p_laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
return (error);
bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp));
return (0);
}
/*
* Bypass source address selection and use the primary jail IP
* if requested.
*/
if (cred != NULL && !prison_saddrsel_ip6(cred, srcp))
return (0);
/*
* If the address is not specified, choose the best one based on
* the outgoing interface and the destination address.
*/
/* get the outgoing interface */
if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
Constrain IPv6 routes to single FIBs when net.add_addr_allfibs=0 sys/netinet6/icmp6.c Use the interface's FIB for source address selection in ICMPv6 error responses. sys/netinet6/in6.c In in6_newaddrmsg, announce arrival of local addresses on the interface's FIB only. In in6_lltable_rtcheck, use a per-fib ND6 cache instead of a single cache. sys/netinet6/in6_src.c In in6_selectsrc, use the caller's fib instead of the default fib. In in6_selectsrc_socket, remove a superfluous check. sys/netinet6/nd6.c In nd6_lle_event, use the interface's fib for routing socket messages. In nd6_is_new_addr_neighbor, check all FIBs when trying to determine whether an address is a neighbor. Also, simplify the code for point to point interfaces. sys/netinet6/nd6.h sys/netinet6/nd6.c sys/netinet6/nd6_rtr.c Make defrouter_select fib-aware, and make all of its callers pass in the interface fib. sys/netinet6/nd6_nbr.c When inputting a Neighbor Solicitation packet, consider the interface fib instead of the default fib for DAD. Output NS and Neighbor Advertisement packets on the correct fib. sys/netinet6/nd6_rtr.c Allow installing the same host route on different interfaces in different FIBs. If rt_add_addr_allfibs=0, only install or delete the prefix route on the interface fib. tests/sys/netinet/fibs_test.sh Clear some expected failures, but add a skip for the newly revealed BUG217871. PR: 196361 Submitted by: Erick Turnquist <jhujhiti@adjectivism.org> Reported by: Jason Healy <jhealy@logn.net> Reviewed by: asomers MFC after: 3 weeks Sponsored by: Spectra Logic Corp Differential Revision: https://reviews.freebsd.org/D9451
2017-03-17 16:50:37 +00:00
(inp != NULL) ? inp->inp_inc.inc_fibnum : fibnum)) != 0)
return (error);
#ifdef DIAGNOSTIC
if (ifp == NULL) /* this should not happen */
panic("in6_selectsrc: NULL ifp");
#endif
error = in6_setscope(&dst, ifp, &odstzone);
if (error)
return (error);
IN6_IFADDR_RLOCK(&in6_ifa_tracker);
ifnet: Replace if_addr_lock rwlock with epoch + mutex Run on LLNW canaries and tested by pho@ gallatin: Using a 14-core, 28-HTT single socket E5-2697 v3 with a 40GbE MLX5 based ConnectX 4-LX NIC, I see an almost 12% improvement in received packet rate, and a larger improvement in bytes delivered all the way to userspace. When the host receiving 64 streams of netperf -H $DUT -t UDP_STREAM -- -m 1, I see, using nstat -I mce0 1 before the patch: InMpps OMpps InGbs OGbs err TCP Est %CPU syscalls csw irq GBfree 4.98 0.00 4.42 0.00 4235592 33 83.80 4720653 2149771 1235 247.32 4.73 0.00 4.20 0.00 4025260 33 82.99 4724900 2139833 1204 247.32 4.72 0.00 4.20 0.00 4035252 33 82.14 4719162 2132023 1264 247.32 4.71 0.00 4.21 0.00 4073206 33 83.68 4744973 2123317 1347 247.32 4.72 0.00 4.21 0.00 4061118 33 80.82 4713615 2188091 1490 247.32 4.72 0.00 4.21 0.00 4051675 33 85.29 4727399 2109011 1205 247.32 4.73 0.00 4.21 0.00 4039056 33 84.65 4724735 2102603 1053 247.32 After the patch InMpps OMpps InGbs OGbs err TCP Est %CPU syscalls csw irq GBfree 5.43 0.00 4.20 0.00 3313143 33 84.96 5434214 1900162 2656 245.51 5.43 0.00 4.20 0.00 3308527 33 85.24 5439695 1809382 2521 245.51 5.42 0.00 4.19 0.00 3316778 33 87.54 5416028 1805835 2256 245.51 5.42 0.00 4.19 0.00 3317673 33 90.44 5426044 1763056 2332 245.51 5.42 0.00 4.19 0.00 3314839 33 88.11 5435732 1792218 2499 245.52 5.44 0.00 4.19 0.00 3293228 33 91.84 5426301 1668597 2121 245.52 Similarly, netperf reports 230Mb/s before the patch, and 270Mb/s after the patch Reviewed by: gallatin Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D15366
2018-05-18 20:13:34 +00:00
CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(&dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags &
(IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) {
continue;
}
if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
/* If jailed only take addresses of the jail into account. */
MFp4: Bring in updated jail support from bz_jail branch. This enhances the current jail implementation to permit multiple addresses per jail. In addtion to IPv4, IPv6 is supported as well. Due to updated checks it is even possible to have jails without an IP address at all, which basically gives one a chroot with restricted process view, no networking,.. SCTP support was updated and supports IPv6 in jails as well. Cpuset support permits jails to be bound to specific processor sets after creation. Jails can have an unrestricted (no duplicate protection, etc.) name in addition to the hostname. The jail name cannot be changed from within a jail and is considered to be used for management purposes or as audit-token in the future. DDB 'show jails' command was added to aid debugging. Proper compat support permits 32bit jail binaries to be used on 64bit systems to manage jails. Also backward compatibility was preserved where possible: for jail v1 syscalls, as well as with user space management utilities. Both jail as well as prison version were updated for the new features. A gap was intentionally left as the intermediate versions had been used by various patches floating around the last years. Bump __FreeBSD_version for the afore mentioned and in kernel changes. Special thanks to: - Pawel Jakub Dawidek (pjd) for his multi-IPv4 patches and Olivier Houchard (cognet) for initial single-IPv6 patches. - Jeff Roberson (jeff) and Randall Stewart (rrs) for their help, ideas and review on cpuset and SCTP support. - Robert Watson (rwatson) for lots and lots of help, discussions, suggestions and review of most of the patch at various stages. - John Baldwin (jhb) for his help. - Simon L. Nielsen (simon) as early adopter testing changes on cluster machines as well as all the testers and people who provided feedback the last months on freebsd-jail and other channels. - My employer, CK Software GmbH, for the support so I could work on this. Reviewed by: (see above) MFC after: 3 months (this is just so that I get the mail) X-MFC Before: 7.2-RELEASE if possible
2008-11-29 14:32:14 +00:00
if (cred != NULL &&
prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0)
MFp4: Bring in updated jail support from bz_jail branch. This enhances the current jail implementation to permit multiple addresses per jail. In addtion to IPv4, IPv6 is supported as well. Due to updated checks it is even possible to have jails without an IP address at all, which basically gives one a chroot with restricted process view, no networking,.. SCTP support was updated and supports IPv6 in jails as well. Cpuset support permits jails to be bound to specific processor sets after creation. Jails can have an unrestricted (no duplicate protection, etc.) name in addition to the hostname. The jail name cannot be changed from within a jail and is considered to be used for management purposes or as audit-token in the future. DDB 'show jails' command was added to aid debugging. Proper compat support permits 32bit jail binaries to be used on 64bit systems to manage jails. Also backward compatibility was preserved where possible: for jail v1 syscalls, as well as with user space management utilities. Both jail as well as prison version were updated for the new features. A gap was intentionally left as the intermediate versions had been used by various patches floating around the last years. Bump __FreeBSD_version for the afore mentioned and in kernel changes. Special thanks to: - Pawel Jakub Dawidek (pjd) for his multi-IPv4 patches and Olivier Houchard (cognet) for initial single-IPv6 patches. - Jeff Roberson (jeff) and Randall Stewart (rrs) for their help, ideas and review on cpuset and SCTP support. - Robert Watson (rwatson) for lots and lots of help, discussions, suggestions and review of most of the patch at various stages. - John Baldwin (jhb) for his help. - Simon L. Nielsen (simon) as early adopter testing changes on cluster machines as well as all the testers and people who provided feedback the last months on freebsd-jail and other channels. - My employer, CK Software GmbH, for the support so I could work on this. Reviewed by: (see above) MFC after: 3 months (this is just so that I get the mail) X-MFC Before: 7.2-RELEASE if possible
2008-11-29 14:32:14 +00:00
continue;
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(0);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(&dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
*/
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
/*
* XXX: This is a TODO. We should probably merge the MIP6
* case above.
*/
/* Rule 5: Prefer outgoing interface */
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) {
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
}
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = V_ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 9: prefer address with better virtual status.
*/
if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa))
REPLACE(9);
if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa))
NEXT(9);
/*
* Rule 10: prefer address with `prefer_source' flag.
*/
if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 &&
(ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0)
REPLACE(10);
if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 &&
(ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0)
NEXT(10);
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr,
&dst));
next:
continue;
out:
break;
}
if ((ia = ia_best) == NULL) {
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
IP6STAT_INC(ip6s_sources_none);
return (EADDRNOTAVAIL);
}
/*
* At this point at least one of the addresses belonged to the jail
* but it could still be, that we want to further restrict it, e.g.
* theoratically IN6_IS_ADDR_LOOPBACK.
* It must not be IN6_IS_ADDR_UNSPECIFIED anymore.
* prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should
* let all others previously selected pass.
* Use tmp to not change ::1 on lo0 to the primary jail address.
*/
tmp = ia->ia_addr.sin6_addr;
if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) {
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
IP6STAT_INC(ip6s_sources_none);
return (EADDRNOTAVAIL);
}
if (ifpp)
*ifpp = ifp;
bcopy(&tmp, srcp, sizeof(*srcp));
if (ia->ia_ifp == ifp)
IP6STAT_INC(ip6s_sources_sameif[best_scope]);
else
IP6STAT_INC(ip6s_sources_otherif[best_scope]);
if (dst_scope == best_scope)
IP6STAT_INC(ip6s_sources_samescope[best_scope]);
else
IP6STAT_INC(ip6s_sources_otherscope[best_scope]);
if (IFA6_IS_DEPRECATED(ia))
IP6STAT_INC(ip6s_sources_deprecated[best_scope]);
IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
return (0);
}
/*
* Select source address based on @inp, @dstsock and @opts.
* Stores selected address to @srcp. If @scope_ambiguous is set,
* embed scope from selected outgoing interface. If @hlim pointer
* is provided, stores calculated hop limit there.
* Returns 0 on success.
*/
int
in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct inpcb *inp, struct ucred *cred, int scope_ambiguous,
struct in6_addr *srcp, int *hlim)
{
struct ifnet *retifp;
uint32_t fibnum;
int error;
Constrain IPv6 routes to single FIBs when net.add_addr_allfibs=0 sys/netinet6/icmp6.c Use the interface's FIB for source address selection in ICMPv6 error responses. sys/netinet6/in6.c In in6_newaddrmsg, announce arrival of local addresses on the interface's FIB only. In in6_lltable_rtcheck, use a per-fib ND6 cache instead of a single cache. sys/netinet6/in6_src.c In in6_selectsrc, use the caller's fib instead of the default fib. In in6_selectsrc_socket, remove a superfluous check. sys/netinet6/nd6.c In nd6_lle_event, use the interface's fib for routing socket messages. In nd6_is_new_addr_neighbor, check all FIBs when trying to determine whether an address is a neighbor. Also, simplify the code for point to point interfaces. sys/netinet6/nd6.h sys/netinet6/nd6.c sys/netinet6/nd6_rtr.c Make defrouter_select fib-aware, and make all of its callers pass in the interface fib. sys/netinet6/nd6_nbr.c When inputting a Neighbor Solicitation packet, consider the interface fib instead of the default fib for DAD. Output NS and Neighbor Advertisement packets on the correct fib. sys/netinet6/nd6_rtr.c Allow installing the same host route on different interfaces in different FIBs. If rt_add_addr_allfibs=0, only install or delete the prefix route on the interface fib. tests/sys/netinet/fibs_test.sh Clear some expected failures, but add a skip for the newly revealed BUG217871. PR: 196361 Submitted by: Erick Turnquist <jhujhiti@adjectivism.org> Reported by: Jason Healy <jhealy@logn.net> Reviewed by: asomers MFC after: 3 weeks Sponsored by: Spectra Logic Corp Differential Revision: https://reviews.freebsd.org/D9451
2017-03-17 16:50:37 +00:00
fibnum = inp->inp_inc.inc_fibnum;
retifp = NULL;
error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp);
if (error != 0)
return (error);
if (hlim != NULL)
*hlim = in6_selecthlim(inp, retifp);
if (retifp == NULL || scope_ambiguous == 0)
return (0);
/*
* Application should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as it should, so we need a
* workaround. Even if an appropriate ID is not determined
* (when it's required), if we can determine the outgoing
* interface. determine the zone ID based on the interface.
*/
error = in6_setscope(&dstsock->sin6_addr, retifp, NULL);
return (error);
}
/*
* Select source address based on @fibnum, @dst and @scopeid.
* Stores selected address to @srcp.
* Returns 0 on success.
*
* Used by non-socket based consumers (ND code mostly)
*/
int
in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst,
uint32_t scopeid, struct ifnet *ifp, struct in6_addr *srcp,
int *hlim)
{
struct ifnet *retifp;
struct sockaddr_in6 dst_sa;
int error;
retifp = ifp;
bzero(&dst_sa, sizeof(dst_sa));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(dst_sa);
dst_sa.sin6_addr = *dst;
dst_sa.sin6_scope_id = scopeid;
sa6_embedscope(&dst_sa, 0);
error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp);
if (hlim != NULL)
*hlim = in6_selecthlim(NULL, retifp);
return (error);
}
/*
* clone - meaningful only for bsdi and freebsd
*/
static int
selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct nhop_object **retnh, int norouteok,
u_int fibnum, uint32_t flowid)
{
int error = 0;
struct ifnet *ifp = NULL;
struct nhop_object *nh = NULL;
struct sockaddr_in6 *sin6_next;
struct in6_pktinfo *pi = NULL;
struct in6_addr *dst = &dstsock->sin6_addr;
uint32_t zoneid;
#if 0
char ip6buf[INET6_ADDRSTRLEN];
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
printf("%s: strange destination %s\n", __func__,
ip6_sprintf(ip6buf, &dstsock->sin6_addr));
} else {
printf("%s: destination = %s%%%d\n", __func__,
ip6_sprintf(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/* If the caller specify the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
ifp = ifnet_byindex(pi->ipi6_ifindex);
if (ifp != NULL &&
(norouteok || retnh == NULL ||
IN6_IS_ADDR_MULTICAST(dst))) {
/*
* we do not have to check or get the route for
* multicast.
*/
goto done;
} else
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) &&
mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
goto done; /* we do not need a route for multicast. */
}
/*
* If destination address is LLA or link- or node-local multicast,
* use it's embedded scope zone id to determine outgoing interface.
*/
if (IN6_IS_ADDR_MC_LINKLOCAL(dst) ||
IN6_IS_ADDR_MC_NODELOCAL(dst)) {
zoneid = ntohs(in6_getscope(dst));
if (zoneid > 0) {
ifp = in6_getlinkifnet(zoneid);
goto done;
}
}
getroute:
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route_in6 *ron;
sin6_next = satosin6(opts->ip6po_nexthop);
if (IN6_IS_ADDR_LINKLOCAL(&sin6_next->sin6_addr)) {
/*
* Next hop is LLA, thus it should be neighbor.
* Determine outgoing interface by zone index.
*/
zoneid = ntohs(in6_getscope(&sin6_next->sin6_addr));
if (zoneid > 0) {
ifp = in6_getlinkifnet(zoneid);
goto done;
2015-08-02 11:58:24 +00:00
}
}
ron = &opts->ip6po_nextroute;
/* Use a cached route if it exists and is valid. */
if (ron->ro_nh != NULL && (
!NH_IS_VALID(ron->ro_nh) ||
ron->ro_dst.sin6_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr,
&sin6_next->sin6_addr)))
RO_NHFREE(ron);
if (ron->ro_nh == NULL) {
ron->ro_dst = *sin6_next;
/*
* sin6_next is not link-local OR scopeid is 0,
* no need to clear scope
*/
ron->ro_nh = fib6_lookup(fibnum,
&sin6_next->sin6_addr, 0, NHR_REF, flowid);
}
/*
* The node identified by that address must be a
* neighbor of the sending host.
*/
if (ron->ro_nh == NULL ||
(ron->ro_nh->nh_flags & NHF_GATEWAY) != 0)
error = EHOSTUNREACH;
else {
nh = ron->ro_nh;
ifp = nh->nh_ifp;
}
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case of sharing the cache with IPv4.
*/
if (ro) {
if (ro->ro_nh &&
(!NH_IS_VALID(ro->ro_nh) ||
((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
2004-02-04 12:55:45 +00:00
dst))) {
RO_NHFREE(ro);
}
if (ro->ro_nh == (struct nhop_object *)NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
/*
* Currently dst has scopeid embedded iff it is LL.
* New routing API accepts scopeid as a separate argument.
* Convert dst before/after doing lookup
*/
uint32_t scopeid = 0;
if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr)) {
/* Unwrap in6_getscope() and in6_clearscope() */
scopeid = ntohs(sa6->sin6_addr.s6_addr16[1]);
sa6->sin6_addr.s6_addr16[1] = 0;
}
ro->ro_nh = fib6_lookup(fibnum,
&sa6->sin6_addr, scopeid, NHR_REF, flowid);
if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr))
sa6->sin6_addr.s6_addr16[1] = htons(scopeid);
}
/*
* do not care about the result if we have the nexthop
* explicitly specified.
*/
if (opts && opts->ip6po_nexthop)
goto done;
if (ro->ro_nh)
ifp = ro->ro_nh->nh_ifp;
else
error = EHOSTUNREACH;
nh = ro->ro_nh;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (ifp && opts && opts->ip6po_pktinfo &&
2004-02-04 12:55:45 +00:00
opts->ip6po_pktinfo->ipi6_ifindex) {
if (!(ifp->if_flags & IFF_LOOPBACK) &&
ifp->if_index !=
opts->ip6po_pktinfo->ipi6_ifindex) {
error = EHOSTUNREACH;
goto done;
}
}
}
done:
if (ifp == NULL && nh == NULL) {
/*
* This can happen if the caller did not pass a cached route
* nor any other hints. We treat this case an error.
*/
error = EHOSTUNREACH;
}
if (error == EHOSTUNREACH)
IP6STAT_INC(ip6s_noroute);
if (retifp != NULL) {
if (nh != NULL)
*retifp = nh->nh_aifp;
else
*retifp = ifp;
}
if (retnh != NULL)
*retnh = nh; /* nh may be NULL */
return (error);
}
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct ifnet **retifp,
struct ifnet *oifp, u_int fibnum)
{
int error;
struct route_in6 sro;
struct nhop_object *nh = NULL;
uint16_t nh_flags;
KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__));
bzero(&sro, sizeof(sro));
nh_flags = 0;
error = selectroute(dstsock, opts, mopts, &sro, retifp, &nh, 1, fibnum, 0);
if (nh != NULL)
nh_flags = nh->nh_flags;
if (nh != NULL && nh == sro.ro_nh)
NH_FREE(nh);
if (error != 0) {
/* Help ND. See oifp comment in in6_selectsrc(). */
if (oifp != NULL && fibnum == RT_DEFAULT_FIB) {
*retifp = oifp;
error = 0;
}
return (error);
}
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it should still be confusing.
* We thus reject the case here.
*/
if (nh_flags & (NHF_REJECT | NHF_BLACKHOLE)) {
error = (nh_flags & NHF_HOST ? EHOSTUNREACH : ENETUNREACH);
return (error);
}
return (0);
}
/* Public wrapper function to selectroute(). */
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct nhop_object **retnh, u_int fibnum, uint32_t flowid)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
retnh, 0, fibnum, flowid));
}
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6_selecthlim(struct inpcb *inp, struct ifnet *ifp)
{
if (inp && inp->in6p_hops >= 0)
return (inp->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else if (inp && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
struct nhop_object *nh;
struct in6_addr dst;
uint32_t fibnum, scopeid;
int hlim;
fibnum = inp->inp_inc.inc_fibnum;
in6_splitscope(&inp->in6p_faddr, &dst, &scopeid);
nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0);
if (nh != NULL) {
hlim = ND_IFINFO(nh->nh_ifp)->chlim;
return (hlim);
}
}
return (V_ip6_defhlim);
}
/*
* XXX: this is borrowed from in6_pcbbind(). If possible, we should
* share this function by all *bsd*...
*/
int
2017-02-10 05:58:16 +00:00
in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
{
struct socket *so = inp->inp_socket;
u_int16_t lport = 0;
int error, lookupflags = 0;
#ifdef INVARIANTS
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#endif
INP_WLOCK_ASSERT(inp);
Decompose the current single inpcbinfo lock into two locks: - The existing ipi_lock continues to protect the global inpcb list and inpcb counter. This lock is now relegated to a small number of allocation and free operations, and occasional operations that walk all connections (including, awkwardly, certain UDP multicast receive operations -- something to revisit). - A new ipi_hash_lock protects the two inpcbinfo hash tables for looking up connections and bound sockets, manipulated using new INP_HASH_*() macros. This lock, combined with inpcb locks, protects the 4-tuple address space. Unlike the current ipi_lock, ipi_hash_lock follows the individual inpcb connection locks, so may be acquired while manipulating a connection on which a lock is already held, avoiding the need to acquire the inpcbinfo lock preemptively when a binding change might later be required. As a result, however, lookup operations necessarily go through a reference acquire while holding the lookup lock, later acquiring an inpcb lock -- if required. A new function in_pcblookup() looks up connections, and accepts flags indicating how to return the inpcb. Due to lock order changes, callers no longer need acquire locks before performing a lookup: the lookup routine will acquire the ipi_hash_lock as needed. In the future, it will also be able to use alternative lookup and locking strategies transparently to callers, such as pcbgroup lookup. New lookup flags are, supplementing the existing INPLOOKUP_WILDCARD flag: INPLOOKUP_RLOCKPCB - Acquire a read lock on the returned inpcb INPLOOKUP_WLOCKPCB - Acquire a write lock on the returned inpcb Callers must pass exactly one of these flags (for the time being). Some notes: - All protocols are updated to work within the new regime; especially, TCP, UDPv4, and UDPv6. pcbinfo ipi_lock acquisitions are largely eliminated, and global hash lock hold times are dramatically reduced compared to previous locking. - The TCP syncache still relies on the pcbinfo lock, something that we may want to revisit. - Support for reverting to the FreeBSD 7.x locking strategy in TCP input is no longer available -- hash lookup locks are now held only very briefly during inpcb lookup, rather than for potentially extended periods. However, the pcbinfo ipi_lock will still be acquired if a connection state might change such that a connection is added or removed. - Raw IP sockets continue to use the pcbinfo ipi_lock for protection, due to maintaining their own hash tables. - The interface in6_pcblookup_hash_locked() is maintained, which allows callers to acquire hash locks and perform one or more lookups atomically with 4-tuple allocation: this is required only for TCPv6, as there is no in6_pcbconnect_setup(), which there should be. - UDPv6 locking remains significantly more conservative than UDPv4 locking, which relates to source address selection. This needs attention, as it likely significantly reduces parallelism in this code for multithreaded socket use (such as in BIND). - In the UDPv4 and UDPv6 multicast cases, we need to revisit locking somewhat, as they relied on ipi_lock to stablise 4-tuple matches, which is no longer sufficient. A second check once the inpcb lock is held should do the trick, keeping the general case from requiring the inpcb lock for every inpcb visited. - This work reminds us that we need to revisit locking of the v4/v6 flags, which may be accessed lock-free both before and after this change. - Right now, a single lock name is used for the pcbhash lock -- this is undesirable, and probably another argument is required to take care of this (or a char array name field in the pcbinfo?). This is not an MFC candidate for 8.x due to its impact on lookup and locking semantics. It's possible some of these issues could be worked around with compatibility wrappers, if necessary. Reviewed by: bz Sponsored by: Juniper Networks, Inc.
2011-05-30 09:43:55 +00:00
INP_HASH_WLOCK_ASSERT(pcbinfo);
error = prison_local_ip6(cred, laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0));
if (error)
return(error);
MFp4: Bring in updated jail support from bz_jail branch. This enhances the current jail implementation to permit multiple addresses per jail. In addtion to IPv4, IPv6 is supported as well. Due to updated checks it is even possible to have jails without an IP address at all, which basically gives one a chroot with restricted process view, no networking,.. SCTP support was updated and supports IPv6 in jails as well. Cpuset support permits jails to be bound to specific processor sets after creation. Jails can have an unrestricted (no duplicate protection, etc.) name in addition to the hostname. The jail name cannot be changed from within a jail and is considered to be used for management purposes or as audit-token in the future. DDB 'show jails' command was added to aid debugging. Proper compat support permits 32bit jail binaries to be used on 64bit systems to manage jails. Also backward compatibility was preserved where possible: for jail v1 syscalls, as well as with user space management utilities. Both jail as well as prison version were updated for the new features. A gap was intentionally left as the intermediate versions had been used by various patches floating around the last years. Bump __FreeBSD_version for the afore mentioned and in kernel changes. Special thanks to: - Pawel Jakub Dawidek (pjd) for his multi-IPv4 patches and Olivier Houchard (cognet) for initial single-IPv6 patches. - Jeff Roberson (jeff) and Randall Stewart (rrs) for their help, ideas and review on cpuset and SCTP support. - Robert Watson (rwatson) for lots and lots of help, discussions, suggestions and review of most of the patch at various stages. - John Baldwin (jhb) for his help. - Simon L. Nielsen (simon) as early adopter testing changes on cluster machines as well as all the testers and people who provided feedback the last months on freebsd-jail and other channels. - My employer, CK Software GmbH, for the support so I could work on this. Reviewed by: (see above) MFC after: 3 months (this is just so that I get the mail) X-MFC Before: 7.2-RELEASE if possible
2008-11-29 14:32:14 +00:00
/* XXX: this is redundant when called from in6_pcbbind */
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
inp->inp_flags |= INP_ANONPORT;
2017-02-10 05:58:16 +00:00
error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags);
if (error != 0)
return (error);
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->in6p_laddr = in6addr_any;
inp->inp_lport = 0;
return (EAGAIN);
}
return (0);
}
void
addrsel_policy_init(void)
{
init_policy_queue();
/* initialize the "last resort" policy */
bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy));
V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
if (!IS_DEFAULT_VNET(curvnet))
return;
ADDRSEL_LOCK_INIT();
ADDRSEL_SXLOCK_INIT();
}
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
struct in6_addrpolicy *match = NULL;
ADDRSEL_LOCK();
match = match_addrsel_policy(key);
if (match == NULL)
match = &V_defaultaddrpolicy;
else
match->use++;
ADDRSEL_UNLOCK();
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct walkarg {
struct sysctl_req *w_req;
};
static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_ip6);
static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
CTLFLAG_RD | CTLFLAG_MPSAFE, in6_src_sysctl,
"");
static int
in6_src_sysctl(SYSCTL_HANDLER_ARGS)
{
struct walkarg w;
if (req->newptr)
return EPERM;
bzero(&w, sizeof(w));
w.w_req = req;
return (walk_addrsel_policy(dump_addrsel_policyent, &w));
}
int
in6_src_ioctl(u_long cmd, caddr_t data)
{
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check if the prefix mask is consecutive. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear trailing garbages (if any) of the prefix address. */
IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr);
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: compromise compilers */
}
/*
* The followings are implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX implementation using binary tree should be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
VNET_DEFINE_STATIC(struct addrsel_policyhead, addrsel_policytab);
#define V_addrsel_policytab VNET(addrsel_policytab)
static void
init_policy_queue(void)
{
TAILQ_INIT(&V_addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
struct addrsel_policyent *new, *pol;
new = malloc(sizeof(*new), M_IFADDR,
M_WAITOK);
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* duplication check */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
free(new, M_IFADDR);
return (EEXIST); /* or override it? */
}
}
bzero(new, sizeof(*new));
/* XXX: should validate entry */
new->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
struct addrsel_policyent *pol;
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* search for the entry in the table */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (ESRCH);
}
TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
free(pol, M_IFADDR);
return (0);
}
static int
2012-11-13 01:48:00 +00:00
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
{
struct addrsel_policyent *pol;
int error = 0;
ADDRSEL_SLOCK();
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0) {
ADDRSEL_SUNLOCK();
return (error);
}
}
ADDRSEL_SUNLOCK();
return (error);
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct walkarg *w = arg;
error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol));
return (error);
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* not match */
if (m == 0xff) /* short cut for a typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}