2005-01-07 01:45:51 +00:00
|
|
|
/*-
|
2017-11-20 19:43:44 +00:00
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1988, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2017-02-28 23:42:47 +00:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1994-05-24 10:09:53 +00:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)if_ether.c 8.1 (Berkeley) 6/10/93
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ethernet address resolution protocol.
|
|
|
|
* TODO:
|
|
|
|
* add "inuse/lock" bit (or ref. count) along with valid bit
|
|
|
|
*/
|
|
|
|
|
2007-10-07 20:44:24 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
1998-01-08 23:42:31 +00:00
|
|
|
#include "opt_inet.h"
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
Extract eventfilter declarations to sys/_eventfilter.h
This allows replacing "sys/eventfilter.h" includes with "sys/_eventfilter.h"
in other header files (e.g., sys/{bus,conf,cpu}.h) and reduces header
pollution substantially.
EVENTHANDLER_DECLARE and EVENTHANDLER_LIST_DECLAREs were moved out of .c
files into appropriate headers (e.g., sys/proc.h, powernv/opal.h).
As a side effect of reduced header pollution, many .c files and headers no
longer contain needed definitions. The remainder of the patch addresses
adding appropriate includes to fix those files.
LOCK_DEBUG and LOCK_FILE_LINE_ARG are moved to sys/_lock.h, as required by
sys/mutex.h since r326106 (but silently protected by header pollution prior
to this change).
No functional change (intended). Of course, any out of tree modules that
relied on header pollution for sys/eventhandler.h, sys/lock.h, or
sys/mutex.h inclusion need to be fixed. __FreeBSD_version has been bumped.
2019-05-20 00:38:23 +00:00
|
|
|
#include <sys/eventhandler.h>
|
1995-12-09 16:06:54 +00:00
|
|
|
#include <sys/kernel.h>
|
2015-07-29 08:12:05 +00:00
|
|
|
#include <sys/lock.h>
|
1999-02-16 10:49:55 +00:00
|
|
|
#include <sys/queue.h>
|
1995-12-09 16:06:54 +00:00
|
|
|
#include <sys/sysctl.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/mbuf.h>
|
1995-12-09 16:06:54 +00:00
|
|
|
#include <sys/malloc.h>
|
2008-09-14 08:24:45 +00:00
|
|
|
#include <sys/proc.h>
|
2015-07-29 08:12:05 +00:00
|
|
|
#include <sys/rmlock.h>
|
1996-10-12 19:49:43 +00:00
|
|
|
#include <sys/socket.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/syslog.h>
|
|
|
|
|
|
|
|
#include <net/if.h>
|
2013-10-26 17:58:36 +00:00
|
|
|
#include <net/if_var.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <net/if_dl.h>
|
1999-02-20 11:18:00 +00:00
|
|
|
#include <net/if_types.h>
|
1995-05-11 00:13:26 +00:00
|
|
|
#include <net/netisr.h>
|
2001-02-03 17:25:21 +00:00
|
|
|
#include <net/ethernet.h>
|
2009-06-23 17:03:45 +00:00
|
|
|
#include <net/route.h>
|
2020-07-02 21:04:08 +00:00
|
|
|
#include <net/route/nhop.h>
|
2009-08-01 19:26:27 +00:00
|
|
|
#include <net/vnet.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <netinet/in.h>
|
2015-12-09 11:14:27 +00:00
|
|
|
#include <netinet/in_fib.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <netinet/in_var.h>
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
#include <net/if_llatbl.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <netinet/if_ether.h>
|
2012-01-22 02:13:19 +00:00
|
|
|
#ifdef INET
|
2010-08-11 20:18:19 +00:00
|
|
|
#include <netinet/ip_carp.h>
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
2013-04-26 12:50:32 +00:00
|
|
|
#define SIN(s) ((const struct sockaddr_in *)(s))
|
2015-09-15 08:50:44 +00:00
|
|
|
|
|
|
|
static struct timeval arp_lastlog;
|
|
|
|
static int arp_curpps;
|
|
|
|
static int arp_maxpps = 1;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2015-12-05 09:50:37 +00:00
|
|
|
/* Simple ARP state machine */
|
|
|
|
enum arp_llinfo_state {
|
|
|
|
ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
|
|
|
|
ARP_LLINFO_REACHABLE, /* LLE is valid */
|
|
|
|
ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */
|
|
|
|
ARP_LLINFO_DELETED, /* LLE is deleted */
|
|
|
|
};
|
|
|
|
|
1999-02-16 10:49:55 +00:00
|
|
|
SYSCTL_DECL(_net_link_ether);
|
2020-02-26 14:26:36 +00:00
|
|
|
static SYSCTL_NODE(_net_link_ether, PF_INET, inet,
|
|
|
|
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
|
|
"");
|
|
|
|
static SYSCTL_NODE(_net_link_ether, PF_ARP, arp,
|
|
|
|
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
|
|
"");
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/* timer values */
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(int, arpt_keep) = (20*60); /* once resolved, good for 20
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
* minutes */
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(int, arp_maxtries) = 5;
|
|
|
|
VNET_DEFINE_STATIC(int, arp_proxyall) = 0;
|
|
|
|
VNET_DEFINE_STATIC(int, arpt_down) = 20; /* keep incomplete entries for
|
2012-07-31 11:31:12 +00:00
|
|
|
* 20 seconds */
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/
|
2013-07-09 09:50:15 +00:00
|
|
|
VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
|
|
|
|
VNET_PCPUSTAT_SYSINIT(arpstat);
|
|
|
|
|
|
|
|
#ifdef VIMAGE
|
|
|
|
VNET_PCPUSTAT_SYSUNINIT(arpstat);
|
|
|
|
#endif /* VIMAGE */
|
2010-11-12 22:03:02 +00:00
|
|
|
|
Bump amount of queued packets for unresolved ARP/NDP entries to 16.
Currently default behaviour is to keep only 1 packet per unresolved entry.
Ability to queue more than one packet was added 10 years ago, in r215207,
though the default value was kept intact.
Things have changed since that time. Systems tend to initiate multiple
connections at once for a variety of reasons.
For example, the recent kern/252278 bug report describes happy-eyeball DNS
behaviour sending multiple requests to the DNS server.
The primary driver for upper value for the queue length determination is
memory consumption. Remote actors should not be able to easily exhaust
local memory by sending packets to unresolved arp/ND entries.
For now, bump value to 16 packets, to match Darwin implementation.
The proper approach would be to switch the limit to calculate memory
consumption instead of packet count and limit based on memory.
We should MFC this with a variation of D22447.
Reviewers: #manpages, #network, bz, emaste
Reviewed By: emaste, gbe(doc), jilles(doc)
MFC after: 1 month
Differential Revision: https://reviews.freebsd.org/D28068
2021-01-11 19:50:21 +00:00
|
|
|
VNET_DEFINE_STATIC(int, arp_maxhold) = 16;
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
|
2009-07-16 21:13:04 +00:00
|
|
|
#define V_arpt_keep VNET(arpt_keep)
|
2009-10-15 06:12:04 +00:00
|
|
|
#define V_arpt_down VNET(arpt_down)
|
2015-12-05 09:50:37 +00:00
|
|
|
#define V_arpt_rexmit VNET(arpt_rexmit)
|
2009-07-16 21:13:04 +00:00
|
|
|
#define V_arp_maxtries VNET(arp_maxtries)
|
|
|
|
#define V_arp_proxyall VNET(arp_proxyall)
|
2010-11-12 22:03:02 +00:00
|
|
|
#define V_arp_maxhold VNET(arp_maxhold)
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
|
2014-11-07 09:39:05 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
&VNET_NAME(arpt_keep), 0,
|
|
|
|
"ARP entry lifetime in seconds");
|
2014-11-07 09:39:05 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
&VNET_NAME(arp_maxtries), 0,
|
Step 1.5 of importing the network stack virtualization infrastructure
from the vimage project, as per plan established at devsummit 08/08:
http://wiki.freebsd.org/Image/Notes200808DevSummit
Introduce INIT_VNET_*() initializer macros, VNET_FOREACH() iterator
macros, and CURVNET_SET() context setting macros, all currently
resolving to NOPs.
Prepare for virtualization of selected SYSCTL objects by introducing a
family of SYSCTL_V_*() macros, currently resolving to their global
counterparts, i.e. SYSCTL_V_INT() == SYSCTL_INT().
Move selected #defines from sys/sys/vimage.h to newly introduced header
files specific to virtualized subsystems (sys/net/vnet.h,
sys/netinet/vinet.h etc.).
All the changes are verified to have zero functional impact at this
point in time by doing MD5 comparison between pre- and post-change
object files(*).
(*) netipsec/keysock.c did not validate depending on compile time options.
Implemented by: julian, bz, brooks, zec
Reviewed by: julian, bz, brooks, kris, rwatson, ...
Approved by: julian (mentor)
Obtained from: //depot/projects/vimage-commit2/...
X-MFC after: never
Sponsored by: NLnet Foundation, The FreeBSD Foundation
2008-10-02 15:37:58 +00:00
|
|
|
"ARP resolution attempts before returning error");
|
2014-11-07 09:39:05 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
&VNET_NAME(arp_proxyall), 0,
|
Step 1.5 of importing the network stack virtualization infrastructure
from the vimage project, as per plan established at devsummit 08/08:
http://wiki.freebsd.org/Image/Notes200808DevSummit
Introduce INIT_VNET_*() initializer macros, VNET_FOREACH() iterator
macros, and CURVNET_SET() context setting macros, all currently
resolving to NOPs.
Prepare for virtualization of selected SYSCTL objects by introducing a
family of SYSCTL_V_*() macros, currently resolving to their global
counterparts, i.e. SYSCTL_V_INT() == SYSCTL_INT().
Move selected #defines from sys/sys/vimage.h to newly introduced header
files specific to virtualized subsystems (sys/net/vnet.h,
sys/netinet/vinet.h etc.).
All the changes are verified to have zero functional impact at this
point in time by doing MD5 comparison between pre- and post-change
object files(*).
(*) netipsec/keysock.c did not validate depending on compile time options.
Implemented by: julian, bz, brooks, zec
Reviewed by: julian, bz, brooks, kris, rwatson, ...
Approved by: julian (mentor)
Obtained from: //depot/projects/vimage-commit2/...
X-MFC after: never
Sponsored by: NLnet Foundation, The FreeBSD Foundation
2008-10-02 15:37:58 +00:00
|
|
|
"Enable proxy ARP for all suitable requests");
|
2014-11-07 09:39:05 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
|
2010-11-12 22:03:02 +00:00
|
|
|
&VNET_NAME(arpt_down), 0,
|
|
|
|
"Incomplete ARP entry lifetime in seconds");
|
2013-07-09 09:50:15 +00:00
|
|
|
SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
|
|
|
|
arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
|
2014-11-07 09:39:05 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
|
2012-07-31 11:31:12 +00:00
|
|
|
&VNET_NAME(arp_maxhold), 0,
|
2010-11-12 22:03:02 +00:00
|
|
|
"Number of packets to hold per ARP entry");
|
2015-09-15 08:50:44 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
|
|
|
|
CTLFLAG_RW, &arp_maxpps, 0,
|
|
|
|
"Maximum number of remotely triggered ARP messages that can be "
|
|
|
|
"logged per second");
|
|
|
|
|
Add GARP retransmit capability
A single gratuitous ARP (GARP) is always transmitted when an IPv4
address is added to an interface, and that is usually sufficient.
However, in some circumstances, such as when a shared address is
passed between cluster nodes, this single GARP may occasionally be
dropped or lost. This can lead to neighbors on the network link
working with a stale ARP cache and sending packets destined for
that address to the node that previously owned the address, which
may not respond.
To avoid this situation, GARP retransmissions can be enabled by setting
the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
than zero. The setting represents the maximum number of retransmissions.
The interval between retransmissions is calculated using an exponential
backoff algorithm, doubling each time, so the retransmission intervals
are: {1, 2, 4, 8, 16, ...} (seconds).
Due to the exponential backoff algorithm used for the interval
between GARP retransmissions, the maximum number of retransmissions
is limited to 16 for sanity. This limit corresponds to a maximum
interval between retransmissions of 2^16 seconds ~= 18 hours.
Increasing this limit is possible, but sending out GARPs spaced
days apart would be of little use.
Submitted by: David A. Bright <david.a.bright@dell.com>
MFC after: 1 month
Relnotes: yes
Sponsored by: Dell EMC
Differential Revision: https://reviews.freebsd.org/D7695
2016-10-02 01:42:45 +00:00
|
|
|
/*
|
|
|
|
* Due to the exponential backoff algorithm used for the interval between GARP
|
|
|
|
* retransmissions, the maximum number of retransmissions is limited for
|
|
|
|
* sanity. This limit corresponds to a maximum interval between retransmissions
|
|
|
|
* of 2^16 seconds ~= 18 hours.
|
|
|
|
*
|
|
|
|
* Making this limit more dynamic is more complicated than worthwhile,
|
|
|
|
* especially since sending out GARPs spaced days apart would be of little
|
|
|
|
* use. A maximum dynamic limit would look something like:
|
|
|
|
*
|
|
|
|
* const int max = fls(INT_MAX / hz) - 1;
|
|
|
|
*/
|
|
|
|
#define MAX_GARP_RETRANSMITS 16
|
|
|
|
static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS);
|
|
|
|
static int garp_rexmit_count = 0; /* GARP retransmission setting. */
|
|
|
|
|
|
|
|
SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count,
|
|
|
|
CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
|
|
|
|
&garp_rexmit_count, 0, sysctl_garp_rexmit, "I",
|
|
|
|
"Number of times to retransmit GARP packets;"
|
|
|
|
" 0 to disable, maximum of 16");
|
|
|
|
|
2019-03-09 01:12:59 +00:00
|
|
|
VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO; /* Min. log(9) level. */
|
|
|
|
#define V_arp_log_level VNET(arp_log_level)
|
|
|
|
SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW,
|
|
|
|
&VNET_NAME(arp_log_level), 0,
|
|
|
|
"Minimum log(9) level for recording rate limited arp log messages. "
|
|
|
|
"The higher will be log more (emerg=0, info=6 (default), debug=7).");
|
2015-09-15 08:50:44 +00:00
|
|
|
#define ARP_LOG(pri, ...) do { \
|
2019-03-09 01:12:59 +00:00
|
|
|
if ((pri) <= V_arp_log_level && \
|
|
|
|
ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \
|
2015-09-15 08:50:44 +00:00
|
|
|
log((pri), "arp: " __VA_ARGS__); \
|
|
|
|
} while (0)
|
|
|
|
|
2003-03-04 23:19:55 +00:00
|
|
|
static void arpintr(struct mbuf *);
|
2002-03-19 21:25:46 +00:00
|
|
|
static void arptimer(void *);
|
1998-01-08 23:42:31 +00:00
|
|
|
#ifdef INET
|
2002-03-19 21:25:46 +00:00
|
|
|
static void in_arpinput(struct mbuf *);
|
1998-01-08 23:42:31 +00:00
|
|
|
#endif
|
1994-10-01 21:50:33 +00:00
|
|
|
|
2015-08-13 13:38:09 +00:00
|
|
|
static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
|
|
|
|
struct ifnet *ifp, int bridged, struct llentry *la);
|
|
|
|
static void arp_mark_lle_reachable(struct llentry *la);
|
2015-11-09 10:11:14 +00:00
|
|
|
static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);
|
2015-08-13 13:38:09 +00:00
|
|
|
|
2015-11-09 10:11:14 +00:00
|
|
|
static eventhandler_tag iflladdr_tag;
|
2015-08-13 13:38:09 +00:00
|
|
|
|
Reimplement the netisr framework in order to support parallel netisr
threads:
- Support up to one netisr thread per CPU, each processing its own
workstream, or set of per-protocol queues. Threads may be bound
to specific CPUs, or allowed to migrate, based on a global policy.
In the future it would be desirable to support topology-centric
policies, such as "one netisr per package".
- Allow each protocol to advertise an ordering policy, which can
currently be one of:
NETISR_POLICY_SOURCE: packets must maintain ordering with respect to
an implicit or explicit source (such as an interface or socket).
NETISR_POLICY_FLOW: make use of mbuf flow identifiers to place work,
as well as allowing protocols to provide a flow generation function
for mbufs without flow identifiers (m2flow). Falls back on
NETISR_POLICY_SOURCE if no flow ID is available.
NETISR_POLICY_CPU: allow protocols to inspect and assign a CPU for
each packet handled by netisr (m2cpuid).
- Provide utility functions for querying the number of workstreams
being used, as well as a mapping function from workstream to CPU ID,
which protocols may use in work placement decisions.
- Add explicit interfaces to get and set per-protocol queue limits, and
get and clear drop counters, which query data or apply changes across
all workstreams.
- Add a more extensible netisr registration interface, in which
protocols declare 'struct netisr_handler' structures for each
registered NETISR_ type. These include name, handler function,
optional mbuf to flow ID function, optional mbuf to CPU ID function,
queue limit, and ordering policy. Padding is present to allow these
to be expanded in the future. If no queue limit is declared, then
a default is used.
- Queue limits are now per-workstream, and raised from the previous
IFQ_MAXLEN default of 50 to 256.
- All protocols are updated to use the new registration interface, and
with the exception of netnatm, default queue limits. Most protocols
register as NETISR_POLICY_SOURCE, except IPv4 and IPv6, which use
NETISR_POLICY_FLOW, and will therefore take advantage of driver-
generated flow IDs if present.
- Formalize a non-packet based interface between interface polling and
the netisr, rather than having polling pretend to be two protocols.
Provide two explicit hooks in the netisr worker for start and end
events for runs: netisr_poll() and netisr_pollmore(), as well as a
function, netisr_sched_poll(), to allow the polling code to schedule
netisr execution. DEVICE_POLLING still embeds single-netisr
assumptions in its implementation, so for now if it is compiled into
the kernel, a single and un-bound netisr thread is enforced
regardless of tunable configuration.
In the default configuration, the new netisr implementation maintains
the same basic assumptions as the previous implementation: a single,
un-bound worker thread processes all deferred work, and direct dispatch
is enabled by default wherever possible.
Performance measurement shows a marginal performance improvement over
the old implementation due to the use of batched dequeue.
An rmlock is used to synchronize use and registration/unregistration
using the framework; currently, synchronized use is disabled
(replicating current netisr policy) due to a measurable 3%-6% hit in
ping-pong micro-benchmarking. It will be enabled once further rmlock
optimization has taken place. However, in practice, netisrs are
rarely registered or unregistered at runtime.
A new man page for netisr will follow, but since one doesn't currently
exist, it hasn't been updated.
This change is not appropriate for MFC, although the polling shutdown
handler should be merged to 7-STABLE.
Bump __FreeBSD_version.
Reviewed by: bz
2009-06-01 10:41:38 +00:00
|
|
|
/*
 * netisr registration record for ARP.  It registers under the name "arp",
 * delivers NETISR_ARP work to arpintr(), and uses NETISR_POLICY_SOURCE as
 * its ordering policy.  No queue limit is set here.
 */
static const struct netisr_handler arp_nh = {
|
|
|
|
.nh_name = "arp",
|
|
|
|
.nh_handler = arpintr,
|
|
|
|
.nh_proto = NETISR_ARP,
|
|
|
|
.nh_policy = NETISR_POLICY_SOURCE,
|
|
|
|
};
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
* Timeout routine. Age arp_tab entries periodically.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-12-22 22:00:30 +00:00
|
|
|
static void
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
arptimer(void *arg)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2012-08-02 13:57:49 +00:00
|
|
|
struct llentry *lle = (struct llentry *)arg;
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
struct ifnet *ifp;
|
2015-12-05 09:50:37 +00:00
|
|
|
int r_skip_req;
|
2012-08-02 13:57:49 +00:00
|
|
|
|
|
|
|
if (lle->la_flags & LLE_STATIC) {
|
|
|
|
return;
|
|
|
|
}
|
2015-02-09 19:28:11 +00:00
|
|
|
LLE_WLOCK(lle);
|
2015-08-11 12:38:54 +00:00
|
|
|
if (callout_pending(&lle->lle_timer)) {
|
2015-02-09 19:28:11 +00:00
|
|
|
/*
|
2020-02-12 13:31:36 +00:00
|
|
|
* We are a bit odd here in the treatment of
|
2015-02-09 19:28:11 +00:00
|
|
|
* active/pending. If the pending bit is set, it got
|
|
|
|
* rescheduled before I ran. The active
|
|
|
|
* bit we ignore, since if it was stopped
|
|
|
|
* in ll_tablefree() and was currently running
|
|
|
|
* it would have returned 0 so the code would
|
|
|
|
* not have deleted it since the callout could
|
|
|
|
* not be stopped so we want to go through
|
|
|
|
* with the delete here now. If the callout
|
|
|
|
* was restarted, the pending bit will be back on and
|
|
|
|
* we just want to bail since the callout_reset would
|
|
|
|
* return 1 and our reference would have been removed
|
|
|
|
* by arpresolve() below.
|
|
|
|
*/
|
|
|
|
LLE_WUNLOCK(lle);
|
|
|
|
return;
|
|
|
|
}
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
ifp = lle->lle_tbl->llt_ifp;
|
2009-09-03 21:10:57 +00:00
|
|
|
CURVNET_SET(ifp->if_vnet);
|
2012-08-02 13:57:49 +00:00
|
|
|
|
2015-12-05 09:50:37 +00:00
|
|
|
switch (lle->ln_state) {
|
|
|
|
case ARP_LLINFO_REACHABLE:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Expiration time is approaching.
|
|
|
|
* Let's try to refresh entry if it is still
|
|
|
|
* in use.
|
|
|
|
*
|
|
|
|
* Set r_skip_req to get feedback from
|
|
|
|
* fast path. Change state and re-schedule
|
|
|
|
* ourselves.
|
|
|
|
*/
|
|
|
|
LLE_REQ_LOCK(lle);
|
|
|
|
lle->r_skip_req = 1;
|
|
|
|
LLE_REQ_UNLOCK(lle);
|
|
|
|
lle->ln_state = ARP_LLINFO_VERIFY;
|
|
|
|
callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
|
|
|
|
LLE_WUNLOCK(lle);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return;
|
|
|
|
case ARP_LLINFO_VERIFY:
|
|
|
|
LLE_REQ_LOCK(lle);
|
|
|
|
r_skip_req = lle->r_skip_req;
|
|
|
|
LLE_REQ_UNLOCK(lle);
|
|
|
|
|
|
|
|
if (r_skip_req == 0 && lle->la_preempt > 0) {
|
|
|
|
/* Entry was used, issue refresh request */
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
struct epoch_tracker et;
|
2015-12-05 09:50:37 +00:00
|
|
|
struct in_addr dst;
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
|
2015-12-05 09:50:37 +00:00
|
|
|
dst = lle->r_l3addr.addr4;
|
|
|
|
lle->la_preempt--;
|
|
|
|
callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
|
|
|
|
LLE_WUNLOCK(lle);
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_ENTER(et);
|
2015-12-05 09:50:37 +00:00
|
|
|
arprequest(ifp, NULL, &dst, NULL);
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_EXIT(et);
|
2015-12-05 09:50:37 +00:00
|
|
|
CURVNET_RESTORE();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Nothing happened. Reschedule if not too late */
|
|
|
|
if (lle->la_expire > time_uptime) {
|
|
|
|
callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
|
|
|
|
LLE_WUNLOCK(lle);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ARP_LLINFO_INCOMPLETE:
|
|
|
|
case ARP_LLINFO_DELETED:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2013-07-03 17:27:32 +00:00
|
|
|
if ((lle->la_flags & LLE_DELETED) == 0) {
|
2012-08-02 13:57:49 +00:00
|
|
|
int evt;
|
|
|
|
|
|
|
|
if (lle->la_flags & LLE_VALID)
|
|
|
|
evt = LLENTRY_EXPIRED;
|
|
|
|
else
|
|
|
|
evt = LLENTRY_TIMEDOUT;
|
|
|
|
EVENTHANDLER_INVOKE(lle_event, lle, evt);
|
|
|
|
}
|
|
|
|
|
2015-08-11 12:38:54 +00:00
|
|
|
callout_stop(&lle->lle_timer);
|
2012-08-02 13:57:49 +00:00
|
|
|
|
|
|
|
/* XXX: LOR avoidance. We still have ref on lle. */
|
|
|
|
LLE_WUNLOCK(lle);
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
IF_AFDATA_LOCK(ifp);
|
|
|
|
LLE_WLOCK(lle);
|
2012-08-02 13:57:49 +00:00
|
|
|
|
2012-12-13 11:11:15 +00:00
|
|
|
/* Guard against race with other llentry_free(). */
|
|
|
|
if (lle->la_flags & LLE_LINKED) {
|
|
|
|
LLE_REMREF(lle);
|
2015-09-15 06:48:19 +00:00
|
|
|
lltable_unlink_entry(lle->lle_tbl, lle);
|
|
|
|
}
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
IF_AFDATA_UNLOCK(ifp);
|
2012-12-13 11:11:15 +00:00
|
|
|
|
2015-09-15 06:48:19 +00:00
|
|
|
size_t pkts_dropped = llentry_free(lle);
|
|
|
|
|
|
|
|
ARPSTAT_ADD(dropped, pkts_dropped);
|
2012-08-02 13:57:49 +00:00
|
|
|
ARPSTAT_INC(timeouts);
|
2012-12-13 11:11:15 +00:00
|
|
|
|
2009-09-03 21:10:57 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2015-12-31 05:03:27 +00:00
|
|
|
/*
|
|
|
|
* Stores link-layer header for @ifp in format suitable for if_output()
|
|
|
|
* into buffer @buf. Resulting header length is stored in @bufsize.
|
|
|
|
*
|
|
|
|
* Returns 0 on success.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
|
|
|
|
size_t *bufsize)
|
|
|
|
{
|
|
|
|
struct if_encap_req ereq;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* Clear the caller's buffer over the full size it advertised. */
bzero(buf, *bufsize);
|
|
|
|
/* Start from an all-zero request so unset fields (e.g. flags) are 0. */
bzero(&ereq, sizeof(ereq));
|
|
|
|
/* Describe the output buffer: where to write and how much room exists. */
ereq.buf = buf;
|
|
|
|
ereq.bufsize = *bufsize;
|
|
|
|
/* Ask for request type IFENCAP_LL for the AF_ARP family. */
ereq.rtype = IFENCAP_LL;
|
|
|
|
ereq.family = AF_ARP;
|
|
|
|
/*
 * NOTE(review): ar_tha() presumably yields the target hardware address
 * field of the ARP header -- confirm against its definition.
 */
ereq.lladdr = ar_tha(ah);
|
|
|
|
/* The ARP header itself is passed along as the request's header data. */
ereq.hdata = (u_char *)ah;
|
|
|
|
/* Only a non-zero bcast sets the broadcast flag; otherwise flags stays 0. */
if (bcast)
|
|
|
|
ereq.flags = IFENCAP_FLAG_BROADCAST;
|
|
|
|
/* Let the interface's own if_requestencap routine handle the request. */
error = ifp->if_requestencap(ifp, &ereq);
|
|
|
|
/*
 * On success report the size the interface wrote back into the request;
 * on failure *bufsize is left exactly as the caller passed it.
 */
if (error == 0)
|
|
|
|
*bufsize = ereq.bufsize;
|
|
|
|
|
|
|
|
/* Return the if_requestencap result unchanged. */
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Broadcast an ARP request. Caller specifies:
|
|
|
|
* - arp header source ip address
|
|
|
|
* - arp header target ip address
|
|
|
|
* - arp header source ethernet address
|
|
|
|
*/
|
2019-02-24 22:49:56 +00:00
|
|
|
static int
|
|
|
|
arprequest_internal(struct ifnet *ifp, const struct in_addr *sip,
|
2013-04-26 12:50:32 +00:00
|
|
|
const struct in_addr *tip, u_char *enaddr)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2004-03-14 00:44:11 +00:00
|
|
|
struct mbuf *m;
|
|
|
|
struct arphdr *ah;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct sockaddr sa;
|
2012-01-08 17:25:15 +00:00
|
|
|
u_char *carpaddr = NULL;
|
2015-12-31 05:03:27 +00:00
|
|
|
uint8_t linkhdr[LLE_MAX_LINKHDR];
|
|
|
|
size_t linkhdrsize;
|
|
|
|
struct route ro;
|
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_ASSERT();
|
|
|
|
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
if (sip == NULL) {
|
|
|
|
/*
|
|
|
|
* The caller did not supply a source address, try to find
|
|
|
|
* a compatible one among those assigned to this interface.
|
|
|
|
*/
|
|
|
|
struct ifaddr *ifa;
|
|
|
|
|
2018-05-18 20:13:34 +00:00
|
|
|
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
|
2012-01-08 17:25:15 +00:00
|
|
|
if (ifa->ifa_addr->sa_family != AF_INET)
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
continue;
|
2012-01-08 17:25:15 +00:00
|
|
|
|
|
|
|
if (ifa->ifa_carp) {
|
|
|
|
if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
|
|
|
|
continue;
|
|
|
|
sip = &IA_SIN(ifa)->sin_addr;
|
|
|
|
} else {
|
|
|
|
carpaddr = NULL;
|
|
|
|
sip = &IA_SIN(ifa)->sin_addr;
|
|
|
|
}
|
|
|
|
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
if (0 == ((sip->s_addr ^ tip->s_addr) &
|
2012-01-08 17:25:15 +00:00
|
|
|
IA_MASKSIN(ifa)->sin_addr.s_addr))
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
break; /* found it. */
|
|
|
|
}
|
2012-07-31 11:31:12 +00:00
|
|
|
if (sip == NULL) {
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
printf("%s: cannot find matching address\n", __func__);
|
2019-02-24 22:49:56 +00:00
|
|
|
return (EADDRNOTAVAIL);
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
}
|
|
|
|
}
|
2012-01-08 17:25:15 +00:00
|
|
|
if (enaddr == NULL)
|
|
|
|
enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
|
2019-02-24 22:49:56 +00:00
|
|
|
return (ENOMEM);
|
2014-08-31 06:30:50 +00:00
|
|
|
m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
|
|
|
|
2 * ifp->if_addrlen;
|
2004-03-21 18:51:26 +00:00
|
|
|
m->m_pkthdr.len = m->m_len;
|
To ease changes to underlying mbuf structure and the mbuf allocator, reduce
the knowledge of mbuf layout, and in particular constants such as M_EXT,
MLEN, MHLEN, and so on, in mbuf consumers by unifying various alignment
utility functions (M_ALIGN(), MH_ALIGN(), MEXT_ALIGN() in a single
M_ALIGN() macro, implemented by a now-inlined m_align() function:
- Move m_align() from uipc_mbuf.c to mbuf.h; mark as __inline.
- Reimplement M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() using m_align().
- Update consumers around the tree to simply use M_ALIGN().
This change eliminates a number of cases where mbuf consumers must be aware
of whether or not mbufs returned by the allocator use external storage, but
also assumptions about the size of the returned mbuf. This will make it
easier to introduce changes in how we use external storage, as well as
features such as variable-size mbufs.
Differential Revision: https://reviews.freebsd.org/D1436
Reviewed by: glebius, trasz, gnn, bz
Sponsored by: EMC / Isilon Storage Division
2015-01-05 09:58:32 +00:00
|
|
|
M_ALIGN(m, m->m_len);
|
2004-03-21 18:51:26 +00:00
|
|
|
ah = mtod(m, struct arphdr *);
|
2004-03-21 06:36:05 +00:00
|
|
|
bzero((caddr_t)ah, m->m_len);
|
|
|
|
#ifdef MAC
|
2007-10-28 15:55:23 +00:00
|
|
|
mac_netinet_arp_send(ifp, m);
|
2004-03-21 06:36:05 +00:00
|
|
|
#endif
|
2001-10-14 20:17:53 +00:00
|
|
|
ah->ar_pro = htons(ETHERTYPE_IP);
|
|
|
|
ah->ar_hln = ifp->if_addrlen; /* hardware address length */
|
|
|
|
ah->ar_pln = sizeof(struct in_addr); /* protocol address length */
|
|
|
|
ah->ar_op = htons(ARPOP_REQUEST);
|
2013-04-26 12:50:32 +00:00
|
|
|
bcopy(enaddr, ar_sha(ah), ah->ar_hln);
|
|
|
|
bcopy(sip, ar_spa(ah), ah->ar_pln);
|
|
|
|
bcopy(tip, ar_tpa(ah), ah->ar_pln);
|
2004-03-21 06:36:05 +00:00
|
|
|
sa.sa_family = AF_ARP;
|
|
|
|
sa.sa_len = 2;
|
2015-12-31 05:03:27 +00:00
|
|
|
|
|
|
|
/* Calculate link header for sending frame */
|
|
|
|
bzero(&ro, sizeof(ro));
|
|
|
|
linkhdrsize = sizeof(linkhdr);
|
|
|
|
error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
|
|
|
|
if (error != 0 && error != EAFNOSUPPORT) {
|
|
|
|
ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
|
|
|
|
if_name(ifp), error);
|
2019-02-24 22:49:56 +00:00
|
|
|
return (error);
|
2015-12-31 05:03:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ro.ro_prepend = linkhdr;
|
|
|
|
ro.ro_plen = linkhdrsize;
|
|
|
|
ro.ro_flags = 0;
|
|
|
|
|
2004-03-21 06:36:05 +00:00
|
|
|
m->m_flags |= M_BCAST;
|
2013-08-19 13:27:32 +00:00
|
|
|
m_clrprotoflags(m); /* Avoid confusing lower layers. */
|
2019-02-24 22:49:56 +00:00
|
|
|
error = (*ifp->if_output)(ifp, m, &sa, &ro);
|
2009-09-03 21:10:57 +00:00
|
|
|
ARPSTAT_INC(txrequests);
|
2019-03-09 01:12:59 +00:00
|
|
|
if (error) {
|
|
|
|
ARPSTAT_INC(txerrors);
|
2019-02-24 22:49:56 +00:00
|
|
|
ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n",
|
|
|
|
if_name(ifp), error);
|
2019-03-09 01:12:59 +00:00
|
|
|
}
|
2019-02-24 22:49:56 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2019-02-24 22:49:56 +00:00
|
|
|
void
|
|
|
|
arprequest(struct ifnet *ifp, const struct in_addr *sip,
|
|
|
|
const struct in_addr *tip, u_char *enaddr)
|
|
|
|
{
|
|
|
|
|
|
|
|
(void) arprequest_internal(ifp, sip, tip, enaddr);
|
|
|
|
}
|
2015-12-31 05:03:27 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2015-08-16 12:23:58 +00:00
|
|
|
* Resolve an IP address into an ethernet address - heavy version.
|
|
|
|
* Used internally by arpresolve().
|
2018-11-17 16:13:09 +00:00
|
|
|
* We have already checked that we can't use an existing lle without
|
|
|
|
* modification so we have to acquire an LLE_EXCLUSIVE lle lock.
|
This commit does two things:
1. rt_check() cleanup:
rt_check() is only necessary for some address families to gain access
to the corresponding arp entry, so call it only in/near the *resolve()
routines where it is actually used -- at the moment this is
arpresolve(), nd6_storelladdr() (the call is embedded here),
and atmresolve() (the call is just before atmresolve to reduce
the number of changes).
This change will make it a lot easier to decouple the arp table
from the routing table.
There is an extra call to rt_check() in if_iso88025subr.c to
determine the routing info length. I have left it alone for
the time being.
The interface of arpresolve() and nd6_storelladdr() now changes slightly:
+ the 'rtentry' parameter (really a hint from the upper level layer)
is now passed unchanged from *_output(), so it becomes the route
to the final destination and not to the gateway.
+ the routines will return 0 if resolution is possible, non-zero
otherwise.
+ arpresolve() returns EWOULDBLOCK in case the mbuf is being held
waiting for an arp reply -- in this case the error code is masked
in the caller so the upper layer protocol will not see a failure.
2. arpcom untangling
Where possible, use 'struct ifnet' instead of 'struct arpcom' variables,
and use the IFP2AC macro to access arpcom fields.
This mostly affects the netatalk code.
=== Detailed changes: ===
net/if_arcsubr.c
rt_check() cleanup, remove a useless variable
net/if_atmsubr.c
rt_check() cleanup
net/if_ethersubr.c
rt_check() cleanup, arpcom untangling
net/if_fddisubr.c
rt_check() cleanup, arpcom untangling
net/if_iso88025subr.c
rt_check() cleanup
netatalk/aarp.c
arpcom untangling, remove a block of duplicated code
netatalk/at_extern.h
arpcom untangling
netinet/if_ether.c
rt_check() cleanup (change arpresolve)
netinet6/nd6.c
rt_check() cleanup (change nd6_storelladdr)
2004-04-25 09:24:52 +00:00
|
|
|
*
|
2018-11-17 16:13:09 +00:00
|
|
|
* On success, desten and pflags are filled in and the function returns 0;
|
This commit does two things:
1. rt_check() cleanup:
rt_check() is only necessary for some address families to gain access
to the corresponding arp entry, so call it only in/near the *resolve()
routines where it is actually used -- at the moment this is
arpresolve(), nd6_storelladdr() (the call is embedded here),
and atmresolve() (the call is just before atmresolve to reduce
the number of changes).
This change will make it a lot easier to decouple the arp table
from the routing table.
There is an extra call to rt_check() in if_iso88025subr.c to
determine the routing info length. I have left it alone for
the time being.
The interface of arpresolve() and nd6_storelladdr() now changes slightly:
+ the 'rtentry' parameter (really a hint from the upper level layer)
is now passed unchanged from *_output(), so it becomes the route
to the final destination and not to the gateway.
+ the routines will return 0 if resolution is possible, non-zero
otherwise.
+ arpresolve() returns EWOULDBLOCK in case the mbuf is being held
waiting for an arp reply -- in this case the error code is masked
in the caller so the upper layer protocol will not see a failure.
2. arpcom untangling
Where possible, use 'struct ifnet' instead of 'struct arpcom' variables,
and use the IFP2AC macro to access arpcom fields.
This mostly affects the netatalk code.
=== Detailed changes: ===
net/if_arcsubr.c
rt_check() cleanup, remove a useless variable
net/if_atmsubr.c
rt_check() cleanup
net/if_ethersubr.c
rt_check() cleanup, arpcom untangling
net/if_fddisubr.c
rt_check() cleanup, arpcom untangling
net/if_iso88025subr.c
rt_check() cleanup
netatalk/aarp.c
arpcom untangling, remove a block of duplicated code
netatalk/at_extern.h
arpcom untangling
netinet/if_ether.c
rt_check() cleanup (change arpresolve)
netinet6/nd6.c
rt_check() cleanup (change nd6_storelladdr)
2004-04-25 09:24:52 +00:00
|
|
|
* If the packet must be held pending resolution, we return EWOULDBLOCK
|
|
|
|
* On other errors, we return the corresponding error code.
|
2007-12-31 23:48:06 +00:00
|
|
|
* Note that m_freem() handles NULL.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2015-08-16 12:23:58 +00:00
|
|
|
static int
|
2015-12-31 05:03:27 +00:00
|
|
|
arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
|
2016-06-02 17:51:29 +00:00
|
|
|
const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
|
|
|
|
struct llentry **plle)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2015-08-20 12:05:17 +00:00
|
|
|
struct llentry *la = NULL, *la_tmp;
|
2010-11-12 22:03:02 +00:00
|
|
|
struct mbuf *curr = NULL;
|
|
|
|
struct mbuf *next = NULL;
|
2015-08-16 12:23:58 +00:00
|
|
|
int error, renew;
|
2015-12-31 05:03:27 +00:00
|
|
|
char *lladdr;
|
|
|
|
int ll_len;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_ASSERT();
|
|
|
|
|
2014-11-27 23:06:25 +00:00
|
|
|
if (pflags != NULL)
|
|
|
|
*pflags = 0;
|
2016-06-02 17:51:29 +00:00
|
|
|
if (plle != NULL)
|
|
|
|
*plle = NULL;
|
2014-11-27 23:06:25 +00:00
|
|
|
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
if ((flags & LLE_CREATE) == 0)
|
2015-08-16 12:23:58 +00:00
|
|
|
la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
|
|
|
|
if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
|
2015-08-20 12:05:17 +00:00
|
|
|
la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
|
|
|
|
if (la == NULL) {
|
2017-02-16 20:47:41 +00:00
|
|
|
char addrbuf[INET_ADDRSTRLEN];
|
|
|
|
|
2015-08-20 12:05:17 +00:00
|
|
|
log(LOG_DEBUG,
|
|
|
|
"arpresolve: can't allocate llinfo for %s on %s\n",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(SIN(dst)->sin_addr, addrbuf),
|
|
|
|
if_name(ifp));
|
2015-08-20 12:05:17 +00:00
|
|
|
m_freem(m);
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
2012-08-01 09:00:26 +00:00
|
|
|
IF_AFDATA_WLOCK(ifp);
|
2015-08-20 12:05:17 +00:00
|
|
|
LLE_WLOCK(la);
|
|
|
|
la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
|
|
|
|
/* Prefer ANY existing lle over newly-created one */
|
|
|
|
if (la_tmp == NULL)
|
|
|
|
lltable_link_entry(LLTABLE(ifp), la);
|
2012-08-01 09:00:26 +00:00
|
|
|
IF_AFDATA_WUNLOCK(ifp);
|
2015-08-20 12:05:17 +00:00
|
|
|
if (la_tmp != NULL) {
|
|
|
|
lltable_free_entry(LLTABLE(ifp), la);
|
|
|
|
la = la_tmp;
|
|
|
|
}
|
2008-12-17 00:14:28 +00:00
|
|
|
}
|
2005-08-11 08:25:48 +00:00
|
|
|
if (la == NULL) {
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
m_freem(m);
|
|
|
|
return (EINVAL);
|
2012-07-31 11:31:12 +00:00
|
|
|
}
|
2005-09-09 10:06:27 +00:00
|
|
|
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
if ((la->la_flags & LLE_VALID) &&
|
2010-11-30 15:57:00 +00:00
|
|
|
((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
|
2015-12-31 05:03:27 +00:00
|
|
|
if (flags & LLE_ADDRONLY) {
|
|
|
|
lladdr = la->ll_addr;
|
|
|
|
ll_len = ifp->if_addrlen;
|
|
|
|
} else {
|
|
|
|
lladdr = la->r_linkdata;
|
|
|
|
ll_len = la->r_hdrlen;
|
|
|
|
}
|
|
|
|
bcopy(lladdr, desten, ll_len);
|
2012-08-01 09:00:26 +00:00
|
|
|
|
2018-03-17 17:05:48 +00:00
|
|
|
/* Notify LLE code that the entry was used by datapath */
|
|
|
|
llentry_mark_used(la);
|
2014-11-27 23:06:25 +00:00
|
|
|
if (pflags != NULL)
|
2015-12-05 09:50:37 +00:00
|
|
|
*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
|
2016-06-02 17:51:29 +00:00
|
|
|
if (plle) {
|
|
|
|
LLE_ADDREF(la);
|
|
|
|
*plle = la;
|
|
|
|
}
|
2015-08-16 12:23:58 +00:00
|
|
|
LLE_WUNLOCK(la);
|
2015-06-17 12:23:04 +00:00
|
|
|
return (0);
|
2012-07-31 11:31:12 +00:00
|
|
|
}
|
|
|
|
|
2010-11-30 15:57:00 +00:00
|
|
|
renew = (la->la_asked == 0 || la->la_expire != time_uptime);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* There is an arptab entry, but no ethernet address
|
2010-11-12 22:03:02 +00:00
|
|
|
* response yet. Add the mbuf to the list, dropping
|
|
|
|
* the oldest packet if we have exceeded the system
|
|
|
|
* setting.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
if (m != NULL) {
|
2010-11-12 22:03:02 +00:00
|
|
|
if (la->la_numheld >= V_arp_maxhold) {
|
|
|
|
if (la->la_hold != NULL) {
|
|
|
|
next = la->la_hold->m_nextpkt;
|
|
|
|
m_freem(la->la_hold);
|
|
|
|
la->la_hold = next;
|
|
|
|
la->la_numheld--;
|
|
|
|
ARPSTAT_INC(dropped);
|
|
|
|
}
|
2012-07-31 11:31:12 +00:00
|
|
|
}
|
2009-09-03 21:10:57 +00:00
|
|
|
if (la->la_hold != NULL) {
|
2010-11-12 22:03:02 +00:00
|
|
|
curr = la->la_hold;
|
|
|
|
while (curr->m_nextpkt != NULL)
|
|
|
|
curr = curr->m_nextpkt;
|
|
|
|
curr->m_nextpkt = m;
|
2012-07-31 11:31:12 +00:00
|
|
|
} else
|
2010-11-12 22:03:02 +00:00
|
|
|
la->la_hold = m;
|
|
|
|
la->la_numheld++;
|
2007-12-17 04:19:25 +00:00
|
|
|
}
|
2005-11-08 12:05:57 +00:00
|
|
|
/*
|
|
|
|
* Return EWOULDBLOCK if we have tried less than arp_maxtries. It
|
|
|
|
* will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
|
|
|
|
* if we have already sent arp_maxtries ARP requests. Retransmit the
|
|
|
|
* ARP request, but not faster than one request per second.
|
|
|
|
*/
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
if (la->la_asked < V_arp_maxtries)
|
2005-11-08 12:05:57 +00:00
|
|
|
error = EWOULDBLOCK; /* First request. */
|
|
|
|
else
|
2014-11-27 23:06:25 +00:00
|
|
|
error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
|
2005-11-08 12:05:57 +00:00
|
|
|
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
if (renew) {
|
2019-02-24 22:49:56 +00:00
|
|
|
int canceled, e;
|
2010-04-11 16:04:08 +00:00
|
|
|
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
LLE_ADDREF(la);
|
2010-11-30 15:57:00 +00:00
|
|
|
la->la_expire = time_uptime;
|
2015-08-11 12:38:54 +00:00
|
|
|
canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
|
2010-04-11 16:04:08 +00:00
|
|
|
arptimer, la);
|
|
|
|
if (canceled)
|
|
|
|
LLE_REMREF(la);
|
2007-01-14 18:44:17 +00:00
|
|
|
la->la_asked++;
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
LLE_WUNLOCK(la);
|
2019-02-24 22:49:56 +00:00
|
|
|
e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL);
|
|
|
|
/*
|
|
|
|
* Only overwrite 'error' in case of error; in case of success
|
|
|
|
* the proper return value was already set above.
|
|
|
|
*/
|
|
|
|
if (e != 0)
|
|
|
|
return (e);
|
This main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as much locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplify the logic in the routing code,
The most notable end result is the obsolescent of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2015-08-16 12:23:58 +00:00
|
|
|
|
|
|
|
LLE_WUNLOCK(la);
|
2005-11-08 12:05:57 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2015-12-31 05:03:27 +00:00
|
|
|
/*
|
|
|
|
* Lookups link header based on an IP address.
|
2015-08-16 12:23:58 +00:00
|
|
|
* On input:
|
|
|
|
* ifp is the interface we use
|
|
|
|
* is_gw != 0 if @dst represents gateway to some destination
|
|
|
|
* m is the mbuf. May be NULL if we don't have a packet.
|
|
|
|
* dst is the next hop,
|
2015-12-31 05:03:27 +00:00
|
|
|
* desten is the storage to put LL header.
|
2015-12-05 09:50:37 +00:00
|
|
|
* flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
|
2015-08-16 12:23:58 +00:00
|
|
|
*
|
2015-12-31 05:03:27 +00:00
|
|
|
* On success, full/partial link header and flags are filled in and
|
|
|
|
* the function returns 0.
|
2015-08-16 12:23:58 +00:00
|
|
|
* If the packet must be held pending resolution, we return EWOULDBLOCK
|
|
|
|
* On other errors, we return the corresponding error code.
|
|
|
|
* Note that m_freem() handles NULL.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
|
2016-06-02 17:51:29 +00:00
|
|
|
const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
|
|
|
|
struct llentry **plle)
|
2015-08-16 12:23:58 +00:00
|
|
|
{
|
2016-04-15 15:46:41 +00:00
|
|
|
struct llentry *la = NULL;
|
2015-08-16 12:23:58 +00:00
|
|
|
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_ASSERT();
|
|
|
|
|
2015-08-16 12:23:58 +00:00
|
|
|
if (pflags != NULL)
|
|
|
|
*pflags = 0;
|
2016-06-02 17:51:29 +00:00
|
|
|
if (plle != NULL)
|
|
|
|
*plle = NULL;
|
2015-08-16 12:23:58 +00:00
|
|
|
|
|
|
|
if (m != NULL) {
|
|
|
|
if (m->m_flags & M_BCAST) {
|
|
|
|
/* broadcast */
|
|
|
|
(void)memcpy(desten,
|
|
|
|
ifp->if_broadcastaddr, ifp->if_addrlen);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if (m->m_flags & M_MCAST) {
|
|
|
|
/* multicast */
|
|
|
|
ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-22 02:11:49 +00:00
|
|
|
la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst);
|
2015-12-05 09:50:37 +00:00
|
|
|
if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
|
|
|
|
/* Entry found, let's copy lle info */
|
2015-12-31 05:03:27 +00:00
|
|
|
bcopy(la->r_linkdata, desten, la->r_hdrlen);
|
2015-08-16 12:23:58 +00:00
|
|
|
if (pflags != NULL)
|
2015-12-05 09:50:37 +00:00
|
|
|
*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
|
2018-03-17 17:05:48 +00:00
|
|
|
/* Notify the LLE handling code that the entry was used. */
|
|
|
|
llentry_mark_used(la);
|
2016-07-22 02:11:49 +00:00
|
|
|
if (plle) {
|
|
|
|
LLE_ADDREF(la);
|
|
|
|
*plle = la;
|
|
|
|
LLE_WUNLOCK(la);
|
|
|
|
}
|
2015-08-16 12:23:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
2016-07-22 02:11:49 +00:00
|
|
|
if (plle && la)
|
|
|
|
LLE_WUNLOCK(la);
|
2015-08-16 12:23:58 +00:00
|
|
|
|
2015-12-31 05:03:27 +00:00
|
|
|
return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst,
|
2016-06-02 17:51:29 +00:00
|
|
|
desten, pflags, plle));
|
2015-08-16 12:23:58 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Common length and type checks are done here,
|
|
|
|
* then the protocol-specific routine is called.
|
|
|
|
*/
|
1995-12-09 16:06:54 +00:00
|
|
|
static void
|
2003-03-04 23:19:55 +00:00
|
|
|
arpintr(struct mbuf *m)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-03-04 23:19:55 +00:00
|
|
|
struct arphdr *ar;
|
2015-09-15 08:50:44 +00:00
|
|
|
struct ifnet *ifp;
|
2015-09-14 10:28:47 +00:00
|
|
|
char *layer;
|
|
|
|
int hlen;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2015-09-15 08:50:44 +00:00
|
|
|
ifp = m->m_pkthdr.rcvif;
|
|
|
|
|
2003-03-04 23:19:55 +00:00
|
|
|
if (m->m_len < sizeof(struct arphdr) &&
|
|
|
|
((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
|
2015-09-15 08:50:44 +00:00
|
|
|
ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
|
|
|
|
if_name(ifp));
|
2003-03-04 23:19:55 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
ar = mtod(m, struct arphdr *);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2015-09-14 10:28:47 +00:00
|
|
|
/* Check if length is sufficient */
|
Fix regression from r287779, that bite me. If we call m_pullup()
unconditionally, we end up with an mbuf chain of two mbufs, which
later in in_arpreply() is rewritten from ARP request to ARP reply
and is sent out. Looks like igb(4) (at least mine, and at least
at my network) fails on such mbuf chain, so ARP reply doesn't go
out wire. Thus, make the m_pullup() call conditional, as it is
everywhere. Of course, the bug in igb(?) should be investigated,
but better first fix the head. And unconditional m_pullup() was
suboptimal, anyway.
2015-10-07 13:10:26 +00:00
|
|
|
if (m->m_len < arphdr_len(ar)) {
|
|
|
|
m = m_pullup(m, arphdr_len(ar));
|
|
|
|
if (m == NULL) {
|
|
|
|
ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
|
|
|
|
if_name(ifp));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ar = mtod(m, struct arphdr *);
|
2015-09-14 10:28:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
hlen = 0;
|
|
|
|
layer = "";
|
|
|
|
switch (ntohs(ar->ar_hrd)) {
|
|
|
|
case ARPHRD_ETHER:
|
|
|
|
hlen = ETHER_ADDR_LEN; /* RFC 826 */
|
|
|
|
layer = "ethernet";
|
|
|
|
break;
|
|
|
|
case ARPHRD_INFINIBAND:
|
2020-02-12 13:31:36 +00:00
|
|
|
hlen = 20; /* RFC 4391, INFINIBAND_ALEN */
|
2015-09-14 10:28:47 +00:00
|
|
|
layer = "infiniband";
|
|
|
|
break;
|
|
|
|
case ARPHRD_IEEE1394:
|
|
|
|
hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
|
|
|
|
layer = "firewire";
|
|
|
|
|
|
|
|
/*
|
2015-12-25 14:51:36 +00:00
|
|
|
* Restrict too long hardware addresses.
|
2015-09-14 10:28:47 +00:00
|
|
|
* Currently we are capable of handling 20-byte
|
|
|
|
* addresses ( sizeof(lle->ll_addr) )
|
|
|
|
*/
|
|
|
|
if (ar->ar_hln >= 20)
|
|
|
|
hlen = 16;
|
|
|
|
break;
|
|
|
|
default:
|
2015-09-15 08:50:44 +00:00
|
|
|
ARP_LOG(LOG_NOTICE,
|
2015-12-25 14:51:36 +00:00
|
|
|
"packet with unknown hardware format 0x%02d received on "
|
|
|
|
"%s\n", ntohs(ar->ar_hrd), if_name(ifp));
|
2003-03-04 23:19:55 +00:00
|
|
|
m_freem(m);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-09-14 10:28:47 +00:00
|
|
|
if (hlen != 0 && hlen != ar->ar_hln) {
|
2015-09-15 08:50:44 +00:00
|
|
|
ARP_LOG(LOG_NOTICE,
|
|
|
|
"packet with invalid %s address length %d received on %s\n",
|
|
|
|
layer, ar->ar_hln, if_name(ifp));
|
2015-09-14 10:28:47 +00:00
|
|
|
m_freem(m);
|
|
|
|
return;
|
2003-03-04 23:19:55 +00:00
|
|
|
}
|
2000-03-11 00:24:29 +00:00
|
|
|
|
2009-09-03 21:10:57 +00:00
|
|
|
ARPSTAT_INC(received);
|
2003-03-04 23:19:55 +00:00
|
|
|
switch (ntohs(ar->ar_pro)) {
|
1998-01-08 23:42:31 +00:00
|
|
|
#ifdef INET
|
2003-03-04 23:19:55 +00:00
|
|
|
case ETHERTYPE_IP:
|
|
|
|
in_arpinput(m);
|
|
|
|
return;
|
1998-01-08 23:42:31 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2003-03-04 23:19:55 +00:00
|
|
|
m_freem(m);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1998-01-08 23:42:31 +00:00
|
|
|
#ifdef INET
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* ARP for Internet protocols on 10 Mb/s Ethernet.
|
|
|
|
* Algorithm is that given in RFC 826.
|
|
|
|
* In addition, a sanity check is performed on the sender
|
|
|
|
* protocol address, to catch impersonators.
|
|
|
|
* We no longer handle negotiations for use of trailer protocol:
|
|
|
|
* Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
|
|
|
|
* along with IP replies if we wanted trailers sent to us,
|
|
|
|
* and also sent them in response to IP replies.
|
|
|
|
* This allowed either end to announce the desire to receive
|
|
|
|
* trailer packets.
|
|
|
|
* We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
|
|
|
|
* but formerly didn't normally send requests.
|
|
|
|
*/
|
2001-01-06 00:45:08 +00:00
|
|
|
static int log_arp_wrong_iface = 1;
|
2001-09-03 21:53:15 +00:00
|
|
|
static int log_arp_movements = 1;
|
2005-12-18 19:11:56 +00:00
|
|
|
static int log_arp_permanent_modify = 1;
|
2012-09-03 14:29:28 +00:00
|
|
|
static int allow_multicast = 0;
|
2001-01-06 00:45:08 +00:00
|
|
|
|
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
|
|
|
|
&log_arp_wrong_iface, 0,
|
|
|
|
"log arp packets arriving on the wrong interface");
|
2001-09-03 21:53:15 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
|
2012-07-31 11:31:12 +00:00
|
|
|
&log_arp_movements, 0,
|
|
|
|
"log arp replies from MACs different than the one in the cache");
|
2005-12-18 19:11:56 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
|
2012-07-31 11:31:12 +00:00
|
|
|
&log_arp_permanent_modify, 0,
|
|
|
|
"log arp replies from MACs different than the one in the permanent arp entry");
|
2012-09-03 14:29:28 +00:00
|
|
|
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
|
|
|
|
&allow_multicast, 0, "accept multicast addresses");
|
2001-01-06 00:45:08 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
static void
|
2007-05-10 15:58:48 +00:00
|
|
|
in_arpinput(struct mbuf *m)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2015-07-29 08:12:05 +00:00
|
|
|
struct rm_priotracker in_ifa_tracker;
|
2004-03-14 00:44:11 +00:00
|
|
|
struct arphdr *ah;
|
|
|
|
struct ifnet *ifp = m->m_pkthdr.rcvif;
|
2015-08-20 12:05:17 +00:00
|
|
|
struct llentry *la = NULL, *la_tmp;
|
2001-09-29 04:34:11 +00:00
|
|
|
struct ifaddr *ifa;
|
|
|
|
struct in_ifaddr *ia;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct sockaddr sa;
|
|
|
|
struct in_addr isaddr, itaddr, myaddr;
|
2005-02-22 13:04:05 +00:00
|
|
|
u_int8_t *enaddr = NULL;
|
2015-08-13 13:38:09 +00:00
|
|
|
int op;
|
2008-08-18 09:06:11 +00:00
|
|
|
int bridged = 0, is_bridge = 0;
|
2015-08-13 13:38:09 +00:00
|
|
|
int carped;
|
2007-12-12 20:53:25 +00:00
|
|
|
struct sockaddr_in sin;
|
2015-08-20 12:05:17 +00:00
|
|
|
struct sockaddr *dst;
|
2020-07-02 21:04:08 +00:00
|
|
|
struct nhop_object *nh;
|
2015-12-31 05:03:27 +00:00
|
|
|
uint8_t linkhdr[LLE_MAX_LINKHDR];
|
|
|
|
struct route ro;
|
|
|
|
size_t linkhdrsize;
|
|
|
|
int lladdr_off;
|
|
|
|
int error;
|
2017-02-16 20:47:41 +00:00
|
|
|
char addrbuf[INET_ADDRSTRLEN];
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
|
|
|
|
NET_EPOCH_ASSERT();
|
2015-12-09 11:14:27 +00:00
|
|
|
|
2007-12-12 20:53:25 +00:00
|
|
|
sin.sin_len = sizeof(struct sockaddr_in);
|
|
|
|
sin.sin_family = AF_INET;
|
2007-12-17 07:40:34 +00:00
|
|
|
sin.sin_addr.s_addr = 0;
|
Step 1.5 of importing the network stack virtualization infrastructure
from the vimage project, as per plan established at devsummit 08/08:
http://wiki.freebsd.org/Image/Notes200808DevSummit
Introduce INIT_VNET_*() initializer macros, VNET_FOREACH() iterator
macros, and CURVNET_SET() context setting macros, all currently
resolving to NOPs.
Prepare for virtualization of selected SYSCTL objects by introducing a
family of SYSCTL_V_*() macros, currently resolving to their global
counterparts, i.e. SYSCTL_V_INT() == SYSCTL_INT().
Move selected #defines from sys/sys/vimage.h to newly introduced header
files specific to virtualized subsystems (sys/net/vnet.h,
sys/netinet/vinet.h etc.).
All the changes are verified to have zero functional impact at this
point in time by doing MD5 comparison between pre- and post-change
object files(*).
(*) netipsec/keysock.c did not validate depending on compile time options.
Implemented by: julian, bz, brooks, zec
Reviewed by: julian, bz, brooks, kris, rwatson, ...
Approved by: julian (mentor)
Obtained from: //depot/projects/vimage-commit2/...
X-MFC after: never
Sponsored by: NLnet Foundation, The FreeBSD Foundation
2008-10-02 15:37:58 +00:00
|
|
|
|
2006-01-29 23:21:01 +00:00
|
|
|
if (ifp->if_bridge)
|
2005-06-05 03:13:13 +00:00
|
|
|
bridged = 1;
|
2008-08-18 09:06:11 +00:00
|
|
|
if (ifp->if_type == IFT_BRIDGE)
|
|
|
|
is_bridge = 1;
|
2005-06-05 03:13:13 +00:00
|
|
|
|
2015-09-15 08:50:44 +00:00
|
|
|
/*
|
|
|
|
* We already have checked that mbuf contains enough contiguous data
|
|
|
|
* to hold entire arp message according to the arp header.
|
|
|
|
*/
|
2001-10-14 20:17:53 +00:00
|
|
|
ah = mtod(m, struct arphdr *);
|
2015-09-15 08:50:44 +00:00
|
|
|
|
2012-07-31 11:31:12 +00:00
|
|
|
/*
|
2011-01-12 19:11:17 +00:00
|
|
|
* ARP is only for IPv4 so we can reject packets with
|
|
|
|
* a protocol length not equal to an IPv4 address.
|
|
|
|
*/
|
|
|
|
if (ah->ar_pln != sizeof(struct in_addr)) {
|
2013-05-11 10:51:32 +00:00
|
|
|
ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
|
2011-01-12 19:11:17 +00:00
|
|
|
sizeof(struct in_addr));
|
2013-04-25 17:38:04 +00:00
|
|
|
goto drop;
|
2011-01-12 19:11:17 +00:00
|
|
|
}
|
|
|
|
|
2012-09-03 14:29:28 +00:00
|
|
|
if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
|
2013-05-11 10:51:32 +00:00
|
|
|
ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
|
2011-11-21 12:07:18 +00:00
|
|
|
ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
|
2013-04-25 17:38:04 +00:00
|
|
|
goto drop;
|
2011-01-12 19:11:17 +00:00
|
|
|
}
|
|
|
|
|
2001-10-14 20:17:53 +00:00
|
|
|
op = ntohs(ah->ar_op);
|
|
|
|
(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
|
|
|
|
(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
|
2004-09-09 12:34:07 +00:00
|
|
|
|
2009-09-03 21:10:57 +00:00
|
|
|
if (op == ARPOP_REPLY)
|
|
|
|
ARPSTAT_INC(rxreplies);
|
|
|
|
|
2001-09-29 04:34:11 +00:00
|
|
|
/*
|
|
|
|
* For a bridge, we want to check the address irrespective
|
|
|
|
* of the receive interface. (This will change slightly
|
|
|
|
* when we have clusters of interfaces).
|
|
|
|
*/
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RLOCK(&in_ifa_tracker);
|
2005-03-09 10:00:01 +00:00
|
|
|
LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
|
2011-01-25 17:15:23 +00:00
|
|
|
if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
ia->ia_ifp == ifp) &&
|
A major overhaul of the CARP implementation. The ip_carp.c was started
from scratch, copying needed functionality from the old implementation
on demand, with a thorough review of all code. The main change is that
interface layer has been removed from the CARP. Now redundant addresses
are configured exactly on the interfaces, they run on.
The CARP configuration itself is, as before, configured and read via
SIOCSVH/SIOCGVH ioctls. A new prefix created with SIOCAIFADDR or
SIOCAIFADDR_IN6 may now be configured to a particular virtual host id,
which makes the prefix redundant.
ifconfig(8) semantics has been changed too: now one doesn't need
to clone carpXX interface, he/she should directly configure a vhid
on a Ethernet interface.
To supply vhid data from the kernel to an application the getifaddrs(8)
function had been changed to pass ifam_data with each address. [1]
The new implementation definitely closes all PRs related to carp(4)
being an interface, and may close several others. It also allows
to run a single redundant IP per interface.
Big thanks to Bjoern Zeeb for his help with inet6 part of patch, for
idea on using ifam_data and for several rounds of reviewing!
PR: kern/117000, kern/126945, kern/126714, kern/120130, kern/117448
Reviewed by: bz
Submitted by: bz [1]
2011-12-16 12:16:56 +00:00
|
|
|
itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
|
|
|
|
(ia->ia_ifa.ifa_carp == NULL ||
|
|
|
|
(*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
|
2009-06-24 10:33:35 +00:00
|
|
|
ifa_ref(&ia->ia_ifa);
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
|
2001-09-29 04:34:11 +00:00
|
|
|
goto match;
|
2005-03-09 10:00:01 +00:00
|
|
|
}
|
|
|
|
}
|
2001-09-29 04:34:11 +00:00
|
|
|
LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
|
2011-01-25 17:15:23 +00:00
|
|
|
if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
ia->ia_ifp == ifp) &&
|
2009-06-24 10:33:35 +00:00
|
|
|
isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
|
|
|
|
ifa_ref(&ia->ia_ifa);
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
|
2001-09-29 04:34:11 +00:00
|
|
|
goto match;
|
2009-06-24 10:33:35 +00:00
|
|
|
}
|
2008-08-18 09:06:11 +00:00
|
|
|
|
|
|
|
#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \
|
|
|
|
(ia->ia_ifp->if_bridge == ifp->if_softc && \
|
|
|
|
!bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \
|
|
|
|
addr == ia->ia_addr.sin_addr.s_addr)
|
|
|
|
/*
|
|
|
|
* Check the case when bridge shares its MAC address with
|
|
|
|
* some of its children, so packets are claimed by bridge
|
|
|
|
* itself (bridge_input() does it first), but they are really
|
|
|
|
* meant to be destined to the bridge member.
|
|
|
|
*/
|
|
|
|
if (is_bridge) {
|
|
|
|
LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
|
|
|
|
if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
|
2009-06-24 10:33:35 +00:00
|
|
|
ifa_ref(&ia->ia_ifa);
|
2008-08-18 09:06:11 +00:00
|
|
|
ifp = ia->ia_ifp;
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
|
2008-08-18 09:06:11 +00:00
|
|
|
goto match;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#undef BDG_MEMBER_MATCHES_ARP
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
|
2008-08-18 09:06:11 +00:00
|
|
|
|
2001-09-29 04:34:11 +00:00
|
|
|
/*
|
2001-10-20 05:14:06 +00:00
|
|
|
* No match, use the first inet address on the receive interface
|
2001-09-29 04:34:11 +00:00
|
|
|
* as a dummy address for the rest of the function.
|
|
|
|
*/
|
2018-05-18 20:13:34 +00:00
|
|
|
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
|
2011-12-29 15:59:14 +00:00
|
|
|
if (ifa->ifa_addr->sa_family == AF_INET &&
|
|
|
|
(ifa->ifa_carp == NULL ||
|
|
|
|
(*carp_iamatch_p)(ifa, &enaddr))) {
|
2001-10-25 06:14:21 +00:00
|
|
|
ia = ifatoia(ifa);
|
2009-06-24 10:33:35 +00:00
|
|
|
ifa_ref(ifa);
|
2001-10-25 06:14:21 +00:00
|
|
|
goto match;
|
|
|
|
}
|
2009-06-24 10:33:35 +00:00
|
|
|
|
2001-10-25 06:14:21 +00:00
|
|
|
/*
|
|
|
|
* If bridging, fall back to using any inet address.
|
|
|
|
*/
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RLOCK(&in_ifa_tracker);
|
2018-05-18 20:13:34 +00:00
|
|
|
if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
|
2004-04-25 15:00:17 +00:00
|
|
|
goto drop;
|
2009-06-25 11:52:33 +00:00
|
|
|
}
|
2009-06-24 10:33:35 +00:00
|
|
|
ifa_ref(&ia->ia_ifa);
|
2015-07-29 08:12:05 +00:00
|
|
|
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
|
2001-09-29 04:34:11 +00:00
|
|
|
match:
|
2005-02-22 13:04:05 +00:00
|
|
|
if (!enaddr)
|
|
|
|
enaddr = (u_int8_t *)IF_LLADDR(ifp);
|
A major overhaul of the CARP implementation. The ip_carp.c was started
from scratch, copying needed functionality from the old implementation
on demand, with a thorough review of all code. The main change is that
interface layer has been removed from the CARP. Now redundant addresses
are configured exactly on the interfaces, they run on.
The CARP configuration itself is, as before, configured and read via
SIOCSVH/SIOCGVH ioctls. A new prefix created with SIOCAIFADDR or
SIOCAIFADDR_IN6 may now be configured to a particular virtual host id,
which makes the prefix redundant.
ifconfig(8) semantics has been changed too: now one doesn't need
to clone carpXX interface, he/she should directly configure a vhid
on a Ethernet interface.
To supply vhid data from the kernel to an application the getifaddrs(8)
function had been changed to pass ifam_data with each address. [1]
The new implementation definitely closes all PRs related to carp(4)
being an interface, and may close several others. It also allows
to run a single redundant IP per interface.
Big thanks to Bjoern Zeeb for his help with inet6 part of patch, for
idea on using ifam_data and for several rounds of reviewing!
PR: kern/117000, kern/126945, kern/126714, kern/120130, kern/117448
Reviewed by: bz
Submitted by: bz [1]
2011-12-16 12:16:56 +00:00
|
|
|
carped = (ia->ia_ifa.ifa_carp != NULL);
|
2001-09-29 04:34:11 +00:00
|
|
|
myaddr = ia->ia_addr.sin_addr;
|
2009-06-24 10:33:35 +00:00
|
|
|
ifa_free(&ia->ia_ifa);
|
2005-02-22 13:04:05 +00:00
|
|
|
if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
|
2004-04-25 15:00:17 +00:00
|
|
|
goto drop; /* it's from me, ignore it. */
|
2001-10-14 20:17:53 +00:00
|
|
|
if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
|
2013-05-11 10:51:32 +00:00
|
|
|
ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
|
2017-02-16 20:47:41 +00:00
|
|
|
"%s!\n", inet_ntoa_r(isaddr, addrbuf));
|
2004-04-25 15:00:17 +00:00
|
|
|
goto drop;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2015-08-13 13:38:09 +00:00
|
|
|
|
|
|
|
if (ifp->if_addrlen != ah->ar_hln) {
|
|
|
|
ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
|
|
|
|
"i/f %d (ignored)\n", ifp->if_addrlen,
|
|
|
|
(u_char *) ar_sha(ah), ":", ah->ar_hln,
|
|
|
|
ifp->if_addrlen);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2004-10-12 17:10:40 +00:00
|
|
|
/*
|
|
|
|
* Warn if another host is using the same IP address, but only if the
|
|
|
|
* IP address isn't 0.0.0.0, which is used for DHCP only, in which
|
|
|
|
* case we suppress the warning to avoid false positive complaints of
|
|
|
|
* potential misconfiguration.
|
|
|
|
*/
|
A major overhaul of the CARP implementation. The ip_carp.c was started
from scratch, copying needed functionality from the old implementation
on demand, with a thorough review of all code. The main change is that
interface layer has been removed from the CARP. Now redundant addresses
are configured exactly on the interfaces, they run on.
The CARP configuration itself is, as before, configured and read via
SIOCSVH/SIOCGVH ioctls. A new prefix created with SIOCAIFADDR or
SIOCAIFADDR_IN6 may now be configured to a particular virtual host id,
which makes the prefix redundant.
ifconfig(8) semantics has been changed too: now one doesn't need
to clone carpXX interface, he/she should directly configure a vhid
on a Ethernet interface.
To supply vhid data from the kernel to an application the getifaddrs(8)
function had been changed to pass ifam_data with each address. [1]
The new implementation definitely closes all PRs related to carp(4)
being an interface, and may close several others. It also allows
to run a single redundant IP per interface.
Big thanks to Bjoern Zeeb for his help with inet6 part of patch, for
idea on using ifam_data and for several rounds of reviewing!
PR: kern/117000, kern/126945, kern/126714, kern/120130, kern/117448
Reviewed by: bz
Submitted by: bz [1]
2011-12-16 12:16:56 +00:00
|
|
|
if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
|
|
|
|
myaddr.s_addr != 0) {
|
2013-05-11 10:51:32 +00:00
|
|
|
ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
|
2001-10-14 20:17:53 +00:00
|
|
|
ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(isaddr, addrbuf), ifp->if_xname);
|
1994-05-24 10:09:53 +00:00
|
|
|
itaddr = myaddr;
|
2009-09-03 21:10:57 +00:00
|
|
|
ARPSTAT_INC(dupips);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto reply;
|
|
|
|
}
|
2003-10-01 08:32:37 +00:00
|
|
|
if (ifp->if_flags & IFF_STATICARP)
|
|
|
|
goto reply;
|
2005-08-11 08:25:48 +00:00
|
|
|
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
bzero(&sin, sizeof(sin));
|
|
|
|
sin.sin_len = sizeof(struct sockaddr_in);
|
|
|
|
sin.sin_family = AF_INET;
|
|
|
|
sin.sin_addr = isaddr;
|
2015-08-20 12:05:17 +00:00
|
|
|
dst = (struct sockaddr *)&sin;
|
|
|
|
la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
|
2015-08-13 13:38:09 +00:00
|
|
|
if (la != NULL)
|
|
|
|
arp_check_update_lle(ah, isaddr, ifp, bridged, la);
|
|
|
|
else if (itaddr.s_addr == myaddr.s_addr) {
|
2012-07-31 11:31:12 +00:00
|
|
|
/*
|
2015-12-16 09:16:06 +00:00
|
|
|
* Request/reply to our address, but no lle exists yet.
|
2015-12-31 05:03:27 +00:00
|
|
|
* Calculate full link prepend to use in lle.
|
2010-11-12 22:03:02 +00:00
|
|
|
*/
|
2015-12-31 05:03:27 +00:00
|
|
|
linkhdrsize = sizeof(linkhdr);
|
|
|
|
if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
|
|
|
|
&linkhdrsize, &lladdr_off) != 0)
|
|
|
|
goto reply;
|
|
|
|
|
|
|
|
/* Allocate new entry */
|
2015-08-20 12:05:17 +00:00
|
|
|
la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
|
2015-12-16 09:16:06 +00:00
|
|
|
if (la == NULL) {
|
|
|
|
/*
|
|
|
|
* lle creation may fail if source address belongs
|
|
|
|
* to non-directly connected subnet. However, we
|
|
|
|
* will try to answer the request instead of dropping
|
|
|
|
* frame.
|
|
|
|
*/
|
|
|
|
goto reply;
|
|
|
|
}
|
2015-12-31 05:03:27 +00:00
|
|
|
lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
|
|
|
|
lladdr_off);
|
2015-08-20 12:05:17 +00:00
|
|
|
|
2015-08-13 13:38:09 +00:00
|
|
|
IF_AFDATA_WLOCK(ifp);
|
2015-08-20 12:05:17 +00:00
|
|
|
LLE_WLOCK(la);
|
|
|
|
la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if lle still does not exists.
|
|
|
|
* If it does, that means that we either
|
|
|
|
* 1) have configured it explicitly, via
|
|
|
|
* 1a) 'arp -s' static entry or
|
|
|
|
* 1b) interface address static record
|
|
|
|
* or
|
|
|
|
* 2) it was the result of sending first packet to-host
|
|
|
|
* or
|
|
|
|
* 3) it was another arp reply packet we handled in
|
|
|
|
* different thread.
|
|
|
|
*
|
|
|
|
* In all cases except 3) we definitely need to prefer
|
|
|
|
* existing lle. For the sake of simplicity, prefer any
|
|
|
|
* existing lle over newly-create one.
|
|
|
|
*/
|
|
|
|
if (la_tmp == NULL)
|
|
|
|
lltable_link_entry(LLTABLE(ifp), la);
|
2015-08-13 13:38:09 +00:00
|
|
|
IF_AFDATA_WUNLOCK(ifp);
|
2015-08-20 12:05:17 +00:00
|
|
|
|
|
|
|
if (la_tmp == NULL) {
|
2015-08-19 21:08:42 +00:00
|
|
|
arp_mark_lle_reachable(la);
|
|
|
|
LLE_WUNLOCK(la);
|
2015-08-20 12:05:17 +00:00
|
|
|
} else {
|
|
|
|
/* Free newly-create entry and handle packet */
|
|
|
|
lltable_free_entry(LLTABLE(ifp), la);
|
|
|
|
la = la_tmp;
|
|
|
|
la_tmp = NULL;
|
|
|
|
arp_check_update_lle(ah, isaddr, ifp, bridged, la);
|
|
|
|
/* arp_check_update_lle() returns @la unlocked */
|
2015-08-19 21:08:42 +00:00
|
|
|
}
|
2015-08-20 12:05:17 +00:00
|
|
|
la = NULL;
|
2011-06-18 13:54:36 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
reply:
|
2004-04-25 15:00:17 +00:00
|
|
|
if (op != ARPOP_REQUEST)
|
|
|
|
goto drop;
|
2009-09-03 21:10:57 +00:00
|
|
|
ARPSTAT_INC(rxrequests);
|
The main goals of this project are:
1. separating L2 tables (ARP, NDP) from the L3 routing tables
2. removing as many locking dependencies among these layers as
possible to allow for some parallelism in the search operations
3. simplifying the logic in the routing code.
The most notable end result is the obsolescence of the route
cloning (RTF_CLONING) concept, which translated into code reduction
in both IPv4 ARP and IPv6 NDP related modules, and size reduction in
struct rtentry{}. The change in design obsoletes the semantics of
RTF_CLONING, RTF_WASCLONE and RTF_LLINFO routing flags. The userland
applications such as "arp" and "ndp" have been modified to reflect
those changes. The output from "netstat -r" shows only the routing
entries.
Quite a few developers have contributed to this project in the
past: Glebius Smirnoff, Luigi Rizzo, Alessandro Cerri, and
Andre Oppermann. And most recently:
- Kip Macy revised the locking code completely, thus completing
the last piece of the puzzle, Kip has also been conducting
active functional testing
- Sam Leffler has helped me improving/refactoring the code, and
provided valuable reviews
- Julian Elischer setup the perforce tree for me and has helped
me maintaining that branch before the svn conversion
2008-12-15 06:10:57 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
if (itaddr.s_addr == myaddr.s_addr) {
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extent that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
/* Shortcut.. the receiving interface is the target. */
|
2001-10-14 20:17:53 +00:00
|
|
|
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
|
2005-02-22 13:04:05 +00:00
|
|
|
(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
2020-04-02 20:06:37 +00:00
|
|
|
/*
|
|
|
|
* Destination address is not ours. Check if
|
|
|
|
* proxyarp entry exists or proxyarp is turned on globally.
|
|
|
|
*/
|
|
|
|
struct llentry *lle;
|
2008-12-19 11:07:34 +00:00
|
|
|
|
|
|
|
sin.sin_addr = itaddr;
|
2009-09-15 18:39:27 +00:00
|
|
|
lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
|
2008-12-19 11:07:34 +00:00
|
|
|
|
2009-09-15 18:39:27 +00:00
|
|
|
if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
|
2008-12-19 11:07:34 +00:00
|
|
|
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
|
2015-12-31 05:03:27 +00:00
|
|
|
(void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
|
2008-12-19 11:07:34 +00:00
|
|
|
LLE_RUNLOCK(lle);
|
2009-09-15 18:39:27 +00:00
|
|
|
} else {
|
|
|
|
if (lle != NULL)
|
|
|
|
LLE_RUNLOCK(lle);
|
|
|
|
|
|
|
|
if (!V_arp_proxyall)
|
|
|
|
goto drop;
|
2012-08-01 09:00:26 +00:00
|
|
|
|
2020-07-02 21:04:08 +00:00
|
|
|
NET_EPOCH_ASSERT();
|
|
|
|
nh = fib4_lookup(ifp->if_fib, itaddr, 0, 0, 0);
|
|
|
|
if (nh == NULL)
|
2009-09-15 18:39:27 +00:00
|
|
|
goto drop;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't send proxies for nodes on the same interface
|
|
|
|
* as this one came out of, or we'll get into a fight
|
|
|
|
* over who claims what Ether address.
|
|
|
|
*/
|
2020-07-02 21:04:08 +00:00
|
|
|
if (nh->nh_ifp == ifp)
|
2009-09-15 18:39:27 +00:00
|
|
|
goto drop;
|
2008-12-19 11:07:34 +00:00
|
|
|
|
2009-09-15 18:39:27 +00:00
|
|
|
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
|
|
|
|
(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Also check that the node which sent the ARP packet
|
2011-02-21 09:01:34 +00:00
|
|
|
* is on the interface we expect it to be on. This
|
2009-09-15 18:39:27 +00:00
|
|
|
* avoids ARP chaos if an interface is connected to the
|
|
|
|
* wrong network.
|
|
|
|
*/
|
2012-08-01 09:00:26 +00:00
|
|
|
|
2020-07-02 21:04:08 +00:00
|
|
|
nh = fib4_lookup(ifp->if_fib, isaddr, 0, 0, 0);
|
|
|
|
if (nh == NULL)
|
2009-09-15 18:39:27 +00:00
|
|
|
goto drop;
|
2020-07-02 21:04:08 +00:00
|
|
|
if (nh->nh_ifp != ifp) {
|
2013-05-11 10:51:32 +00:00
|
|
|
ARP_LOG(LOG_INFO, "proxy: ignoring request"
|
2015-12-09 11:14:27 +00:00
|
|
|
" from %s via %s\n",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(isaddr, addrbuf),
|
|
|
|
ifp->if_xname);
|
2009-09-15 18:39:27 +00:00
|
|
|
goto drop;
|
|
|
|
}
|
2000-07-13 19:31:01 +00:00
|
|
|
|
1994-11-02 00:58:29 +00:00
|
|
|
#ifdef DEBUG_PROXY
|
2017-02-16 20:47:41 +00:00
|
|
|
printf("arp: proxying for %s\n",
|
|
|
|
inet_ntoa_r(itaddr, addrbuf));
|
1994-10-01 21:50:33 +00:00
|
|
|
#endif
|
2009-09-15 18:39:27 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2007-02-02 20:31:44 +00:00
|
|
|
if (itaddr.s_addr == myaddr.s_addr &&
|
|
|
|
IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
|
|
|
|
/* RFC 3927 link-local IPv4; always reply by broadcast. */
|
|
|
|
#ifdef DEBUG_LINKLOCAL
|
|
|
|
printf("arp: sending reply for link-local addr %s\n",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(itaddr, addrbuf));
|
2007-02-02 20:31:44 +00:00
|
|
|
#endif
|
|
|
|
m->m_flags |= M_BCAST;
|
|
|
|
m->m_flags &= ~M_MCAST;
|
|
|
|
} else {
|
|
|
|
/* default behaviour; never reply by broadcast. */
|
|
|
|
m->m_flags &= ~(M_BCAST|M_MCAST);
|
|
|
|
}
|
2001-10-14 20:17:53 +00:00
|
|
|
(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
|
|
|
|
(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
|
|
|
|
ah->ar_op = htons(ARPOP_REPLY);
|
|
|
|
ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
|
2012-07-31 11:31:12 +00:00
|
|
|
m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
|
|
|
|
m->m_pkthdr.len = m->m_len;
|
2011-07-04 05:47:48 +00:00
|
|
|
m->m_pkthdr.rcvif = NULL;
|
2004-03-21 06:36:05 +00:00
|
|
|
sa.sa_family = AF_ARP;
|
|
|
|
sa.sa_len = 2;
|
2015-12-31 05:03:27 +00:00
|
|
|
|
|
|
|
/* Calculate link header for sending frame */
|
|
|
|
bzero(&ro, sizeof(ro));
|
|
|
|
linkhdrsize = sizeof(linkhdr);
|
|
|
|
error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* arp_fillheader() may fail due to lack of support inside encap request
|
|
|
|
* routing. This is not necessarily an error, AF_ARP can/should be handled
|
|
|
|
* by if_output().
|
|
|
|
*/
|
|
|
|
if (error != 0 && error != EAFNOSUPPORT) {
|
|
|
|
ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
|
|
|
|
if_name(ifp), error);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ro.ro_prepend = linkhdr;
|
|
|
|
ro.ro_plen = linkhdrsize;
|
|
|
|
ro.ro_flags = 0;
|
|
|
|
|
2013-08-19 13:27:32 +00:00
|
|
|
m_clrprotoflags(m); /* Avoid confusing lower layers. */
|
2015-12-31 05:03:27 +00:00
|
|
|
(*ifp->if_output)(ifp, m, &sa, &ro);
|
2009-09-03 21:10:57 +00:00
|
|
|
ARPSTAT_INC(txreplies);
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
2004-04-25 15:00:17 +00:00
|
|
|
|
|
|
|
drop:
|
|
|
|
m_freem(m);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1998-01-08 23:42:31 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2015-08-13 13:38:09 +00:00
|
|
|
/*
|
|
|
|
* Checks received arp data against existing @la.
|
|
|
|
* Updates lle state/performs notification if necessary.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
|
|
|
|
int bridged, struct llentry *la)
|
|
|
|
{
|
|
|
|
struct sockaddr sa;
|
|
|
|
struct mbuf *m_hold, *m_hold_next;
|
2015-12-31 05:03:27 +00:00
|
|
|
uint8_t linkhdr[LLE_MAX_LINKHDR];
|
|
|
|
size_t linkhdrsize;
|
|
|
|
int lladdr_off;
|
2017-02-16 20:47:41 +00:00
|
|
|
char addrbuf[INET_ADDRSTRLEN];
|
2015-08-13 13:38:09 +00:00
|
|
|
|
|
|
|
LLE_WLOCK_ASSERT(la);
|
|
|
|
|
|
|
|
/* the following is not an error when doing bridging */
|
|
|
|
if (!bridged && la->lle_tbl->llt_ifp != ifp) {
|
|
|
|
if (log_arp_wrong_iface)
|
|
|
|
ARP_LOG(LOG_WARNING, "%s is on %s "
|
|
|
|
"but got reply from %*D on %s\n",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(isaddr, addrbuf),
|
2015-08-13 13:38:09 +00:00
|
|
|
la->lle_tbl->llt_ifp->if_xname,
|
|
|
|
ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
|
|
|
|
ifp->if_xname);
|
|
|
|
LLE_WUNLOCK(la);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if ((la->la_flags & LLE_VALID) &&
|
2015-12-31 05:03:27 +00:00
|
|
|
bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
|
2015-08-13 13:38:09 +00:00
|
|
|
if (la->la_flags & LLE_STATIC) {
|
|
|
|
LLE_WUNLOCK(la);
|
|
|
|
if (log_arp_permanent_modify)
|
|
|
|
ARP_LOG(LOG_ERR,
|
|
|
|
"%*D attempts to modify "
|
|
|
|
"permanent entry for %s on %s\n",
|
|
|
|
ifp->if_addrlen,
|
|
|
|
(u_char *)ar_sha(ah), ":",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(isaddr, addrbuf),
|
|
|
|
ifp->if_xname);
|
2015-08-13 13:38:09 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (log_arp_movements) {
|
|
|
|
ARP_LOG(LOG_INFO, "%s moved from %*D "
|
|
|
|
"to %*D on %s\n",
|
2017-02-16 20:47:41 +00:00
|
|
|
inet_ntoa_r(isaddr, addrbuf),
|
2015-08-13 13:38:09 +00:00
|
|
|
ifp->if_addrlen,
|
2017-03-11 04:57:52 +00:00
|
|
|
(u_char *)la->ll_addr, ":",
|
2015-08-13 13:38:09 +00:00
|
|
|
ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
|
|
|
|
ifp->if_xname);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-12-31 05:03:27 +00:00
|
|
|
/* Calculate full link prepend to use in lle */
|
|
|
|
linkhdrsize = sizeof(linkhdr);
|
|
|
|
if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
|
|
|
|
&linkhdrsize, &lladdr_off) != 0)
|
|
|
|
return;
|
|
|
|
|
2015-08-13 13:38:09 +00:00
|
|
|
/* Check if something has changed */
|
2015-12-31 05:03:27 +00:00
|
|
|
if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
|
2015-08-13 13:38:09 +00:00
|
|
|
(la->la_flags & LLE_VALID) == 0) {
|
2015-12-31 05:03:27 +00:00
|
|
|
/* Try to perform LLE update */
|
|
|
|
if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
|
|
|
|
lladdr_off) == 0)
|
2015-08-13 13:38:09 +00:00
|
|
|
return;
|
2015-12-05 09:50:37 +00:00
|
|
|
|
|
|
|
/* Clear fast path feedback request if set */
|
|
|
|
la->r_skip_req = 0;
|
2015-08-13 13:38:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
arp_mark_lle_reachable(la);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The packets are all freed within the call to the output
|
|
|
|
* routine.
|
|
|
|
*
|
|
|
|
* NB: The lock MUST be released before the call to the
|
|
|
|
* output routine.
|
|
|
|
*/
|
|
|
|
if (la->la_hold != NULL) {
|
|
|
|
m_hold = la->la_hold;
|
|
|
|
la->la_hold = NULL;
|
|
|
|
la->la_numheld = 0;
|
|
|
|
lltable_fill_sa_entry(la, &sa);
|
|
|
|
LLE_WUNLOCK(la);
|
|
|
|
for (; m_hold != NULL; m_hold = m_hold_next) {
|
|
|
|
m_hold_next = m_hold->m_nextpkt;
|
|
|
|
m_hold->m_nextpkt = NULL;
|
|
|
|
/* Avoid confusing lower layers. */
|
|
|
|
m_clrprotoflags(m_hold);
|
|
|
|
(*ifp->if_output)(ifp, m_hold, &sa, NULL);
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
LLE_WUNLOCK(la);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
arp_mark_lle_reachable(struct llentry *la)
|
|
|
|
{
|
2015-12-05 09:50:37 +00:00
|
|
|
int canceled, wtime;
|
2015-08-13 13:38:09 +00:00
|
|
|
|
|
|
|
LLE_WLOCK_ASSERT(la);
|
|
|
|
|
2015-12-05 09:50:37 +00:00
|
|
|
la->ln_state = ARP_LLINFO_REACHABLE;
|
2015-08-13 13:38:09 +00:00
|
|
|
EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
|
|
|
|
|
|
|
|
if (!(la->la_flags & LLE_STATIC)) {
|
|
|
|
LLE_ADDREF(la);
|
|
|
|
la->la_expire = time_uptime + V_arpt_keep;
|
2015-12-05 09:50:37 +00:00
|
|
|
wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
|
|
|
|
if (wtime < 0)
|
|
|
|
wtime = V_arpt_keep;
|
2015-08-13 13:38:09 +00:00
|
|
|
canceled = callout_reset(&la->lle_timer,
|
2015-12-05 09:50:37 +00:00
|
|
|
hz * wtime, arptimer, la);
|
2015-08-13 13:38:09 +00:00
|
|
|
if (canceled)
|
|
|
|
LLE_REMREF(la);
|
|
|
|
}
|
|
|
|
la->la_asked = 0;
|
|
|
|
la->la_preempt = V_arp_maxtries;
|
|
|
|
}
|
|
|
|
|
2015-11-09 10:35:33 +00:00
|
|
|
/*
|
2017-08-09 10:46:34 +00:00
|
|
|
* Add permanent link-layer record for given interface address.
|
2015-11-09 10:35:33 +00:00
|
|
|
*/
|
|
|
|
static __noinline void
|
|
|
|
arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
|
1994-12-22 21:56:22 +00:00
|
|
|
{
|
2015-08-20 12:05:17 +00:00
|
|
|
struct llentry *lle, *lle_tmp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Interface address LLE record is considered static
|
|
|
|
* because kernel code relies on LLE_STATIC flag to check
|
|
|
|
* if these entries can be rewriten by arp updates.
|
|
|
|
*/
|
|
|
|
lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
|
|
|
|
if (lle == NULL) {
|
|
|
|
log(LOG_INFO, "arp_ifinit: cannot create arp "
|
|
|
|
"entry for interface address\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
IF_AFDATA_WLOCK(ifp);
|
|
|
|
LLE_WLOCK(lle);
|
|
|
|
/* Unlink any entry if exists */
|
|
|
|
lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
|
|
|
|
if (lle_tmp != NULL)
|
|
|
|
lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
|
|
|
|
|
|
|
|
lltable_link_entry(LLTABLE(ifp), lle);
|
|
|
|
IF_AFDATA_WUNLOCK(ifp);
|
|
|
|
|
|
|
|
if (lle_tmp != NULL)
|
|
|
|
EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
|
|
|
|
|
|
|
|
EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
|
|
|
|
LLE_WUNLOCK(lle);
|
|
|
|
if (lle_tmp != NULL)
|
|
|
|
lltable_free_entry(LLTABLE(ifp), lle_tmp);
|
1994-12-22 21:56:22 +00:00
|
|
|
}
|
Lock down the network interface queues. The queue mutex must be obtained
before adding/removing packets from the queue. Also, the if_obytes and
if_omcasts fields should only be manipulated under protection of the mutex.
IF_ENQUEUE, IF_PREPEND, and IF_DEQUEUE perform all necessary locking on
the queue. An IF_LOCK macro is provided, as well as the old (mutex-less)
versions of the macros in the form _IF_ENQUEUE, _IF_QFULL, for code which
needs them, but their use is discouraged.
Two new macros are introduced: IF_DRAIN() to drain a queue, and IF_HANDOFF,
which takes care of locking/enqueue, and also statistics updating/start
if necessary.
2000-11-25 07:35:38 +00:00
|
|
|
|
Add GARP retransmit capability
A single gratuitous ARP (GARP) is always transmitted when an IPv4
address is added to an interface, and that is usually sufficient.
However, in some circumstances, such as when a shared address is
passed between cluster nodes, this single GARP may occasionally be
dropped or lost. This can lead to neighbors on the network link
working with a stale ARP cache and sending packets destined for
that address to the node that previously owned the address, which
may not respond.
To avoid this situation, GARP retransmissions can be enabled by setting
the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
than zero. The setting represents the maximum number of retransmissions.
The interval between retransmissions is calculated using an exponential
backoff algorithm, doubling each time, so the retransmission intervals
are: {1, 2, 4, 8, 16, ...} (seconds).
Due to the exponential backoff algorithm used for the interval
between GARP retransmissions, the maximum number of retransmissions
is limited to 16 for sanity. This limit corresponds to a maximum
interval between retransmissions of 2^16 seconds ~= 18 hours.
Increasing this limit is possible, but sending out GARPs spaced
days apart would be of little use.
Submitted by: David A. Bright <david.a.bright@dell.com>
MFC after: 1 month
Relnotes: yes
Sponsored by: Dell EMC
Differential Revision: https://reviews.freebsd.org/D7695
2016-10-02 01:42:45 +00:00
|
|
|
/*
|
|
|
|
* Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range
|
|
|
|
* of valid values.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
int rexmit_count = *(int *)arg1;
|
|
|
|
|
|
|
|
error = sysctl_handle_int(oidp, &rexmit_count, 0, req);
|
|
|
|
|
|
|
|
/* Enforce limits on any new value that may have been set. */
|
|
|
|
if (!error && req->newptr) {
|
|
|
|
/* A new value was set. */
|
|
|
|
if (rexmit_count < 0) {
|
|
|
|
rexmit_count = 0;
|
|
|
|
} else if (rexmit_count > MAX_GARP_RETRANSMITS) {
|
|
|
|
rexmit_count = MAX_GARP_RETRANSMITS;
|
|
|
|
}
|
|
|
|
*(int *)arg1 = rexmit_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to
|
|
|
|
* retransmit it again. A pending callout owns a reference to the ifa.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
garp_rexmit(void *arg)
|
|
|
|
{
|
|
|
|
struct in_ifaddr *ia = arg;
|
|
|
|
|
|
|
|
if (callout_pending(&ia->ia_garp_timer) ||
|
|
|
|
!callout_active(&ia->ia_garp_timer)) {
|
|
|
|
IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
|
|
|
|
ifa_free(&ia->ia_ifa);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-02-12 21:22:57 +00:00
|
|
|
CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet);
|
|
|
|
|
Add GARP retransmit capability
A single gratuitous ARP (GARP) is always transmitted when an IPv4
address is added to an interface, and that is usually sufficient.
However, in some circumstances, such as when a shared address is
passed between cluster nodes, this single GARP may occasionally be
dropped or lost. This can lead to neighbors on the network link
working with a stale ARP cache and sending packets destined for
that address to the node that previously owned the address, which
may not respond.
To avoid this situation, GARP retransmissions can be enabled by setting
the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
than zero. The setting represents the maximum number of retransmissions.
The interval between retransmissions is calculated using an exponential
backoff algorithm, doubling each time, so the retransmission intervals
are: {1, 2, 4, 8, 16, ...} (seconds).
Due to the exponential backoff algorithm used for the interval
between GARP retransmissions, the maximum number of retransmissions
is limited to 16 for sanity. This limit corresponds to a maximum
interval between retransmissions of 2^16 seconds ~= 18 hours.
Increasing this limit is possible, but sending out GARPs spaced
days apart would be of little use.
Submitted by: David A. Bright <david.a.bright@dell.com>
MFC after: 1 month
Relnotes: yes
Sponsored by: Dell EMC
Differential Revision: https://reviews.freebsd.org/D7695
2016-10-02 01:42:45 +00:00
|
|
|
/*
|
|
|
|
* Drop lock while the ARP request is generated.
|
|
|
|
*/
|
|
|
|
IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
|
|
|
|
|
|
|
|
arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr,
|
|
|
|
&IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Increment the count of retransmissions. If the count has reached the
|
|
|
|
* maximum value, stop sending the GARP packets. Otherwise, schedule
|
|
|
|
* the callout to retransmit another GARP packet.
|
|
|
|
*/
|
|
|
|
++ia->ia_garp_count;
|
|
|
|
if (ia->ia_garp_count >= garp_rexmit_count) {
|
|
|
|
ifa_free(&ia->ia_ifa);
|
|
|
|
} else {
|
|
|
|
int rescheduled;
|
|
|
|
IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
|
|
|
|
rescheduled = callout_reset(&ia->ia_garp_timer,
|
|
|
|
(1 << ia->ia_garp_count) * hz,
|
|
|
|
garp_rexmit, ia);
|
|
|
|
IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
|
|
|
|
if (rescheduled) {
|
|
|
|
ifa_free(&ia->ia_ifa);
|
|
|
|
}
|
|
|
|
}
|
2019-02-12 21:22:57 +00:00
|
|
|
|
|
|
|
CURVNET_RESTORE();
|
Add GARP retransmit capability
A single gratuitous ARP (GARP) is always transmitted when an IPv4
address is added to an interface, and that is usually sufficient.
However, in some circumstances, such as when a shared address is
passed between cluster nodes, this single GARP may occasionally be
dropped or lost. This can lead to neighbors on the network link
working with a stale ARP cache and sending packets destined for
that address to the node that previously owned the address, which
may not respond.
To avoid this situation, GARP retransmissions can be enabled by setting
the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
than zero. The setting represents the maximum number of retransmissions.
The interval between retransmissions is calculated using an exponential
backoff algorithm, doubling each time, so the retransmission intervals
are: {1, 2, 4, 8, 16, ...} (seconds).
Due to the exponential backoff algorithm used for the interval
between GARP retransmissions, the maximum number of retransmissions
is limited to 16 for sanity. This limit corresponds to a maximum
interval between retransmissions of 2^16 seconds ~= 18 hours.
Increasing this limit is possible, but sending out GARPs spaced
days apart would be of little use.
Submitted by: David A. Bright <david.a.bright@dell.com>
MFC after: 1 month
Relnotes: yes
Sponsored by: Dell EMC
Differential Revision: https://reviews.freebsd.org/D7695
2016-10-02 01:42:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start the GARP retransmit timer.
|
|
|
|
*
|
|
|
|
* A single GARP is always transmitted when an IPv4 address is added
|
|
|
|
* to an interface and that is usually sufficient. However, in some
|
|
|
|
* circumstances, such as when a shared address is passed between
|
|
|
|
* cluster nodes, this single GARP may occasionally be dropped or
|
|
|
|
* lost. This can lead to neighbors on the network link working with a
|
|
|
|
* stale ARP cache and sending packets destined for that address to
|
|
|
|
* the node that previously owned the address, which may not respond.
|
|
|
|
*
|
|
|
|
* To avoid this situation, GARP retransmits can be enabled by setting
|
|
|
|
* the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
|
|
|
|
* than zero. The setting represents the maximum number of
|
|
|
|
* retransmissions. The interval between retransmissions is calculated
|
|
|
|
* using an exponential backoff algorithm, doubling each time, so the
|
|
|
|
* retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds).
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
garp_timer_start(struct ifaddr *ifa)
|
|
|
|
{
|
|
|
|
struct in_ifaddr *ia = (struct in_ifaddr *) ifa;
|
|
|
|
|
|
|
|
IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
|
|
|
|
ia->ia_garp_count = 0;
|
|
|
|
if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz,
|
|
|
|
garp_rexmit, ia) == 0) {
|
|
|
|
ifa_ref(ifa);
|
|
|
|
}
|
|
|
|
IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
|
|
|
|
}
|
|
|
|
|
2005-02-22 13:04:05 +00:00
|
|
|
void
|
2015-11-09 10:35:33 +00:00
|
|
|
arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
|
2005-02-22 13:04:05 +00:00
|
|
|
{
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
struct epoch_tracker et;
|
2015-11-09 10:35:33 +00:00
|
|
|
const struct sockaddr_in *dst_in;
|
|
|
|
const struct sockaddr *dst;
|
|
|
|
|
|
|
|
if (ifa->ifa_carp != NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
dst = ifa->ifa_addr;
|
|
|
|
dst_in = (const struct sockaddr_in *)dst;
|
|
|
|
|
|
|
|
if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
|
|
|
|
return;
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_ENTER(et);
|
2015-12-17 14:41:30 +00:00
|
|
|
arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
|
Widen NET_EPOCH coverage.
When epoch(9) was introduced to network stack, it was basically
dropped in place of existing locking, which was mutexes and
rwlocks. For the sake of performance mutex covered areas were
as small as possible, so became epoch covered areas.
However, epoch doesn't introduce any contention, it just delays
memory reclaim. So, there is no point to minimise epoch covered
areas in sense of performance. Meanwhile entering/exiting epoch
also has non-zero CPU usage, so doing this less often is a win.
Not the least is also code maintainability. In the new paradigm
we can assume that at any stage of processing a packet, we are
inside network epoch. This makes coding both input and output
path way easier.
On output path we already enter epoch quite early - in the
ip_output(), in the ip6_output().
This patch does the same for the input path. All ISR processing,
network related callouts, other ways of packet injection to the
network stack shall be performed in net_epoch. Any leaf function
that walks network configuration now asserts epoch.
Tricky part is configuration code paths - ioctls, sysctls. They
also call into leaf functions, so some need to be changed.
This patch would introduce more epoch recursions (see EPOCH_TRACE)
than we had before. They will be cleaned up separately, as several
of them aren't trivial. Note, that unlike a lock recursion the
epoch recursion is safe and just wastes a bit of resources.
Reviewed by: gallatin, hselasky, cy, adrian, kristof
Differential Revision: https://reviews.freebsd.org/D19111
2019-10-07 22:40:05 +00:00
|
|
|
NET_EPOCH_EXIT(et);
|
Add GARP retransmit capability
A single gratuitous ARP (GARP) is always transmitted when an IPv4
address is added to an interface, and that is usually sufficient.
However, in some circumstances, such as when a shared address is
passed between cluster nodes, this single GARP may occasionally be
dropped or lost. This can lead to neighbors on the network link
working with a stale ARP cache and sending packets destined for
that address to the node that previously owned the address, which
may not respond.
To avoid this situation, GARP retransmissions can be enabled by setting
the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
than zero. The setting represents the maximum number of retransmissions.
The interval between retransmissions is calculated using an exponential
backoff algorithm, doubling each time, so the retransmission intervals
are: {1, 2, 4, 8, 16, ...} (seconds).
Due to the exponential backoff algorithm used for the interval
between GARP retransmissions, the maximum number of retransmissions
is limited to 16 for sanity. This limit corresponds to a maximum
interval between retransmissions of 2^16 seconds ~= 18 hours.
Increasing this limit is possible, but sending out GARPs spaced
days apart would be of little use.
Submitted by: David A. Bright <david.a.bright@dell.com>
MFC after: 1 month
Relnotes: yes
Sponsored by: Dell EMC
Differential Revision: https://reviews.freebsd.org/D7695
2016-10-02 01:42:45 +00:00
|
|
|
if (garp_rexmit_count > 0) {
|
|
|
|
garp_timer_start(ifa);
|
|
|
|
}
|
2015-11-09 10:35:33 +00:00
|
|
|
|
|
|
|
arp_add_ifa_lle(ifp, dst);
|
|
|
|
}
|
|
|
|
|
2015-12-17 14:41:30 +00:00
|
|
|
void
|
|
|
|
arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
|
2015-11-09 10:35:33 +00:00
|
|
|
{
|
|
|
|
|
2015-12-17 14:41:30 +00:00
|
|
|
if (ntohl(addr.s_addr) != INADDR_ANY)
|
|
|
|
arprequest(ifp, &addr, &addr, enaddr);
|
2005-02-22 13:04:05 +00:00
|
|
|
}
|
|
|
|
|
2015-11-09 10:11:14 +00:00
|
|
|
/*
|
2015-12-17 14:41:30 +00:00
|
|
|
* Sends gratuitous ARPs for each ifaddr to notify other
|
|
|
|
* nodes about the address change.
|
2015-11-09 10:11:14 +00:00
|
|
|
*/
|
2015-12-17 14:41:30 +00:00
|
|
|
static __noinline void
|
|
|
|
arp_handle_ifllchange(struct ifnet *ifp)
|
2015-11-09 10:11:14 +00:00
|
|
|
{
|
|
|
|
struct ifaddr *ifa;
|
2015-12-15 16:02:11 +00:00
|
|
|
|
2018-05-18 20:13:34 +00:00
|
|
|
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
|
2015-12-17 14:41:30 +00:00
|
|
|
if (ifa->ifa_addr->sa_family == AF_INET)
|
|
|
|
arp_ifinit(ifp, ifa);
|
2015-11-09 10:11:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-12-17 14:41:30 +00:00
|
|
|
* A handler for interface link layer address change event.
|
2015-11-09 10:11:14 +00:00
|
|
|
*/
|
2015-12-31 05:03:27 +00:00
|
|
|
static void
|
2015-11-09 10:11:14 +00:00
|
|
|
arp_iflladdr(void *arg __unused, struct ifnet *ifp)
|
|
|
|
{
|
|
|
|
|
2015-12-31 05:03:27 +00:00
|
|
|
lltable_update_ifaddr(LLTABLE(ifp));
|
|
|
|
|
2015-12-17 14:41:30 +00:00
|
|
|
if ((ifp->if_flags & IFF_UP) != 0)
|
|
|
|
arp_handle_ifllchange(ifp);
|
2015-11-09 10:11:14 +00:00
|
|
|
}
|
|
|
|
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
static void
|
2016-06-03 13:57:10 +00:00
|
|
|
vnet_arp_init(void)
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
{
|
|
|
|
|
2016-06-03 13:57:10 +00:00
|
|
|
if (IS_DEFAULT_VNET(curvnet)) {
|
|
|
|
netisr_register(&arp_nh);
|
2015-11-09 10:11:14 +00:00
|
|
|
iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
|
|
|
|
arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
|
2016-06-03 13:57:10 +00:00
|
|
|
}
|
|
|
|
#ifdef VIMAGE
|
|
|
|
else
|
|
|
|
netisr_register_vnet(&arp_nh);
|
|
|
|
#endif
|
Lock down the network interface queues. The queue mutex must be obtained
before adding/removing packets from the queue. Also, the if_obytes and
if_omcasts fields should only be manipulated under protection of the mutex.
IF_ENQUEUE, IF_PREPEND, and IF_DEQUEUE perform all necessary locking on
the queue. An IF_LOCK macro is provided, as well as the old (mutex-less)
versions of the macros in the form _IF_ENQUEUE, _IF_QFULL, for code which
needs them, but their use is discouraged.
Two new macros are introduced: IF_DRAIN() to drain a queue, and IF_HANDOFF,
which takes care of locking/enqueue, and also statistics updating/start
if necessary.
2000-11-25 07:35:38 +00:00
|
|
|
}
|
2016-06-03 13:57:10 +00:00
|
|
|
VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
|
|
|
|
vnet_arp_init, 0);
|
|
|
|
|
|
|
|
#ifdef VIMAGE
/*
 * Per-vnet ARP teardown: unregister the ARP netisr handler for the vnet.
 *
 * ARP must be unregistered together with IP; otherwise INADDR_HASH
 * lookups could still be performed after the hash has been destroyed.
 * Ideally this would run at SI_ORDER_3.5.
 */
static void
vnet_arp_destroy(__unused void *arg)
{

	netisr_unregister_vnet(&arp_nh);
}
VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
    vnet_arp_destroy, NULL);
#endif
|