freebsd-dev/sys/net/if_llatbl.c

1120 lines
25 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
2008-12-15 06:53:09 +00:00
* Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved.
* Copyright (c) 2004-2008 Qing Li. All rights reserved.
* Copyright (c) 2008 Kip Macy. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
2008-12-15 06:53:09 +00:00
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
2008-12-15 06:53:09 +00:00
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
2008-12-15 06:53:09 +00:00
#include <vm/uma.h>
#include <netinet/in.h>
#include <net/if_llatbl.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_debug.h>
#include <net/vnet.h>
2008-12-15 06:53:09 +00:00
#include <netinet/if_ether.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables");
VNET_DEFINE_STATIC(SLIST_HEAD(, lltable), lltables) =
SLIST_HEAD_INITIALIZER(lltables);
#define V_lltables VNET(lltables)
2008-12-15 06:53:09 +00:00
static struct rwlock lltable_list_lock;
RW_SYSINIT(lltable_list_lock, &lltable_list_lock, "lltable_list_lock");
#define LLTABLE_LIST_RLOCK() rw_rlock(&lltable_list_lock)
#define LLTABLE_LIST_RUNLOCK() rw_runlock(&lltable_list_lock)
#define LLTABLE_LIST_WLOCK() rw_wlock(&lltable_list_lock)
#define LLTABLE_LIST_WUNLOCK() rw_wunlock(&lltable_list_lock)
#define LLTABLE_LIST_LOCK_ASSERT() rw_assert(&lltable_list_lock, RA_LOCKED)
static void lltable_unlink(struct lltable *llt);
static void llentries_unlink(struct lltable *llt, struct llentries *head);
/*
* Dump lle state for a specific address family.
*/
static int
lltable_dump_af(struct lltable *llt, struct sysctl_req *wr)
{
struct epoch_tracker et;
int error;
LLTABLE_LIST_LOCK_ASSERT();
if (llt->llt_ifp->if_flags & IFF_LOOPBACK)
return (0);
error = 0;
NET_EPOCH_ENTER(et);
error = lltable_foreach_lle(llt,
(llt_foreach_cb_t *)llt->llt_dump_entry, wr);
NET_EPOCH_EXIT(et);
return (error);
}
2008-12-15 06:53:09 +00:00
/*
* Dump arp state for a specific address family.
*/
int
lltable_sysctl_dumparp(int af, struct sysctl_req *wr)
{
struct lltable *llt;
int error = 0;
LLTABLE_LIST_RLOCK();
SLIST_FOREACH(llt, &V_lltables, llt_link) {
2008-12-15 06:53:09 +00:00
if (llt->llt_af == af) {
error = lltable_dump_af(llt, wr);
2008-12-15 06:53:09 +00:00
if (error != 0)
goto done;
}
}
done:
LLTABLE_LIST_RUNLOCK();
2008-12-15 06:53:09 +00:00
return (error);
}
/*
* Common function helpers for chained hash table.
*/
/*
* Runs specified callback for each entry in @llt.
* Caller does the locking.
*
*/
static int
htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg)
{
struct llentry *lle, *next;
int i, error;
error = 0;
for (i = 0; i < llt->llt_hsize; i++) {
CK_LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
error = f(llt, lle, farg);
if (error != 0)
break;
}
}
return (error);
}
/*
* The htable_[un]link_entry() functions return:
* 0 if the entry was (un)linked already and nothing changed,
* 1 if the entry was added/removed to/from the table, and
* -1 on error (e.g., not being able to add the entry due to limits reached).
* While the "unlink" operation should never error, callers of
* lltable_link_entry() need to check for errors and handle them.
*/
static int
htable_link_entry(struct lltable *llt, struct llentry *lle)
{
struct llentries *lleh;
uint32_t hashidx;
if ((lle->la_flags & LLE_LINKED) != 0)
return (0);
IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
if (llt->llt_maxentries > 0 &&
llt->llt_entries >= llt->llt_maxentries)
return (-1);
hashidx = llt->llt_hash(lle, llt->llt_hsize);
lleh = &llt->lle_head[hashidx];
lle->lle_tbl = llt;
lle->lle_head = lleh;
lle->la_flags |= LLE_LINKED;
CK_LIST_INSERT_HEAD(lleh, lle, lle_next);
llt->llt_entries++;
return (1);
}
static int
htable_unlink_entry(struct llentry *lle)
{
struct lltable *llt;
if ((lle->la_flags & LLE_LINKED) == 0)
return (0);
llt = lle->lle_tbl;
IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
KASSERT(llt->llt_entries > 0, ("%s: lltable %p (%s) entries %d <= 0",
__func__, llt, if_name(llt->llt_ifp), llt->llt_entries));
CK_LIST_REMOVE(lle, lle_next);
lle->la_flags &= ~(LLE_VALID | LLE_LINKED);
#if 0
lle->lle_tbl = NULL;
lle->lle_head = NULL;
#endif
llt->llt_entries--;
return (1);
}
struct prefix_match_data {
const struct sockaddr *addr;
const struct sockaddr *mask;
struct llentries dchain;
u_int flags;
};
static int
htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg)
{
struct prefix_match_data *pmd;
pmd = (struct prefix_match_data *)farg;
if (llt->llt_match_prefix(pmd->addr, pmd->mask, pmd->flags, lle)) {
LLE_WLOCK(lle);
CK_LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain);
}
return (0);
}
static void
htable_prefix_free(struct lltable *llt, const struct sockaddr *addr,
const struct sockaddr *mask, u_int flags)
{
struct llentry *lle, *next;
struct prefix_match_data pmd;
bzero(&pmd, sizeof(pmd));
pmd.addr = addr;
pmd.mask = mask;
pmd.flags = flags;
CK_LIST_INIT(&pmd.dchain);
IF_AFDATA_WLOCK(llt->llt_ifp);
/* Push matching lles to chain */
lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd);
llentries_unlink(llt, &pmd.dchain);
IF_AFDATA_WUNLOCK(llt->llt_ifp);
CK_LIST_FOREACH_SAFE(lle, &pmd.dchain, lle_chain, next)
lltable_free_entry(llt, lle);
}
static void
htable_free_tbl(struct lltable *llt)
{
free(llt->lle_head, M_LLTABLE);
free(llt, M_LLTABLE);
}
static void
llentries_unlink(struct lltable *llt, struct llentries *head)
{
struct llentry *lle, *next;
CK_LIST_FOREACH_SAFE(lle, head, lle_chain, next)
llt->llt_unlink_entry(lle);
}
/*
* Helper function used to drop all mbufs in hold queue.
*
* Returns the number of held packets, if any, that were dropped.
2008-12-15 06:53:09 +00:00
*/
size_t
lltable_drop_entry_queue(struct llentry *lle)
2008-12-15 06:53:09 +00:00
{
size_t pkts_dropped;
struct mbuf *next;
2008-12-15 06:53:09 +00:00
LLE_WLOCK_ASSERT(lle);
pkts_dropped = 0;
while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) {
next = lle->la_hold->m_nextpkt;
2008-12-15 06:53:09 +00:00
m_freem(lle->la_hold);
lle->la_hold = next;
lle->la_numheld--;
pkts_dropped++;
}
KASSERT(lle->la_numheld == 0,
("%s: la_numheld %d > 0, pkts_droped %zd", __func__,
lle->la_numheld, pkts_dropped));
2008-12-15 06:53:09 +00:00
return (pkts_dropped);
}
2015-11-07 11:12:00 +00:00
void
lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
const char *linkhdr, size_t linkhdrsize, int lladdr_off)
2015-11-07 11:12:00 +00:00
{
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
lle->r_hdrlen = linkhdrsize;
lle->ll_addr = &lle->r_linkdata[lladdr_off];
2015-11-07 11:12:00 +00:00
lle->la_flags |= LLE_VALID;
Remove LLE read lock from IPv4 fast path. LLE structure is mostly unchanged during its lifecycle. To be more specific, there are 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we re-send arp request to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: 1) introduce special r_skip_req field which is read lockless by fast path, but updated under (new) req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. 2) introduce simple state machine: incomplete->reachable<->verify->deleted. Before that we implicitely had incomplete->reachable->deleted state machine, with V_arpt_keep between "reachable" and "deleted". Verification was performed in runtime 5 seconds before V_arpt_keep expire. This is changed to "change state to verify 5 seconds before V_arpt_keep, set r_skip_req to non-zero value and check it every second". If the value is zero - then send arp verification probe. These changes do not introduce any signifficant control plane overhead: typically lle callout timer would fire 1 time more each V_arpt_keep (1200s) for used lles and up to arp_maxtries (5) for dead lles. As a result, all packets towards "reachable" lle are handled by fast path without acquiring lle read lock. Additional "req_mutex" is needed because callout / arpresolve_slow() or eventhandler might keep LLE lock for signifficant amount of time, which might not be feasible for fast path locking (e.g. having rmlock as ether AFDATA or lltable own lock). Differential Revision: https://reviews.freebsd.org/D3688
2015-12-05 09:50:37 +00:00
lle->r_flags |= RLLE_VALID;
2015-11-07 11:12:00 +00:00
}
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
/*
* Acquires lltable write lock.
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
*
* Returns true on success, with both lltable and lle lock held.
* On failure, false is returned and lle wlock is still held.
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
*/
bool
lltable_acquire_wlock(struct ifnet *ifp, struct llentry *lle)
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
{
NET_EPOCH_ASSERT();
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
/* Perform real LLE update */
/* use afdata WLOCK to update fields */
LLE_WUNLOCK(lle);
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(lle);
/*
* Since we droppped LLE lock, other thread might have deleted
* this lle. Check and return
*/
if ((lle->la_flags & LLE_DELETED) != 0) {
IF_AFDATA_WUNLOCK(ifp);
return (false);
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
}
return (true);
}
/*
* Tries to update @lle link-level address.
* Since update requires AFDATA WLOCK, function
* drops @lle lock, acquires AFDATA lock and then acquires
* @lle lock to maintain lock order.
*
* Returns 1 on success.
*/
int
lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
const char *linkhdr, size_t linkhdrsize, int lladdr_off)
{
if (!lltable_acquire_wlock(ifp, lle))
return (0);
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
/* Update data */
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off);
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
IF_AFDATA_WUNLOCK(ifp);
return (1);
}
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
/*
* Helper function used to pre-compute full/partial link-layer
* header data suitable for feeding into if_output().
*/
int
lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr,
char *buf, size_t *bufsize, int *lladdr_off)
{
struct if_encap_req ereq;
int error;
bzero(buf, *bufsize);
bzero(&ereq, sizeof(ereq));
ereq.buf = buf;
ereq.bufsize = *bufsize;
ereq.rtype = IFENCAP_LL;
ereq.family = family;
ereq.lladdr = lladdr;
ereq.lladdr_len = ifp->if_addrlen;
error = ifp->if_requestencap(ifp, &ereq);
if (error == 0) {
*bufsize = ereq.bufsize;
*lladdr_off = ereq.lladdr_off;
}
return (error);
}
/*
* Searches for the child entry matching @family inside @lle.
* Returns the entry or NULL.
*/
struct llentry *
llentry_lookup_family(struct llentry *lle, int family)
{
struct llentry *child_lle;
if (lle == NULL)
return (NULL);
CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) {
if (child_lle->r_family == family)
return (child_lle);
}
return (NULL);
}
/*
* Retrieves upper protocol family for the llentry.
* By default, all "normal" (e.g. upper_family == transport_family)
* llentries have r_family set to 0.
* Thus, use @default_family in that regard, otherwise use r_family.
*
* Returns upper protocol family
*/
int
llentry_get_upper_family(const struct llentry *lle, int default_family)
{
return (lle->r_family == 0 ? default_family : lle->r_family);
}
/*
* Prints llentry @lle data into provided buffer.
* Example: lle/inet/valid/em0/1.2.3.4
*
* Returns @buf.
*/
char *
llentry_print_buf(const struct llentry *lle, struct ifnet *ifp, int family,
char *buf, size_t bufsize)
{
char abuf[INET6_ADDRSTRLEN];
const char *valid = (lle->r_flags & RLLE_VALID) ? "valid" : "no_l2";
const char *upper_str = rib_print_family(llentry_get_upper_family(lle, family));
switch (family) {
#ifdef INET
case AF_INET:
inet_ntop(AF_INET, &lle->r_l3addr.addr4, abuf, sizeof(abuf));
snprintf(buf, bufsize, "lle/%s/%s/%s/%s", upper_str,
valid, if_name(ifp), abuf);
break;
#endif
#ifdef INET6
case AF_INET6:
inet_ntop(AF_INET6, &lle->r_l3addr.addr6, abuf, sizeof(abuf));
snprintf(buf, bufsize, "lle/%s/%s/%s/%s", upper_str,
valid, if_name(ifp), abuf);
break;
#endif
default:
snprintf(buf, bufsize, "lle/%s/%s/%s/????", upper_str,
valid, if_name(ifp));
break;
}
return (buf);
}
char *
llentry_print_buf_lltable(const struct llentry *lle, char *buf, size_t bufsize)
{
struct lltable *tbl = lle->lle_tbl;
return (llentry_print_buf(lle, lltable_get_ifp(tbl), lltable_get_af(tbl), buf, bufsize));
}
/*
* Requests feedback from the datapath.
* First packet using @lle should result in
* setting r_skip_req back to 0 and updating
* lle_hittime to the current time_uptime.
*/
void
llentry_request_feedback(struct llentry *lle)
{
struct llentry *child_lle;
LLE_REQ_LOCK(lle);
lle->r_skip_req = 1;
LLE_REQ_UNLOCK(lle);
CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) {
LLE_REQ_LOCK(child_lle);
child_lle->r_skip_req = 1;
LLE_REQ_UNLOCK(child_lle);
}
}
/*
* Updates the lle state to mark it has been used
* and record the time.
* Used by the llentry_provide_feedback() wrapper.
*/
void
llentry_mark_used(struct llentry *lle)
{
LLE_REQ_LOCK(lle);
lle->r_skip_req = 0;
lle->lle_hittime = time_uptime;
LLE_REQ_UNLOCK(lle);
}
/*
* Fetches the time when lle was used.
* Return 0 if the entry was not used, relevant time_uptime
* otherwise.
*/
static time_t
llentry_get_hittime_raw(struct llentry *lle)
{
time_t lle_hittime = 0;
LLE_REQ_LOCK(lle);
if ((lle->r_skip_req == 0) && (lle_hittime < lle->lle_hittime))
lle_hittime = lle->lle_hittime;
LLE_REQ_UNLOCK(lle);
return (lle_hittime);
}
time_t
llentry_get_hittime(struct llentry *lle)
{
time_t lle_hittime = 0;
struct llentry *child_lle;
lle_hittime = llentry_get_hittime_raw(lle);
CK_SLIST_FOREACH(child_lle, &lle->lle_children, lle_child_next) {
time_t hittime = llentry_get_hittime_raw(child_lle);
if (hittime > lle_hittime)
lle_hittime = hittime;
}
return (lle_hittime);
}
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
/*
* Update link-layer header for given @lle after
* interface lladdr was changed.
*/
static int
llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg)
{
struct ifnet *ifp;
u_char linkhdr[LLE_MAX_LINKHDR];
size_t linkhdrsize;
u_char *lladdr;
int lladdr_off;
ifp = (struct ifnet *)farg;
lladdr = lle->ll_addr;
LLE_WLOCK(lle);
if ((lle->la_flags & LLE_VALID) == 0) {
LLE_WUNLOCK(lle);
return (0);
}
if ((lle->la_flags & LLE_IFADDR) != 0)
lladdr = IF_LLADDR(ifp);
linkhdrsize = sizeof(linkhdr);
lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr, &linkhdrsize,
&lladdr_off);
memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
LLE_WUNLOCK(lle);
return (0);
}
/*
* Update all calculated headers for given @llt
*/
void
lltable_update_ifaddr(struct lltable *llt)
{
if (llt->llt_ifp->if_flags & IFF_LOOPBACK)
return;
IF_AFDATA_WLOCK(llt->llt_ifp);
lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp);
IF_AFDATA_WUNLOCK(llt->llt_ifp);
}
/*
*
* Performs generic cleanup routines and frees lle.
*
* Called for non-linked entries, with callouts and
* other AF-specific cleanups performed.
*
* @lle must be passed WLOCK'ed
*
* Returns the number of held packets, if any, that were dropped.
*/
size_t
llentry_free(struct llentry *lle)
{
size_t pkts_dropped;
LLE_WLOCK_ASSERT(lle);
KASSERT((lle->la_flags & LLE_LINKED) == 0, ("freeing linked lle"));
pkts_dropped = lltable_drop_entry_queue(lle);
/* cancel timer */
if (callout_stop(&lle->lle_timer) > 0)
LLE_REMREF(lle);
2008-12-15 06:53:09 +00:00
LLE_FREE_LOCKED(lle);
return (pkts_dropped);
2008-12-15 06:53:09 +00:00
}
/*
* Free all entries from given table and free itself.
*/
static int
lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg)
{
struct llentries *dchain;
dchain = (struct llentries *)farg;
LLE_WLOCK(lle);
CK_LIST_INSERT_HEAD(dchain, lle, lle_chain);
return (0);
}
2008-12-15 06:53:09 +00:00
/*
* Free all entries from given table and free itself.
*/
void
lltable_free(struct lltable *llt)
{
struct llentry *lle, *next;
struct llentries dchain;
2008-12-15 06:53:09 +00:00
KASSERT(llt != NULL, ("%s: llt is NULL", __func__));
lltable_unlink(llt);
2008-12-15 06:53:09 +00:00
CK_LIST_INIT(&dchain);
IF_AFDATA_WLOCK(llt->llt_ifp);
/* Push all lles to @dchain */
lltable_foreach_lle(llt, lltable_free_cb, &dchain);
llentries_unlink(llt, &dchain);
IF_AFDATA_WUNLOCK(llt->llt_ifp);
2008-12-15 06:53:09 +00:00
CK_LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) {
llentry_free(lle);
}
KASSERT(llt->llt_entries == 0, ("%s: lltable %p (%s) entires not 0: %d",
__func__, llt, llt->llt_ifp->if_xname, llt->llt_entries));
llt->llt_free_tbl(llt);
2008-12-15 06:53:09 +00:00
}
/*
* Deletes an address from given lltable.
* Used for userland interaction to remove
* individual entries. Skips entries added by OS.
*/
int
lltable_delete_addr(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
struct llentry *lle;
struct ifnet *ifp;
ifp = llt->llt_ifp;
IF_AFDATA_WLOCK(ifp);
lle = lla_lookup(llt, LLE_SF(l3addr->sa_family, LLE_EXCLUSIVE), l3addr);
if (lle == NULL) {
IF_AFDATA_WUNLOCK(ifp);
return (ENOENT);
}
if ((lle->la_flags & LLE_IFADDR) != 0 && (flags & LLE_IFADDR) == 0) {
IF_AFDATA_WUNLOCK(ifp);
LLE_WUNLOCK(lle);
return (EPERM);
}
lltable_unlink_entry(llt, lle);
IF_AFDATA_WUNLOCK(ifp);
llt->llt_delete_entry(llt, lle);
return (0);
}
void
lltable_prefix_free(int af, struct sockaddr *addr, struct sockaddr *mask,
u_int flags)
{
struct lltable *llt;
LLTABLE_LIST_RLOCK();
SLIST_FOREACH(llt, &V_lltables, llt_link) {
if (llt->llt_af != af)
continue;
llt->llt_prefix_free(llt, addr, mask, flags);
}
LLTABLE_LIST_RUNLOCK();
}
2008-12-15 06:53:09 +00:00
struct lltable *
lltable_allocate_htbl(uint32_t hsize)
2008-12-15 06:53:09 +00:00
{
struct lltable *llt;
int i;
2008-12-15 06:53:09 +00:00
llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO);
llt->llt_hsize = hsize;
llt->lle_head = malloc(sizeof(struct llentries) * hsize,
M_LLTABLE, M_WAITOK | M_ZERO);
2008-12-15 06:53:09 +00:00
for (i = 0; i < llt->llt_hsize; i++)
CK_LIST_INIT(&llt->lle_head[i]);
2008-12-15 06:53:09 +00:00
/* Set some default callbacks */
llt->llt_link_entry = htable_link_entry;
llt->llt_unlink_entry = htable_unlink_entry;
llt->llt_prefix_free = htable_prefix_free;
llt->llt_foreach_entry = htable_foreach_lle;
llt->llt_free_tbl = htable_free_tbl;
return (llt);
}
2008-12-15 06:53:09 +00:00
/*
* Links lltable to global llt list.
2008-12-15 06:53:09 +00:00
*/
void
lltable_link(struct lltable *llt)
2008-12-15 06:53:09 +00:00
{
LLTABLE_LIST_WLOCK();
SLIST_INSERT_HEAD(&V_lltables, llt, llt_link);
LLTABLE_LIST_WUNLOCK();
}
static void
lltable_unlink(struct lltable *llt)
{
LLTABLE_LIST_WLOCK();
SLIST_REMOVE(&V_lltables, llt, lltable, llt_link);
LLTABLE_LIST_WUNLOCK();
2008-12-15 06:53:09 +00:00
}
/*
* Gets interface @ifp lltable for the specified @family
*/
struct lltable *
lltable_get(struct ifnet *ifp, int family)
{
switch (family) {
#ifdef INET
case AF_INET:
return (in_lltable_get(ifp));
#endif
#ifdef INET6
case AF_INET6:
return (in6_lltable_get(ifp));
#endif
}
return (NULL);
}
/*
* External methods used by lltable consumers
*/
int
lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg)
{
return (llt->llt_foreach_entry(llt, f, farg));
}
struct llentry *
lltable_alloc_entry(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
return (llt->llt_alloc_entry(llt, flags, l3addr));
}
void
lltable_free_entry(struct lltable *llt, struct llentry *lle)
{
llt->llt_free_entry(llt, lle);
}
int
lltable_link_entry(struct lltable *llt, struct llentry *lle)
{
return (llt->llt_link_entry(llt, lle));
}
void
lltable_link_child_entry(struct llentry *lle, struct llentry *child_lle)
{
child_lle->lle_parent = lle;
child_lle->lle_tbl = lle->lle_tbl;
child_lle->la_flags |= LLE_LINKED;
CK_SLIST_INSERT_HEAD(&lle->lle_children, child_lle, lle_child_next);
}
void
lltable_unlink_child_entry(struct llentry *child_lle)
{
struct llentry *lle = child_lle->lle_parent;
child_lle->la_flags &= ~LLE_LINKED;
child_lle->lle_parent = NULL;
CK_SLIST_REMOVE(&lle->lle_children, child_lle, llentry, lle_child_next);
}
int
lltable_unlink_entry(struct lltable *llt, struct llentry *lle)
{
return (llt->llt_unlink_entry(lle));
}
void
lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
{
struct lltable *llt;
llt = lle->lle_tbl;
llt->llt_fill_sa_entry(lle, sa);
}
struct ifnet *
lltable_get_ifp(const struct lltable *llt)
{
return (llt->llt_ifp);
}
int
lltable_get_af(const struct lltable *llt)
{
return (llt->llt_af);
}
2008-12-15 06:53:09 +00:00
/*
* Called in route_output when rtm_flags contains RTF_LLDATA.
2008-12-15 06:53:09 +00:00
*/
int
lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info)
{
struct sockaddr_dl *dl =
(struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY];
struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST];
struct ifnet *ifp;
struct lltable *llt;
struct llentry *lle, *lle_tmp;
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
uint8_t linkhdr[LLE_MAX_LINKHDR];
size_t linkhdrsize;
int lladdr_off;
u_int laflags = 0;
int error;
2008-12-15 06:53:09 +00:00
if (dl == NULL || dl->sdl_family != AF_LINK)
return (EINVAL);
/* XXX: should be ntohs() */
2008-12-15 06:53:09 +00:00
ifp = ifnet_byindex(dl->sdl_index);
if (ifp == NULL) {
log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n",
__func__, dl->sdl_index);
return EINVAL;
}
llt = lltable_get(ifp, dst->sa_family);
if (llt == NULL)
return (ESRCH);
2008-12-15 06:53:09 +00:00
error = 0;
switch (rtm->rtm_type) {
case RTM_ADD:
/* Add static LLE */
laflags = 0;
if (rtm->rtm_rmx.rmx_expire == 0)
laflags = LLE_STATIC;
lle = lltable_alloc_entry(llt, laflags, dst);
if (lle == NULL)
return (ENOMEM);
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
linkhdrsize = sizeof(linkhdr);
if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl),
linkhdr, &linkhdrsize, &lladdr_off) != 0)
return (EINVAL);
lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
lladdr_off);
if ((rtm->rtm_flags & RTF_ANNOUNCE))
lle->la_flags |= LLE_PUB;
lle->la_expire = rtm->rtm_rmx.rmx_expire;
laflags = lle->la_flags;
/* Try to link new entry */
lle_tmp = NULL;
IF_AFDATA_WLOCK(ifp);
LLE_WLOCK(lle);
lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, dst);
if (lle_tmp != NULL) {
/* Check if we are trying to replace immutable entry */
if ((lle_tmp->la_flags & LLE_IFADDR) != 0) {
IF_AFDATA_WUNLOCK(ifp);
LLE_WUNLOCK(lle_tmp);
lltable_free_entry(llt, lle);
return (EPERM);
}
/* Unlink existing entry from table */
lltable_unlink_entry(llt, lle_tmp);
}
lltable_link_entry(llt, lle);
IF_AFDATA_WUNLOCK(ifp);
if (lle_tmp != NULL) {
EVENTHANDLER_INVOKE(lle_event, lle_tmp,LLENTRY_EXPIRED);
lltable_free_entry(llt, lle_tmp);
}
/*
* By invoking LLE handler here we might get
* two events on static LLE entry insertion
* in routing socket. However, since we might have
* other subscribers we need to generate this event.
*/
EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
LLE_WUNLOCK(lle);
2008-12-15 06:53:09 +00:00
#ifdef INET
/* gratuitous ARP */
if ((laflags & LLE_PUB) && dst->sa_family == AF_INET)
arprequest(ifp,
&((struct sockaddr_in *)dst)->sin_addr,
&((struct sockaddr_in *)dst)->sin_addr,
(u_char *)LLADDR(dl));
2008-12-15 06:53:09 +00:00
#endif
break;
case RTM_DELETE:
return (lltable_delete_addr(llt, 0, dst));
default:
error = EINVAL;
}
2008-12-15 06:53:09 +00:00
return (error);
}
#ifdef DDB
struct llentry_sa {
struct llentry base;
struct sockaddr l3_addr;
};
static void
llatbl_lle_show(struct llentry_sa *la)
{
struct llentry *lle;
uint8_t octet[6];
lle = &la->base;
db_printf("lle=%p\n", lle);
db_printf(" lle_next=%p\n", lle->lle_next.cle_next);
db_printf(" lle_lock=%p\n", &lle->lle_lock);
db_printf(" lle_tbl=%p\n", lle->lle_tbl);
db_printf(" lle_head=%p\n", lle->lle_head);
db_printf(" la_hold=%p\n", lle->la_hold);
db_printf(" la_numheld=%d\n", lle->la_numheld);
db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire);
db_printf(" la_flags=0x%04x\n", lle->la_flags);
db_printf(" la_asked=%u\n", lle->la_asked);
db_printf(" la_preempt=%u\n", lle->la_preempt);
db_printf(" ln_state=%d\n", lle->ln_state);
db_printf(" ln_router=%u\n", lle->ln_router);
db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick);
db_printf(" lle_refcnt=%d\n", lle->lle_refcnt);
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
bcopy(lle->ll_addr, octet, sizeof(octet));
db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n",
octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]);
db_printf(" lle_timer=%p\n", &lle->lle_timer);
switch (la->l3_addr.sa_family) {
#ifdef INET
case AF_INET:
{
struct sockaddr_in *sin;
char l3s[INET_ADDRSTRLEN];
sin = (struct sockaddr_in *)&la->l3_addr;
inet_ntoa_r(sin->sin_addr, l3s);
2012-08-01 09:00:26 +00:00
db_printf(" l3_addr=%s\n", l3s);
break;
}
#endif
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *sin6;
char l3s[INET6_ADDRSTRLEN];
sin6 = (struct sockaddr_in6 *)&la->l3_addr;
ip6_sprintf(l3s, &sin6->sin6_addr);
2012-08-01 09:00:26 +00:00
db_printf(" l3_addr=%s\n", l3s);
break;
}
#endif
default:
db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family);
break;
}
}
DB_SHOW_COMMAND(llentry, db_show_llentry)
{
if (!have_addr) {
db_printf("usage: show llentry <struct llentry *>\n");
return;
}
llatbl_lle_show((struct llentry_sa *)addr);
}
static void
llatbl_llt_show(struct lltable *llt)
{
int i;
struct llentry *lle;
db_printf("llt=%p llt_af=%d llt_ifp=%p\n",
llt, llt->llt_af, llt->llt_ifp);
for (i = 0; i < llt->llt_hsize; i++) {
CK_LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
llatbl_lle_show((struct llentry_sa *)lle);
if (db_pager_quit)
return;
}
}
}
DB_SHOW_COMMAND(lltable, db_show_lltable)
{
if (!have_addr) {
db_printf("usage: show lltable <struct lltable *>\n");
return;
}
llatbl_llt_show((struct lltable *)addr);
}
DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables)
{
VNET_ITERATOR_DECL(vnet_iter);
struct lltable *llt;
VNET_FOREACH(vnet_iter) {
CURVNET_SET_QUIET(vnet_iter);
#ifdef VIMAGE
db_printf("vnet=%p\n", curvnet);
#endif
SLIST_FOREACH(llt, &V_lltables, llt_link) {
db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n",
llt, llt->llt_af, llt->llt_ifp,
(llt->llt_ifp != NULL) ?
llt->llt_ifp->if_xname : "?");
if (have_addr && addr != 0) /* verbose */
llatbl_llt_show(llt);
if (db_pager_quit) {
CURVNET_RESTORE();
return;
}
}
CURVNET_RESTORE();
}
}
#endif