freebsd-nq/sys/net/bridge.c

1281 lines
36 KiB
C
Raw Normal View History

/*-
* Copyright (c) 1998-2002 Luigi Rizzo
*
* Work partly supported by: Cisco Systems, Inc. - NSITE lab, RTP, NC
1998-09-12 22:07:47 +00:00
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
1998-09-12 22:07:47 +00:00
*/
/*
* This code implements bridging in FreeBSD. It only acts on ethernet
* interfaces, including VLANs (others are still usable for routing).
* A FreeBSD host can implement multiple logical bridges, called
* "clusters". Each cluster is made of a set of interfaces, and
* identified by a "cluster-id" which is a number in the range 1..2^16-1.
*
* Bridging is enabled by the sysctl variable
* net.link.ether.bridge.enable
* the grouping of interfaces into clusters is done with
* net.link.ether.bridge.config
* containing a list of interfaces each optionally followed by
* a colon and the cluster it belongs to (1 is the default).
* Separators can be spaces, commas or tabs, e.g.
* net.link.ether.bridge.config="fxp0:2 fxp1:2 dc0 dc1:1"
* Optionally bridged packets can be passed through the firewall,
* this is controlled by the variable
* net.link.ether.bridge.ipfw
*
* For each cluster there is a descriptor (cluster_softc) storing
* the following data structures:
* - a hash table with the MAC address and destination interface for each
* known node. The table is indexed using a hash of the source address.
* - an array with the MAC addresses of the interfaces used in the cluster.
1998-09-12 22:07:47 +00:00
*
* Input packets are tapped near the beginning of ether_input(), and
* analysed by bridge_in(). Depending on the result, the packet
1998-09-12 22:07:47 +00:00
* can be forwarded to one or more output interfaces using bdg_forward(),
* and/or sent to the upper layer (e.g. in case of multicast).
*
* Output packets are intercepted near the end of ether_output().
* The correct destination is selected by bridge_dst_lookup(),
* and then forwarding is done by bdg_forward().
1998-09-12 22:07:47 +00:00
*
* The arp code is also modified to let a machine answer to requests
* irrespective of the port the request came from.
*
* In case of loops in the bridging topology, the bridge detects this
* event and temporarily mutes output bridging on one of the ports.
* Periodically, interfaces are unmuted by bdg_timeout().
* Muting is only implemented as a safety measure, and also as
1998-09-12 22:07:47 +00:00
* a mechanism to support a user-space implementation of the spanning
* tree algorithm.
1998-09-12 22:07:47 +00:00
*
* To build a bridging kernel, use the following option
* option BRIDGE
* and then at runtime set the sysctl variable to enable bridging.
*
* Only one interface per cluster is supposed to have addresses set (but
* there are no substantial problems if you set addresses for none or
* for more than one interface).
1998-09-12 22:07:47 +00:00
* Bridging will act before routing, but nothing prevents a machine
* from doing both (modulo bugs in the implementation...).
*
* THINGS TO REMEMBER
* - bridging is incompatible with multicast routing on the same
* machine. There is not an easy fix to this.
* - be very careful when bridging VLANs
1998-09-12 22:07:47 +00:00
* - loop detection is still not very robust.
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/protosw.h>
1998-09-12 22:07:47 +00:00
#include <sys/systm.h>
#include <sys/socket.h> /* for net/if.h */
#include <sys/ctype.h> /* string functions */
1998-09-12 22:07:47 +00:00
#include <sys/kernel.h>
2004-05-30 20:27:19 +00:00
#include <sys/module.h>
1998-09-12 22:07:47 +00:00
#include <sys/sysctl.h>
#include <net/ethernet.h>
1998-09-12 22:07:47 +00:00
#include <net/if.h>
#include <net/if_arp.h> /* for struct arpcom */
1998-09-12 22:07:47 +00:00
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/pfil.h>
1998-09-12 22:07:47 +00:00
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
1998-09-12 22:07:47 +00:00
#include <net/route.h>
#include <netinet/ip_fw.h>
1998-09-12 22:07:47 +00:00
#include <netinet/ip_dummynet.h>
#include <net/bridge.h>
/*--------------------*/
#define ETHER_ADDR_COPY(_dst,_src) bcopy(_src, _dst, ETHER_ADDR_LEN)
#define ETHER_ADDR_EQ(_a1,_a2) (bcmp(_a1, _a2, ETHER_ADDR_LEN) == 0)
/*
* For each cluster, source MAC addresses are stored into a hash
* table which locates the port they reside on.
*/
#define HASH_SIZE 8192 /* Table size, must be a power of 2 */
typedef struct hash_table { /* each entry. */
struct ifnet * name;
u_char etheraddr[ETHER_ADDR_LEN];
u_int16_t used; /* also, padding */
} bdg_hash_table ;
/*
* The hash function applied to MAC addresses. Out of the 6 bytes,
* the last ones tend to vary more. Since we are on a little endian machine,
* we have to do some gimmick...
*/
#define HASH_FN(addr) ( \
ntohs( ((u_int16_t *)addr)[1] ^ ((u_int16_t *)addr)[2] ) & (HASH_SIZE -1))
/*
* This is the data structure where local addresses are stored.
*/
struct bdg_addr {
u_char etheraddr[ETHER_ADDR_LEN];
u_int16_t _padding;
};
/*
* The configuration of each cluster includes the cluster id, a pointer to
* the hash table, and an array of local MAC addresses (of size "ports").
*/
struct cluster_softc {
u_int16_t cluster_id;
u_int16_t ports;
bdg_hash_table *ht;
struct bdg_addr *my_macs; /* local MAC addresses */
};
extern struct protosw inetsw[]; /* from netinet/ip_input.c */
static int n_clusters; /* number of clusters */
static struct cluster_softc *clusters;
static struct mtx bdg_mtx;
#define BDG_LOCK_INIT() mtx_init(&bdg_mtx, "bridge", NULL, MTX_DEF)
#define BDG_LOCK_DESTROY() mtx_destroy(&bdg_mtx)
#define BDG_LOCK() mtx_lock(&bdg_mtx)
#define BDG_UNLOCK() mtx_unlock(&bdg_mtx)
#define BDG_LOCK_ASSERT() mtx_assert(&bdg_mtx, MA_OWNED)
#define BDG_MUTED(ifp) (ifp2sc[ifp->if_index].flags & IFF_MUTE)
#define BDG_MUTE(ifp) ifp2sc[ifp->if_index].flags |= IFF_MUTE
#define BDG_CLUSTER(ifp) (ifp2sc[ifp->if_index].cluster)
#define BDG_SAMECLUSTER(ifp,src) \
(src == NULL || BDG_CLUSTER(ifp) == BDG_CLUSTER(src) )
#ifdef __i386__
#define BDG_MATCH(a,b) ( \
((u_int16_t *)(a))[2] == ((u_int16_t *)(b))[2] && \
*((u_int32_t *)(a)) == *((u_int32_t *)(b)) )
#define IS_ETHER_BROADCAST(a) ( \
*((u_int32_t *)(a)) == 0xffffffff && \
((u_int16_t *)(a))[2] == 0xffff )
#else
/* for machines that do not support unaligned access */
#define BDG_MATCH(a,b) ETHER_ADDR_EQ(a,b)
#define IS_ETHER_BROADCAST(a) ETHER_ADDR_EQ(a,"\377\377\377\377\377\377")
#endif
SYSCTL_DECL(_net_link_ether);
SYSCTL_NODE(_net_link_ether, OID_AUTO, bridge, CTLFLAG_RD, 0,
"Bridge parameters");
static char bridge_version[] = "031224";
SYSCTL_STRING(_net_link_ether_bridge, OID_AUTO, version, CTLFLAG_RD,
bridge_version, 0, "software version");
#define BRIDGE_DEBUG
#ifdef BRIDGE_DEBUG
int bridge_debug = 0;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, debug, CTLFLAG_RW, &bridge_debug,
0, "control debugging printfs");
#define DPRINTF(X) if (bridge_debug) printf X
#else
#define DPRINTF(X)
#endif
#ifdef BRIDGE_TIMING
1998-09-12 22:07:47 +00:00
/*
* For timing-related debugging, you can use the following macros.
1998-09-12 22:07:47 +00:00
* remember, rdtsc() only works on Pentium-class machines
quad_t ticks;
DDB(ticks = rdtsc();)
... interesting code ...
DDB(bdg_fw_ticks += (u_long)(rdtsc() - ticks) ; bdg_fw_count++ ;)
*
*/
#define DDB(x) x
static int bdg_fw_avg;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, fw_avg, CTLFLAG_RW,
&bdg_fw_avg, 0,"Cycle counter avg");
static int bdg_fw_ticks;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, fw_ticks, CTLFLAG_RW,
&bdg_fw_ticks, 0,"Cycle counter item");
static int bdg_fw_count;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, fw_count, CTLFLAG_RW,
&bdg_fw_count, 0,"Cycle counter count");
#else
#define DDB(x)
#endif
1998-09-12 22:07:47 +00:00
static int bdginit(void);
static void parse_bdg_cfg(void);
static struct mbuf *bdg_forward(struct mbuf *, struct ifnet *);
1998-09-12 22:07:47 +00:00
static int bdg_ipf; /* IPFilter enabled in bridge */
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, ipf, CTLFLAG_RW,
&bdg_ipf, 0,"Pass bridged pkts through IPFilter");
static int bdg_ipfw;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, ipfw, CTLFLAG_RW,
&bdg_ipfw,0,"Pass bridged pkts through firewall");
static int bdg_copy;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, copy, CTLFLAG_RW,
&bdg_copy, 0, "Force packet copy in bdg_forward");
int bdg_ipfw_drops;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, ipfw_drop,
CTLFLAG_RW, &bdg_ipfw_drops,0,"");
int bdg_ipfw_colls;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, ipfw_collisions,
CTLFLAG_RW, &bdg_ipfw_colls,0,"");
static int bdg_thru;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, packets, CTLFLAG_RW,
&bdg_thru, 0, "Packets through bridge");
static int bdg_dropped;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, dropped, CTLFLAG_RW,
&bdg_dropped, 0, "Packets dropped in bdg_forward");
static int bdg_predict;
SYSCTL_INT(_net_link_ether_bridge, OID_AUTO, predict, CTLFLAG_RW,
&bdg_predict, 0, "Correctly predicted header location");
#ifdef BRIDGE_DEBUG
static char *bdg_dst_names[] = {
"BDG_NULL ",
"BDG_BCAST ",
"BDG_MCAST ",
"BDG_LOCAL ",
"BDG_DROP ",
"BDG_UNKNOWN ",
"BDG_IN ",
"BDG_OUT ",
"BDG_FORWARD " };
#endif /* BRIDGE_DEBUG */
1998-09-12 22:07:47 +00:00
/*
* System initialization
*/
static struct bdg_stats bdg_stats ;
SYSCTL_STRUCT(_net_link_ether_bridge, OID_AUTO, stats, CTLFLAG_RD,
&bdg_stats, bdg_stats, "bridge statistics");
static struct callout bdg_callout;
/*
* Add an interface to a cluster, possibly creating a new entry in
* the cluster table. This requires reallocation of the table and
* updating pointers in ifp2sc.
*/
static struct cluster_softc *
add_cluster(u_int16_t cluster_id, struct ifnet *ifp)
{
struct cluster_softc *c = NULL;
int i;
BDG_LOCK_ASSERT();
for (i = 0; i < n_clusters ; i++)
if (clusters[i].cluster_id == cluster_id)
goto found;
/* Not found, need to reallocate */
c = malloc((1+n_clusters) * sizeof (*c), M_IFADDR, M_NOWAIT | M_ZERO);
if (c == NULL) {/* malloc failure */
printf("-- bridge: cannot add new cluster\n");
goto bad;
}
c[n_clusters].ht = (struct hash_table *)
malloc(HASH_SIZE * sizeof(struct hash_table),
M_IFADDR, M_NOWAIT | M_ZERO);
if (c[n_clusters].ht == NULL) {
printf("-- bridge: cannot allocate hash table for new cluster\n");
goto bad;
}
c[n_clusters].my_macs = (struct bdg_addr *)
malloc(BDG_MAX_PORTS * sizeof(struct bdg_addr),
M_IFADDR, M_NOWAIT | M_ZERO);
if (c[n_clusters].my_macs == NULL) {
printf("-- bridge: cannot allocate mac addr table for new cluster\n");
free(c[n_clusters].ht, M_IFADDR);
goto bad;
}
c[n_clusters].cluster_id = cluster_id;
c[n_clusters].ports = 0;
/*
* now copy old descriptors here
*/
if (n_clusters > 0) {
for (i=0; i < n_clusters; i++)
c[i] = clusters[i];
/*
* and finally update pointers in ifp2sc
*/
for (i = 0 ; i < if_index && i < BDG_MAX_PORTS; i++)
if (ifp2sc[i].cluster != NULL)
ifp2sc[i].cluster = c + (ifp2sc[i].cluster - clusters);
free(clusters, M_IFADDR);
}
clusters = c;
i = n_clusters; /* index of cluster entry */
n_clusters++;
found:
c = clusters + i; /* the right cluster ... */
ETHER_ADDR_COPY(c->my_macs[c->ports].etheraddr, IFP2AC(ifp)->ac_enaddr);
c->ports++;
return c;
bad:
if (c)
free(c, M_IFADDR);
return NULL;
}
/*
* Turn off bridging, by clearing promisc mode on the interface,
* marking the interface as unused, and clearing the name in the
* stats entry.
* Also dispose the hash tables associated with the clusters.
*/
static void
bridge_off(void)
{
struct ifnet *ifp ;
int i;
BDG_LOCK_ASSERT();
DPRINTF(("%s: n_clusters %d\n", __func__, n_clusters));
2002-12-22 05:35:03 +00:00
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &ifnet, if_link) {
struct bdg_softc *b;
if (ifp->if_index >= BDG_MAX_PORTS)
continue; /* make sure we do not go beyond the end */
b = &ifp2sc[ifp->if_index];
if ( b->flags & IFF_BDG_PROMISC ) {
ifpromisc(ifp, 0);
b->flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ;
DPRINTF(("%s: %s promisc OFF if_flags 0x%x "
"bdg_flags 0x%x\n", __func__, ifp->if_xname,
ifp->if_flags, b->flags));
2001-02-02 00:19:25 +00:00
}
b->flags &= ~(IFF_USED) ;
b->cluster = NULL;
bdg_stats.s[ifp->if_index].name[0] = '\0';
}
2002-12-22 05:35:03 +00:00
IFNET_RUNLOCK();
/* flush_tables */
for (i=0; i < n_clusters; i++) {
free(clusters[i].ht, M_IFADDR);
free(clusters[i].my_macs, M_IFADDR);
}
if (clusters != NULL)
free(clusters, M_IFADDR);
clusters = NULL;
n_clusters =0;
}
/*
* set promisc mode on the interfaces we use.
*/
static void
bridge_on(void)
{
struct ifnet *ifp ;
BDG_LOCK_ASSERT();
2002-12-22 05:35:03 +00:00
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &ifnet, if_link) {
struct bdg_softc *b = &ifp2sc[ifp->if_index];
if ( !(b->flags & IFF_USED) )
continue ;
if ( !( ifp->if_flags & IFF_UP) ) {
if_up(ifp);
}
if ( !(b->flags & IFF_BDG_PROMISC) ) {
(void) ifpromisc(ifp, 1);
b->flags |= IFF_BDG_PROMISC ;
DPRINTF(("%s: %s promisc ON if_flags 0x%x bdg_flags 0x%x\n",
__func__, ifp->if_xname, ifp->if_flags, b->flags));
}
if (b->flags & IFF_MUTE) {
DPRINTF(("%s: unmuting %s\n", __func__, ifp->if_xname));
b->flags &= ~IFF_MUTE;
}
}
2002-12-22 05:35:03 +00:00
IFNET_RUNLOCK();
}
1998-09-12 22:07:47 +00:00
static char bridge_cfg[1024]; /* NB: in BSS so initialized to zero */
/**
* reconfigure bridge.
* This is also done every time we attach or detach an interface.
* Main use is to make sure that we do not bridge on some old
* (ejected) device. So, it would be really useful to have a
* pointer to the modified device as an argument. Without it, we
* have to scan all interfaces.
*/
static void
reconfigure_bridge_locked(void)
1998-09-12 22:07:47 +00:00
{
BDG_LOCK_ASSERT();
bridge_off();
if (do_bridge) {
if (if_index >= BDG_MAX_PORTS) {
printf("-- sorry too many interfaces (%d, max is %d),"
" disabling bridging\n", if_index, BDG_MAX_PORTS);
do_bridge = 0;
return;
2001-02-02 00:19:25 +00:00
}
parse_bdg_cfg();
bridge_on();
}
}
static void
reconfigure_bridge(void)
{
BDG_LOCK();
reconfigure_bridge_locked();
BDG_UNLOCK();
}
/*
* parse the config string, set IFF_USED, name and cluster_id
* for all interfaces found.
* The config string is a list of "if[:cluster]" with
* a number of possible separators (see "sep"). In particular the
* use of the space lets you set bridge_cfg with the output from
* "ifconfig -l"
*/
static void
parse_bdg_cfg(void)
{
char *p, *beg;
int l, cluster;
static const char *sep = ", \t";
BDG_LOCK_ASSERT();
for (p = bridge_cfg; *p ; p++) {
struct ifnet *ifp;
int found = 0;
char c;
if (index(sep, *p)) /* skip separators */
continue ;
/* names are lowercase and digits */
for ( beg = p ; islower(*p) || isdigit(*p) ; p++ )
;
l = p - beg ; /* length of name string */
if (l == 0) /* invalid name */
break ;
if ( *p != ':' ) /* no ':', assume default cluster 1 */
cluster = 1 ;
else /* fetch cluster */
cluster = strtoul( p+1, &p, 10);
c = *p;
*p = '\0';
/*
* now search in interface list for a matching name
*/
2002-12-22 05:35:03 +00:00
IFNET_RLOCK(); /* could sleep XXX */
TAILQ_FOREACH(ifp, &ifnet, if_link) {
if (!strncmp(beg, ifp->if_xname, max(l, strlen(ifp->if_xname)))) {
struct bdg_softc *b = &ifp2sc[ifp->if_index];
if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) {
printf("%s is not an ethernet, continue\n", ifp->if_xname);
continue;
}
if (b->flags & IFF_USED) {
printf("%s already used, skipping\n", ifp->if_xname);
break;
}
b->cluster = add_cluster(htons(cluster), ifp);
b->flags |= IFF_USED ;
snprintf(bdg_stats.s[ifp->if_index].name,
sizeof(bdg_stats.s[ifp->if_index].name),
"%s:%d", ifp->if_xname, cluster);
DPRINTF(("%s: found %s next c %d\n", __func__,
bdg_stats.s[ifp->if_index].name, c));
found = 1;
break ;
}
}
2002-12-22 05:35:03 +00:00
IFNET_RUNLOCK();
if (!found)
printf("interface %s Not found in bridge\n", beg);
*p = c;
if (c == '\0')
break; /* no more */
}
}
/*
* handler for net.link.ether.bridge
*/
static int
sysctl_bdg(SYSCTL_HANDLER_ARGS)
{
int enable = do_bridge;
int error;
error = sysctl_handle_int(oidp, &enable, 0, req);
enable = (enable) ? 1 : 0;
BDG_LOCK();
if (enable != do_bridge) {
do_bridge = enable;
reconfigure_bridge_locked();
}
BDG_UNLOCK();
return error ;
}
SYSCTL_PROC(_net_link_ether_bridge, OID_AUTO, enable, CTLTYPE_INT|CTLFLAG_RW,
&do_bridge, 0, &sysctl_bdg, "I", "Bridging");
/*
* handler for net.link.ether.bridge_cfg
*/
static int
sysctl_bdg_cfg(SYSCTL_HANDLER_ARGS)
{
int error;
char *new_cfg;
new_cfg = malloc(sizeof(bridge_cfg), M_TEMP, M_WAITOK);
bcopy(bridge_cfg, new_cfg, sizeof(bridge_cfg));
error = sysctl_handle_string(oidp, new_cfg, oidp->oid_arg2, req);
if (error == 0) {
BDG_LOCK();
if (strcmp(new_cfg, bridge_cfg)) {
bcopy(new_cfg, bridge_cfg, sizeof(bridge_cfg));
reconfigure_bridge_locked();
}
BDG_UNLOCK();
}
free(new_cfg, M_TEMP);
return error;
1998-09-12 22:07:47 +00:00
}
SYSCTL_PROC(_net_link_ether_bridge, OID_AUTO, config, CTLTYPE_STRING|CTLFLAG_RW,
&bridge_cfg, sizeof(bridge_cfg), &sysctl_bdg_cfg, "A",
"Bridge configuration");
1998-09-12 22:07:47 +00:00
static int
sysctl_refresh(SYSCTL_HANDLER_ARGS)
{
if (req->newptr)
reconfigure_bridge();
return 0;
}
SYSCTL_PROC(_net_link_ether_bridge, OID_AUTO, refresh, CTLTYPE_INT|CTLFLAG_WR,
NULL, 0, &sysctl_refresh, "I", "iface refresh");
#ifndef BURN_BRIDGES
#define SYSCTL_OID_COMPAT(parent, nbr, name, kind, a1, a2, handler, fmt, descr)\
static struct sysctl_oid sysctl__##parent##_##name##_compat = { \
&sysctl_##parent##_children, { 0 }, \
nbr, kind, a1, a2, #name, handler, fmt, 0, descr }; \
DATA_SET(sysctl_set, sysctl__##parent##_##name##_compat)
#define SYSCTL_INT_COMPAT(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID_COMPAT(parent, nbr, name, CTLTYPE_INT|(access), \
ptr, val, sysctl_handle_int, "I", descr)
#define SYSCTL_STRUCT_COMPAT(parent, nbr, name, access, ptr, type, descr)\
SYSCTL_OID_COMPAT(parent, nbr, name, CTLTYPE_OPAQUE|(access), \
ptr, sizeof(struct type), sysctl_handle_opaque, \
"S," #type, descr)
#define SYSCTL_PROC_COMPAT(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
SYSCTL_OID_COMPAT(parent, nbr, name, (access), \
ptr, arg, handler, fmt, descr)
SYSCTL_INT_COMPAT(_net_link_ether, OID_AUTO, bridge_ipf, CTLFLAG_RW,
&bdg_ipf, 0,"Pass bridged pkts through IPFilter");
SYSCTL_INT_COMPAT(_net_link_ether, OID_AUTO, bridge_ipfw, CTLFLAG_RW,
&bdg_ipfw,0,"Pass bridged pkts through firewall");
SYSCTL_STRUCT_COMPAT(_net_link_ether, PF_BDG, bdgstats, CTLFLAG_RD,
&bdg_stats, bdg_stats, "bridge statistics");
SYSCTL_PROC_COMPAT(_net_link_ether, OID_AUTO, bridge_cfg,
CTLTYPE_STRING|CTLFLAG_RW,
&bridge_cfg, sizeof(bridge_cfg), &sysctl_bdg_cfg, "A",
"Bridge configuration");
SYSCTL_PROC_COMPAT(_net_link_ether, OID_AUTO, bridge_refresh,
CTLTYPE_INT|CTLFLAG_WR,
NULL, 0, &sysctl_refresh, "I", "iface refresh");
#endif
static int bdg_loops;
static int bdg_slowtimer = 0;
static int bdg_age_index = 0; /* index of table position to age */
1998-09-12 22:07:47 +00:00
/*
* called periodically to flush entries etc.
*/
static void
bdg_timeout(void *dummy)
{
if (do_bridge) {
int l, i;
BDG_LOCK();
1998-09-12 22:07:47 +00:00
/*
* age entries in the forwarding table.
*/
l = bdg_age_index + HASH_SIZE/4 ;
1998-09-12 22:07:47 +00:00
if (l > HASH_SIZE)
l = HASH_SIZE;
for (i = 0; i < n_clusters; i++) {
bdg_hash_table *bdg_table = clusters[i].ht;
for (; bdg_age_index < l; bdg_age_index++)
if (bdg_table[bdg_age_index].used)
bdg_table[bdg_age_index].used = 0;
else if (bdg_table[bdg_age_index].name) {
DPRINTF(("%s: flushing stale entry %d\n",
__func__, bdg_age_index));
bdg_table[bdg_age_index].name = NULL;
}
}
if (bdg_age_index >= HASH_SIZE)
bdg_age_index = 0;
1998-09-12 22:07:47 +00:00
if (--bdg_slowtimer <= 0 ) {
bdg_slowtimer = 5;
1998-09-12 22:07:47 +00:00
bridge_on(); /* we just need unmute, really */
bdg_loops = 0;
1998-09-12 22:07:47 +00:00
}
BDG_UNLOCK();
1998-09-12 22:07:47 +00:00
}
callout_reset(&bdg_callout, 2*hz, bdg_timeout, NULL);
1998-09-12 22:07:47 +00:00
}
/*
* Find the right pkt destination:
* BDG_BCAST is a broadcast
* BDG_MCAST is a multicast
* BDG_LOCAL is for a local address
* BDG_DROP must be dropped
* other ifp of the dest. interface (incl.self)
*
* We assume this is only called for interfaces for which bridging
* is enabled, i.e. BDG_USED(ifp) is true.
*/
static __inline struct ifnet *
bridge_dst_lookup(struct ether_header *eh, struct cluster_softc *c)
{
bdg_hash_table *bt; /* pointer to entry in hash table */
BDG_LOCK_ASSERT();
if (ETHER_IS_MULTICAST(eh->ether_dhost))
return IS_ETHER_BROADCAST(eh->ether_dhost) ? BDG_BCAST : BDG_MCAST;
/*
* Lookup local addresses in case one matches. We optimize
* for the common case of two interfaces.
*/
KASSERT(c->ports != 0, ("lookup with no ports!"));
switch (c->ports) {
int i;
default:
for (i = c->ports-1; i > 1; i--) {
if (ETHER_ADDR_EQ(c->my_macs[i].etheraddr, eh->ether_dhost))
return BDG_LOCAL;
}
/* fall thru... */
case 2:
if (ETHER_ADDR_EQ(c->my_macs[1].etheraddr, eh->ether_dhost))
return BDG_LOCAL;
case 1:
if (ETHER_ADDR_EQ(c->my_macs[0].etheraddr, eh->ether_dhost))
return BDG_LOCAL;
}
/*
* Look for a possible destination in table
*/
bt = &c->ht[HASH_FN(eh->ether_dhost)];
if (bt->name && ETHER_ADDR_EQ(bt->etheraddr, eh->ether_dhost))
return bt->name;
else
return BDG_UNKNOWN;
1998-09-12 22:07:47 +00:00
}
/**
1998-09-12 22:07:47 +00:00
* bridge_in() is invoked to perform bridging decision on input packets.
*
1998-09-12 22:07:47 +00:00
* On Input:
* eh Ethernet header of the incoming packet.
* ifp interface the packet is coming from.
1998-09-12 22:07:47 +00:00
*
* On Return: destination of packet, one of
* BDG_BCAST broadcast
* BDG_MCAST multicast
* BDG_LOCAL is only for a local address (do not forward)
* BDG_DROP drop the packet
* ifp ifp of the destination interface.
*
* Forwarding is not done directly to give a chance to some drivers
* to fetch more of the packet, or simply drop it completely.
*/
static struct mbuf *
bridge_in(struct ifnet *ifp, struct mbuf *m)
1998-09-12 22:07:47 +00:00
{
struct ether_header *eh;
struct ifnet *dst, *old;
bdg_hash_table *bt; /* location in hash table */
int dropit = BDG_MUTED(ifp);
int index;
eh = mtod(m, struct ether_header *);
1998-09-12 22:07:47 +00:00
/*
* hash the source address
*/
BDG_LOCK();
index = HASH_FN(eh->ether_shost);
bt = &BDG_CLUSTER(ifp)->ht[index];
bt->used = 1;
old = bt->name;
if (old) { /* the entry is valid */
if (!ETHER_ADDR_EQ(eh->ether_shost, bt->etheraddr)) {
bdg_ipfw_colls++;
bt->name = NULL; /* NB: will overwrite below */
} else if (old != ifp) {
1998-09-12 22:07:47 +00:00
/*
* Found a loop. Either a machine has moved, or there
1998-09-12 22:07:47 +00:00
* is a misconfiguration/reconfiguration of the network.
* First, do not forward this packet!
* Record the relocation anyways; then, if loops persist,
* suspect a reconfiguration and disable forwarding
* from the old interface.
*/
bt->name = ifp; /* relocate address */
printf("-- loop (%d) %6D to %s from %s (%s)\n",
1998-09-12 22:07:47 +00:00
bdg_loops, eh->ether_shost, ".",
ifp->if_xname, old->if_xname,
2001-02-02 00:19:25 +00:00
BDG_MUTED(old) ? "muted":"active");
dropit = 1;
if (!BDG_MUTED(old)) {
if (bdg_loops++ > 10)
BDG_MUTE(old);
1998-09-12 22:07:47 +00:00
}
}
1998-09-12 22:07:47 +00:00
}
/*
* now write the source address into the table
*/
if (bt->name == NULL) {
DPRINTF(("%s: new addr %6D at %d for %s\n",
__func__, eh->ether_shost, ".", index, ifp->if_xname));
ETHER_ADDR_COPY(bt->etheraddr, eh->ether_shost);
bt->name = ifp;
1998-09-12 22:07:47 +00:00
}
dst = bridge_dst_lookup(eh, BDG_CLUSTER(ifp));
BDG_UNLOCK();
/*
* bridge_dst_lookup can return the following values:
1998-09-12 22:07:47 +00:00
* BDG_BCAST, BDG_MCAST, BDG_LOCAL, BDG_UNKNOWN, BDG_DROP, ifp.
* For muted interfaces, or when we detect a loop, the first 3 are
* changed in BDG_LOCAL (we still listen to incoming traffic),
* and others to BDG_DROP (no use for the local host).
* Also, for incoming packets, ifp is changed to BDG_DROP if ifp == src.
* These changes are not necessary for outgoing packets from ether_output().
1998-09-12 22:07:47 +00:00
*/
BDG_STAT(ifp, BDG_IN);
switch ((uintptr_t)dst) {
case (uintptr_t)BDG_BCAST:
case (uintptr_t)BDG_MCAST:
case (uintptr_t)BDG_LOCAL:
case (uintptr_t)BDG_UNKNOWN:
case (uintptr_t)BDG_DROP:
1998-09-12 22:07:47 +00:00
BDG_STAT(ifp, dst);
break;
default:
if (dst == ifp || dropit)
1998-09-12 22:07:47 +00:00
BDG_STAT(ifp, BDG_DROP);
else
BDG_STAT(ifp, BDG_FORWARD);
break;
1998-09-12 22:07:47 +00:00
}
if (dropit) {
1998-09-12 22:07:47 +00:00
if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_LOCAL)
dst = BDG_LOCAL;
1998-09-12 22:07:47 +00:00
else
dst = BDG_DROP;
1998-09-12 22:07:47 +00:00
} else {
if (dst == ifp)
dst = BDG_DROP;
1998-09-12 22:07:47 +00:00
}
DPRINTF(("%s: %6D ->%6D ty 0x%04x dst %s\n", __func__,
eh->ether_shost, ".",
eh->ether_dhost, ".",
ntohs(eh->ether_type),
2003-09-17 18:14:49 +00:00
(dst <= BDG_FORWARD) ? bdg_dst_names[(uintptr_t)dst] :
dst->if_xname));
switch ((uintptr_t)dst) {
case (uintptr_t)BDG_DROP:
m_freem(m);
return (NULL);
case (uintptr_t)BDG_LOCAL:
return (m);
case (uintptr_t)BDG_BCAST:
case (uintptr_t)BDG_MCAST:
m = bdg_forward(m, dst);
#ifdef DIAGNOSTIC /* glebius: am I right here? */
if (m == NULL) {
if_printf(ifp, "bridge dropped %s packet\n",
dst == BDG_BCAST ? "broadcast" : "multicast");
return (NULL);
}
#endif
return (m);
default:
m = bdg_forward(m, dst);
}
return (NULL); /* not reached */
}
/*
* Return 1 if it's ok to send a packet out the specified interface.
* The interface must be:
* used for bridging,
* not muted,
* not full,
* up and running,
* not the source interface, and
* belong to the same cluster as the 'real_dst'.
*/
static __inline int
bridge_ifok(struct ifnet *ifp, struct ifnet *src, struct ifnet *dst)
{
return (BDG_USED(ifp)
&& !BDG_MUTED(ifp)
&& !_IF_QFULL(&ifp->if_snd)
&& (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING)
&& ifp != src
&& BDG_SAMECLUSTER(ifp, dst));
1998-09-12 22:07:47 +00:00
}
/*
* Forward a packet to dst -- which can be a single interface or
* an entire cluster. The src port and muted interfaces are excluded.
*
* If src == NULL, the pkt comes from ether_output, and dst is the real
* interface the packet is originally sent to. In this case, we must forward
* it to the whole cluster.
* We never call bdg_forward from ether_output on interfaces which are
* not part of a cluster.
*
* If possible (i.e. we can determine that the caller does not need
* a copy), the packet is consumed here, and bdg_forward returns NULL.
* Otherwise, a pointer to a copy of the packet is returned.
1998-09-12 22:07:47 +00:00
*/
static struct mbuf *
bdg_forward(struct mbuf *m0, struct ifnet *dst)
1998-09-12 22:07:47 +00:00
{
#define EH_RESTORE(_m) do { \
M_PREPEND((_m), ETHER_HDR_LEN, M_DONTWAIT); \
if ((_m) == NULL) { \
bdg_dropped++; \
return NULL; \
} \
if (eh != mtod((_m), struct ether_header *)) \
bcopy(&save_eh, mtod((_m), struct ether_header *), ETHER_HDR_LEN); \
else \
bdg_predict++; \
} while (0);
struct ether_header *eh;
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
struct ifnet *src;
struct ifnet *ifp, *last;
int shared = bdg_copy; /* someone else is using the mbuf */
int error;
struct ifnet *real_dst = dst; /* real dst from ether_output */
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
struct ip_fw_args args;
struct ether_header save_eh;
struct mbuf *m;
DDB(quad_t ticks; ticks = rdtsc();)
args.rule = ip_dn_claim_rule(m0);
if (args.rule)
shared = 0; /* For sure this is our own mbuf. */
else
bdg_thru++; /* count 1st time through bdg_forward */
/*
* The packet arrives with the Ethernet header at the front.
*/
eh = mtod(m0, struct ether_header *);
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
src = m0->m_pkthdr.rcvif;
if (src == NULL) { /* packet from ether_output */
BDG_LOCK();
dst = bridge_dst_lookup(eh, BDG_CLUSTER(real_dst));
BDG_UNLOCK();
}
if (dst == BDG_DROP) { /* this should not happen */
printf("xx bdg_forward for BDG_DROP\n");
m_freem(m0);
bdg_dropped++;
2001-02-02 00:19:25 +00:00
return NULL;
1998-09-12 22:07:47 +00:00
}
if (dst == BDG_LOCAL) { /* this should not happen as well */
1998-09-12 22:07:47 +00:00
printf("xx ouch, bdg_forward for local pkt\n");
2001-02-02 00:19:25 +00:00
return m0;
1998-09-12 22:07:47 +00:00
}
2002-12-22 05:35:03 +00:00
if (dst == BDG_BCAST || dst == BDG_MCAST) {
/* need a copy for the local stack */
shared = 1;
1998-09-12 22:07:47 +00:00
}
1998-09-12 22:07:47 +00:00
/*
* Do filtering in a very similar way to what is done in ip_output.
* Only if firewall is loaded, enabled, and the packet is not
* from ether_output() (src==NULL, or we would filter it twice).
* Additional restrictions may apply e.g. non-IP, short packets,
* and pkts already gone through a pipe.
1998-09-12 22:07:47 +00:00
*/
if (src != NULL && (
(inet_pfil_hook.ph_busy_count >= 0 && bdg_ipf != 0) ||
(IPFW_LOADED && bdg_ipfw != 0))) {
2001-02-02 00:19:25 +00:00
int i;
1998-09-12 22:07:47 +00:00
if (args.rule != NULL && fw_one_pass)
goto forward; /* packet already partially processed */
/*
2001-02-02 00:19:25 +00:00
* i need some amt of data to be contiguous, and in case others need
* the packet (shared==1) also better be in the first mbuf.
*/
2001-02-02 00:19:25 +00:00
i = min(m0->m_pkthdr.len, max_protohdr) ;
if (shared || m0->m_len < i) {
m0 = m_pullup(m0, i);
2001-02-02 00:19:25 +00:00
if (m0 == NULL) {
printf("%s: m_pullup failed\n", __func__); /* XXXDPRINTF*/
bdg_dropped++;
return NULL;
}
eh = mtod(m0, struct ether_header *);
1998-09-12 22:07:47 +00:00
}
/*
* Processing below expects the Ethernet header is stripped.
* Furthermore, the mbuf chain might be replaced at various
* places. To deal with this we copy the header to a temporary
* location, strip the header, and restore it as needed.
*/
bcopy(eh, &save_eh, ETHER_HDR_LEN); /* local copy for restore */
m_adj(m0, ETHER_HDR_LEN); /* temporarily strip header */
/*
* NetBSD-style generic packet filter, pfil(9), hooks.
* Enables ipf(8) in bridging.
*/
Convert ipfw to use PFIL_HOOKS. This is change is transparent to userland and preserves the ipfw ABI. The ipfw core packet inspection and filtering functions have not been changed, only how ipfw is invoked is different. However there are many changes how ipfw is and its add-on's are handled: In general ipfw is now called through the PFIL_HOOKS and most associated magic, that was in ip_input() or ip_output() previously, is now done in ipfw_check_[in|out]() in the ipfw PFIL handler. IPDIVERT is entirely handled within the ipfw PFIL handlers. A packet to be diverted is checked if it is fragmented, if yes, ip_reass() gets in for reassembly. If not, or all fragments arrived and the packet is complete, divert_packet is called directly. For 'tee' no reassembly attempt is made and a copy of the packet is sent to the divert socket unmodified. The original packet continues its way through ip_input/output(). ipfw 'forward' is done via m_tag's. The ipfw PFIL handlers tag the packet with the new destination sockaddr_in. A check if the new destination is a local IP address is made and the m_flags are set appropriately. ip_input() and ip_output() have some more work to do here. For ip_input() the m_flags are checked and a packet for us is directly sent to the 'ours' section for further processing. Destination changes on the input path are only tagged and the 'srcrt' flag to ip_forward() is set to disable destination checks and ICMP replies at this stage. The tag is going to be handled on output. ip_output() again checks for m_flags and the 'ours' tag. If found, the packet will be dropped back to the IP netisr where it is going to be picked up by ip_input() again and the directly sent to the 'ours' section. When only the destination changes, the route's 'dst' is overwritten with the new destination from the forward m_tag. Then it jumps back at the route lookup again and skips the firewall check because it has been marked with M_SKIP_FIREWALL. ipfw 'forward' has to be compiled into the kernel with 'option IPFIREWALL_FORWARD' to enable it. DUMMYNET is entirely handled within the ipfw PFIL handlers. A packet for a dummynet pipe or queue is directly sent to dummynet_io(). Dummynet will then inject it back into ip_input/ip_output() after it has served its time. Dummynet packets are tagged and will continue from the next rule when they hit the ipfw PFIL handlers again after re-injection. BRIDGING and IPFW_ETHER are not changed yet and use ipfw_chk() directly as they did before. Later this will be changed to dedicated ETHER PFIL_HOOKS. More detailed changes to the code: conf/files Add netinet/ip_fw_pfil.c. conf/options Add IPFIREWALL_FORWARD option. modules/ipfw/Makefile Add ip_fw_pfil.c. net/bridge.c Disable PFIL_HOOKS if ipfw for bridging is active. Bridging ipfw is still directly invoked to handle layer2 headers and packets would get a double ipfw when run through PFIL_HOOKS as well. netinet/ip_divert.c Removed divert_clone() function. It is no longer used. netinet/ip_dummynet.[ch] Neither the route 'ro' nor the destination 'dst' need to be stored while in dummynet transit. Structure members and associated macros are removed. netinet/ip_fastfwd.c Removed all direct ipfw handling code and replace it with the new 'ipfw forward' handling code. netinet/ip_fw.h Removed 'ro' and 'dst' from struct ip_fw_args. netinet/ip_fw2.c (Re)moved some global variables and the module handling. netinet/ip_fw_pfil.c New file containing the ipfw PFIL handlers and module initialization. netinet/ip_input.c Removed all direct ipfw handling code and replace it with the new 'ipfw forward' handling code. ip_forward() does not longer require the 'next_hop' struct sockaddr_in argument. Disable early checks if 'srcrt' is set. netinet/ip_output.c Removed all direct ipfw handling code and replace it with the new 'ipfw forward' handling code. netinet/ip_var.h Add ip_reass() as general function. (Used from ipfw PFIL handlers for IPDIVERT.) netinet/raw_ip.c Directly check if ipfw and dummynet control pointers are active. netinet/tcp_input.c Rework the 'ipfw forward' to local code to work with the new way of forward tags. netinet/tcp_sack.c Remove include 'opt_ipfw.h' which is not needed here. sys/mbuf.h Remove m_claim_next() macro which was exclusively for ipfw 'forward' and is no longer needed. Approved by: re (scottl)
2004-08-17 22:05:54 +00:00
if (!IPFW_LOADED) { /* XXX: Prevent ipfw from being run twice. */
if (inet_pfil_hook.ph_busy_count >= 0 &&
m0->m_pkthdr.len >= sizeof(struct ip) &&
ntohs(save_eh.ether_type) == ETHERTYPE_IP) {
/*
* before calling the firewall, swap fields the same as IP does.
* here we assume the pkt is an IP one and the header is contiguous
*/
struct ip *ip = mtod(m0, struct ip *);
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
if (pfil_run_hooks(&inet_pfil_hook, &m0, src, PFIL_IN, NULL) != 0) {
/* NB: hook should consume packet */
return NULL;
}
if (m0 == NULL) /* consumed by filter */
return m0;
/*
* If we get here, the firewall has passed the pkt, but the mbuf
* pointer might have changed. Restore ip and the fields ntohs()'d.
*/
ip = mtod(m0, struct ip *);
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
}
Convert ipfw to use PFIL_HOOKS. This is change is transparent to userland and preserves the ipfw ABI. The ipfw core packet inspection and filtering functions have not been changed, only how ipfw is invoked is different. However there are many changes how ipfw is and its add-on's are handled: In general ipfw is now called through the PFIL_HOOKS and most associated magic, that was in ip_input() or ip_output() previously, is now done in ipfw_check_[in|out]() in the ipfw PFIL handler. IPDIVERT is entirely handled within the ipfw PFIL handlers. A packet to be diverted is checked if it is fragmented, if yes, ip_reass() gets in for reassembly. If not, or all fragments arrived and the packet is complete, divert_packet is called directly. For 'tee' no reassembly attempt is made and a copy of the packet is sent to the divert socket unmodified. The original packet continues its way through ip_input/output(). ipfw 'forward' is done via m_tag's. The ipfw PFIL handlers tag the packet with the new destination sockaddr_in. A check if the new destination is a local IP address is made and the m_flags are set appropriately. ip_input() and ip_output() have some more work to do here. For ip_input() the m_flags are checked and a packet for us is directly sent to the 'ours' section for further processing. Destination changes on the input path are only tagged and the 'srcrt' flag to ip_forward() is set to disable destination checks and ICMP replies at this stage. The tag is going to be handled on output. ip_output() again checks for m_flags and the 'ours' tag. If found, the packet will be dropped back to the IP netisr where it is going to be picked up by ip_input() again and the directly sent to the 'ours' section. When only the destination changes, the route's 'dst' is overwritten with the new destination from the forward m_tag. Then it jumps back at the route lookup again and skips the firewall check because it has been marked with M_SKIP_FIREWALL. ipfw 'forward' has to be compiled into the kernel with 'option IPFIREWALL_FORWARD' to enable it. DUMMYNET is entirely handled within the ipfw PFIL handlers. A packet for a dummynet pipe or queue is directly sent to dummynet_io(). Dummynet will then inject it back into ip_input/ip_output() after it has served its time. Dummynet packets are tagged and will continue from the next rule when they hit the ipfw PFIL handlers again after re-injection. BRIDGING and IPFW_ETHER are not changed yet and use ipfw_chk() directly as they did before. Later this will be changed to dedicated ETHER PFIL_HOOKS. More detailed changes to the code: conf/files Add netinet/ip_fw_pfil.c. conf/options Add IPFIREWALL_FORWARD option. modules/ipfw/Makefile Add ip_fw_pfil.c. net/bridge.c Disable PFIL_HOOKS if ipfw for bridging is active. Bridging ipfw is still directly invoked to handle layer2 headers and packets would get a double ipfw when run through PFIL_HOOKS as well. netinet/ip_divert.c Removed divert_clone() function. It is no longer used. netinet/ip_dummynet.[ch] Neither the route 'ro' nor the destination 'dst' need to be stored while in dummynet transit. Structure members and associated macros are removed. netinet/ip_fastfwd.c Removed all direct ipfw handling code and replace it with the new 'ipfw forward' handling code. netinet/ip_fw.h Removed 'ro' and 'dst' from struct ip_fw_args. netinet/ip_fw2.c (Re)moved some global variables and the module handling. netinet/ip_fw_pfil.c New file containing the ipfw PFIL handlers and module initialization. netinet/ip_input.c Removed all direct ipfw handling code and replace it with the new 'ipfw forward' handling code. ip_forward() does not longer require the 'next_hop' struct sockaddr_in argument. Disable early checks if 'srcrt' is set. netinet/ip_output.c Removed all direct ipfw handling code and replace it with the new 'ipfw forward' handling code. netinet/ip_var.h Add ip_reass() as general function. (Used from ipfw PFIL handlers for IPDIVERT.) netinet/raw_ip.c Directly check if ipfw and dummynet control pointers are active. netinet/tcp_input.c Rework the 'ipfw forward' to local code to work with the new way of forward tags. netinet/tcp_sack.c Remove include 'opt_ipfw.h' which is not needed here. sys/mbuf.h Remove m_claim_next() macro which was exclusively for ipfw 'forward' and is no longer needed. Approved by: re (scottl)
2004-08-17 22:05:54 +00:00
} /* XXX: Prevent ipfw from being run twice. */
/*
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
* Prepare arguments and call the firewall.
*/
if (!IPFW_LOADED || bdg_ipfw == 0) {
EH_RESTORE(m0); /* restore Ethernet header */
goto forward; /* not using ipfw, accept the packet */
}
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
/*
* XXX The following code is very similar to the one in
* if_ethersubr.c:ether_ipfw_chk()
*/
args.m = m0; /* the packet we are looking at */
args.oif = NULL; /* this is an input packet */
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
args.next_hop = NULL; /* we do not support forward yet */
args.eh = &save_eh; /* MAC header for bridged/MAC packets */
i = ip_fw_chk_ptr(&args);
m0 = args.m; /* in case the firewall used the mbuf */
if (m0 != NULL)
EH_RESTORE(m0); /* restore Ethernet header */
if (i == IP_FW_DENY) /* drop */
return m0;
KASSERT(m0 != NULL, ("bdg_forward: m0 is NULL"));
2001-02-02 00:19:25 +00:00
if (i == 0) /* a PASS rule. */
goto forward;
if (DUMMYNET_LOADED && (i == IP_FW_DUMMYNET)) {
1998-09-12 22:07:47 +00:00
/*
* Pass the pkt to dummynet, which consumes it.
* If shared, make a copy and keep the original.
1998-09-12 22:07:47 +00:00
*/
2001-02-02 00:19:25 +00:00
if (shared) {
m = m_copypacket(m0, M_DONTWAIT);
if (m == NULL) { /* copy failed, give up */
bdg_dropped++;
return NULL;
}
2001-02-02 00:19:25 +00:00
} else {
m = m0 ; /* pass the original to dummynet */
m0 = NULL ; /* and nothing back to the caller */
}
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
args.oif = real_dst;
ip_dn_io_ptr(m, DN_TO_BDG_FWD, &args);
return m0;
1998-09-12 22:07:47 +00:00
}
/*
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
* XXX at some point, add support for divert/forward actions.
* If none of the above matches, we have to drop the packet.
*/
bdg_ipfw_drops++;
return m0;
1998-09-12 22:07:47 +00:00
}
forward:
/*
2001-02-02 00:19:25 +00:00
* Again, bring up the headers in case of shared bufs to avoid
* corruptions in the future.
*/
if (shared) {
int i = min(m0->m_pkthdr.len, max_protohdr);
2001-02-02 00:19:25 +00:00
m0 = m_pullup(m0, i);
if (m0 == NULL) {
bdg_dropped++;
return NULL;
}
/* NB: eh is not used below; no need to recalculate it */
}
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
/*
* now real_dst is used to determine the cluster where to forward.
* For packets coming from ether_input, this is the one of the 'src'
* interface, whereas for locally generated packets (src==NULL) it
* is the cluster of the original destination interface, which
* was already saved into real_dst.
*/
if (src != NULL)
real_dst = src;
Remove (almost all) global variables that were used to hold packet forwarding state ("annotations") during ip processing. The code is considerably cleaner now. The variables removed by this change are: ip_divert_cookie used by divert sockets ip_fw_fwd_addr used for transparent ip redirection last_pkt used by dynamic pipes in dummynet Removal of the first two has been done by carrying the annotations into volatile structs prepended to the mbuf chains, and adding appropriate code to add/remove annotations in the routines which make use of them, i.e. ip_input(), ip_output(), tcp_input(), bdg_forward(), ether_demux(), ether_output_frame(), div_output(). On passing, remove a bug in divert handling of fragmented packet. Now it is the fragment at offset 0 which sets the divert status of the whole packet, whereas formerly it was the last incoming fragment to decide. Removal of last_pkt required a change in the interface of ip_fw_chk() and dummynet_io(). On passing, use the same mechanism for dummynet annotations and for divert/forward annotations. option IPFIREWALL_FORWARD is effectively useless, the code to implement it is very small and is now in by default to avoid the obfuscation of conditionally compiled code. NOTES: * there is at least one global variable left, sro_fwd, in ip_output(). I am not sure if/how this can be removed. * I have deliberately avoided gratuitous style changes in this commit to avoid cluttering the diffs. Minor stule cleanup will likely be necessary * this commit only focused on the IP layer. I am sure there is a number of global variables used in the TCP and maybe UDP stack. * despite the number of files touched, there are absolutely no API's or data structures changed by this commit (except the interfaces of ip_fw_chk() and dummynet_io(), which are internal anyways), so an MFC is quite safe and unintrusive (and desirable, given the improved readability of the code). MFC after: 10 days
2002-06-22 11:51:02 +00:00
last = NULL;
2002-12-22 05:35:03 +00:00
if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_UNKNOWN) {
/*
* Scan all ports and send copies to all but the last.
*/
IFNET_RLOCK(); /* XXX replace with generation # */
TAILQ_FOREACH(ifp, &ifnet, if_link) {
if (bridge_ifok(ifp, src, real_dst)) {
if (last) {
/*
* At this point we know two interfaces need a copy
* of the packet (last + ifp) so we must create a
* copy to handoff to last.
*/
m = m_copypacket(m0, M_DONTWAIT);
if (m == NULL) {
IFNET_RUNLOCK();
printf("%s: , m_copypacket failed!\n", __func__);
bdg_dropped++;
return m0; /* the original is still there... */
}
IFQ_HANDOFF(last, m, error);
if (!error)
BDG_STAT(last, BDG_OUT);
else
bdg_dropped++;
2001-02-02 00:19:25 +00:00
}
last = ifp;
1998-09-12 22:07:47 +00:00
}
}
IFNET_RUNLOCK();
} else {
if (bridge_ifok(dst, src, real_dst))
last = dst;
}
if (last) {
if (shared) { /* need to copy */
m = m_copypacket(m0, M_DONTWAIT);
if (m == NULL) {
printf("%s: sorry, m_copypacket failed!\n", __func__);
bdg_dropped++ ;
return m0; /* the original is still there... */
1998-09-12 22:07:47 +00:00
}
} else { /* consume original */
m = m0, m0 = NULL;
1998-09-12 22:07:47 +00:00
}
IFQ_HANDOFF(last, m, error);
if (!error)
BDG_STAT(last, BDG_OUT);
else
bdg_dropped++;
1998-09-12 22:07:47 +00:00
}
DDB(bdg_fw_ticks += (u_long)(rdtsc() - ticks) ; bdg_fw_count++ ;
if (bdg_fw_count != 0) bdg_fw_avg = bdg_fw_ticks/bdg_fw_count; )
return m0;
#undef EH_RESTORE
1998-09-12 22:07:47 +00:00
}
/*
* initialization of bridge code.
*/
static int
bdginit(void)
{
if (bootverbose)
printf("BRIDGE %s loaded\n", bridge_version);
ifp2sc = malloc(BDG_MAX_PORTS * sizeof(struct bdg_softc),
M_IFADDR, M_WAITOK | M_ZERO );
if (ifp2sc == NULL)
return ENOMEM;
BDG_LOCK_INIT();
n_clusters = 0;
clusters = NULL;
do_bridge = 0;
bzero(&bdg_stats, sizeof(bdg_stats));
bridge_in_ptr = bridge_in;
bdg_forward_ptr = bdg_forward;
bdgtakeifaces_ptr = reconfigure_bridge;
bdgtakeifaces_ptr(); /* XXX does this do anything? */
callout_init(&bdg_callout, debug_mpsafenet ? CALLOUT_MPSAFE : 0);
bdg_timeout(0);
return 0 ;
}
static void
bdgdestroy(void)
{
bridge_in_ptr = NULL;
bdg_forward_ptr = NULL;
bdgtakeifaces_ptr = NULL;
callout_stop(&bdg_callout);
BDG_LOCK();
bridge_off();
if (ifp2sc) {
free(ifp2sc, M_IFADDR);
ifp2sc = NULL;
}
BDG_LOCK_DESTROY();
}
/*
* initialization code, both for static and dynamic loading.
*/
static int
bridge_modevent(module_t mod, int type, void *unused)
{
int err;
switch (type) {
case MOD_LOAD:
if (BDG_LOADED)
err = EEXIST;
else
err = bdginit();
break;
case MOD_UNLOAD:
do_bridge = 0;
bdgdestroy();
err = 0;
break;
default:
err = EINVAL;
break;
}
return err;
}
static moduledata_t bridge_mod = {
"bridge",
bridge_modevent,
0
};
DECLARE_MODULE(bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(bridge, 1);