/*-
 * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif /* KLD_MODULE */
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ip_divert.h>
#include <netinet/ip_dummynet.h>

#include <machine/in_cksum.h>
2009-12-16 10:48:40 +00:00
|
|
|
static VNET_DEFINE(int, fw_enable) = 1;
|
|
|
|
#define V_fw_enable VNET(fw_enable)
|
|
|
|
|
2006-05-12 04:41:27 +00:00
|
|
|
#ifdef INET6
|
2009-12-16 10:48:40 +00:00
|
|
|
static VNET_DEFINE(int, fw6_enable) = 1;
|
|
|
|
#define V_fw6_enable VNET(fw6_enable)
|
2008-12-11 16:26:38 +00:00
|
|
|
#endif
|
2006-05-12 04:41:27 +00:00
|
|
|
|
|
|
|
int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
|
2004-08-17 22:05:54 +00:00
|
|
|
|
2004-10-19 21:14:57 +00:00
|
|
|
/* Divert hooks. */
|
|
|
|
ip_divert_packet_t *ip_divert_ptr = NULL;
|
2004-08-17 22:05:54 +00:00
|
|
|
|
2005-02-05 12:06:33 +00:00
|
|
|
/* ng_ipfw hooks. */
|
|
|
|
ng_ipfw_input_t *ng_ipfw_input_p = NULL;
|
|
|
|
|
2004-10-19 21:14:57 +00:00
|
|
|
/* Forward declarations. */
|
2009-12-28 10:47:04 +00:00
|
|
|
static void ipfw_divert(struct mbuf **, int, int);
|
2004-08-17 22:05:54 +00:00
|
|
|
|
#ifdef SYSCTL_NODE
/*
 * Sysctl knobs that attach/detach the ipfw pfil hooks at runtime.
 * Both route through ipfw_chg_hook(), which distinguishes the two
 * by comparing arg1 against the VNET symbol addresses.
 */
SYSCTL_DECL(_net_inet_ip_fw);
SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
    ipfw_chg_hook, "I", "Enable ipfw");
#ifdef INET6
SYSCTL_DECL(_net_inet6_ip6_fw);
SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
    ipfw_chg_hook, "I", "Enable ipfw+6");
#endif /* INET6 */
#endif /* SYSCTL_NODE */
2009-12-28 10:47:04 +00:00
|
|
|
/*
|
|
|
|
* The pfilter hook to pass packets to ipfw_chk and then to
|
|
|
|
* dummynet, divert, netgraph or other modules.
|
|
|
|
* The packet may be consumed.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
|
2004-09-29 04:54:33 +00:00
|
|
|
struct inpcb *inp)
|
2004-08-17 22:05:54 +00:00
|
|
|
{
|
|
|
|
struct ip_fw_args args;
|
2005-02-05 12:06:33 +00:00
|
|
|
struct ng_ipfw_tag *ng_tag;
|
2004-08-17 22:05:54 +00:00
|
|
|
struct m_tag *dn_tag;
|
2009-12-28 10:47:04 +00:00
|
|
|
int ipfw;
|
|
|
|
int ret;
|
2004-08-17 22:05:54 +00:00
|
|
|
#ifdef IPFIREWALL_FORWARD
|
|
|
|
struct m_tag *fwd_tag;
|
|
|
|
#endif
|
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
/* convert dir to IPFW values */
|
|
|
|
dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
|
2004-08-17 22:05:54 +00:00
|
|
|
bzero(&args, sizeof(args));
|
|
|
|
|
2005-02-05 12:06:33 +00:00
|
|
|
ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0,
|
|
|
|
NULL);
|
|
|
|
if (ng_tag != NULL) {
|
2009-12-28 10:47:04 +00:00
|
|
|
KASSERT(ng_tag->dir == dir,
|
2005-02-05 12:06:33 +00:00
|
|
|
("ng_ipfw tag with wrong direction"));
|
merge code from ipfw3-head to reduce contention on the ipfw lock
and remove all O(N) sequences from kernel critical sections in ipfw.
In detail:
1. introduce a IPFW_UH_LOCK to arbitrate requests from
the upper half of the kernel. Some things, such as 'ipfw show',
can be done holding this lock in read mode, whereas insert and
delete require IPFW_UH_WLOCK.
2. introduce a mapping structure to keep rules together. This replaces
the 'next' chain currently used in ipfw rules. At the moment
the map is a simple array (sorted by rule number and then rule_id),
so we can find a rule quickly instead of having to scan the list.
This reduces many expensive lookups from O(N) to O(log N).
3. when an expensive operation (such as insert or delete) is done
by userland, we grab IPFW_UH_WLOCK, create a new copy of the map
without blocking the bottom half of the kernel, then acquire
IPFW_WLOCK and quickly update pointers to the map and related info.
After dropping IPFW_LOCK we can then continue the cleanup protected
by IPFW_UH_LOCK. So userland still costs O(N) but the kernel side
is only blocked for O(1).
4. do not pass pointers to rules through dummynet, netgraph, divert etc,
but rather pass a <slot, chain_id, rulenum, rule_id> tuple.
We validate the slot index (in the array of #2) with chain_id,
and if successful do a O(1) dereference; otherwise, we can find
the rule in O(log N) through <rulenum, rule_id>
All the above does not change the userland/kernel ABI, though there
are some disgusting casts between pointers and uint32_t
Operation costs now are as follows:
Function Old Now Planned
-------------------------------------------------------------------
+ skipto X, non cached O(N) O(log N)
+ skipto X, cached O(1) O(1)
XXX dynamic rule lookup O(1) O(log N) O(1)
+ skipto tablearg O(N) O(1)
+ reinject, non cached O(N) O(log N)
+ reinject, cached O(1) O(1)
+ kernel blocked during setsockopt() O(N) O(1)
-------------------------------------------------------------------
The only (very small) regression is on dynamic rule lookup and this will
be fixed in a day or two, without changing the userland/kernel ABI
Supported by: Valeria Paoli
MFC after: 1 month
2009-12-22 19:01:47 +00:00
|
|
|
args.slot = ng_tag->slot;
|
|
|
|
args.rulenum = ng_tag->rulenum;
|
2009-06-09 21:27:11 +00:00
|
|
|
args.rule_id = ng_tag->rule_id;
|
|
|
|
args.chain_id = ng_tag->chain_id;
|
2005-02-05 12:06:33 +00:00
|
|
|
m_tag_delete(*m0, (struct m_tag *)ng_tag);
|
|
|
|
}
|
|
|
|
|
2004-09-13 16:46:05 +00:00
|
|
|
again:
|
2007-11-06 23:01:42 +00:00
|
|
|
dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL);
|
|
|
|
if (dn_tag != NULL) {
|
|
|
|
struct dn_pkt_tag *dt;
|
|
|
|
|
|
|
|
dt = (struct dn_pkt_tag *)(dn_tag+1);
|
merge code from ipfw3-head to reduce contention on the ipfw lock
and remove all O(N) sequences from kernel critical sections in ipfw.
In detail:
1. introduce a IPFW_UH_LOCK to arbitrate requests from
the upper half of the kernel. Some things, such as 'ipfw show',
can be done holding this lock in read mode, whereas insert and
delete require IPFW_UH_WLOCK.
2. introduce a mapping structure to keep rules together. This replaces
the 'next' chain currently used in ipfw rules. At the moment
the map is a simple array (sorted by rule number and then rule_id),
so we can find a rule quickly instead of having to scan the list.
This reduces many expensive lookups from O(N) to O(log N).
3. when an expensive operation (such as insert or delete) is done
by userland, we grab IPFW_UH_WLOCK, create a new copy of the map
without blocking the bottom half of the kernel, then acquire
IPFW_WLOCK and quickly update pointers to the map and related info.
After dropping IPFW_LOCK we can then continue the cleanup protected
by IPFW_UH_LOCK. So userland still costs O(N) but the kernel side
is only blocked for O(1).
4. do not pass pointers to rules through dummynet, netgraph, divert etc,
but rather pass a <slot, chain_id, rulenum, rule_id> tuple.
We validate the slot index (in the array of #2) with chain_id,
and if successful do a O(1) dereference; otherwise, we can find
the rule in O(log N) through <rulenum, rule_id>
All the above does not change the userland/kernel ABI, though there
are some disgusting casts between pointers and uint32_t
Operation costs now are as follows:
Function Old Now Planned
-------------------------------------------------------------------
+ skipto X, non cached O(N) O(log N)
+ skipto X, cached O(1) O(1)
XXX dynamic rule lookup O(1) O(log N) O(1)
+ skipto tablearg O(N) O(1)
+ reinject, non cached O(N) O(log N)
+ reinject, cached O(1) O(1)
+ kernel blocked during setsockopt() O(N) O(1)
-------------------------------------------------------------------
The only (very small) regression is on dynamic rule lookup and this will
be fixed in a day or two, without changing the userland/kernel ABI
Supported by: Valeria Paoli
MFC after: 1 month
2009-12-22 19:01:47 +00:00
|
|
|
args.slot = dt->slot;
|
|
|
|
args.rulenum = dt->rulenum;
|
2009-06-09 21:27:11 +00:00
|
|
|
args.rule_id = dt->rule_id;
|
|
|
|
args.chain_id = dt->chain_id;
|
2007-11-06 23:01:42 +00:00
|
|
|
m_tag_delete(*m0, dn_tag);
|
|
|
|
}
|
|
|
|
|
2004-08-17 22:05:54 +00:00
|
|
|
args.m = *m0;
|
2009-12-28 10:47:04 +00:00
|
|
|
args.oif = dir == DIR_OUT ? ifp : NULL;
|
2004-09-29 04:54:33 +00:00
|
|
|
args.inp = inp;
|
2004-08-17 22:05:54 +00:00
|
|
|
|
merge code from ipfw3-head to reduce contention on the ipfw lock
and remove all O(N) sequences from kernel critical sections in ipfw.
In detail:
1. introduce a IPFW_UH_LOCK to arbitrate requests from
the upper half of the kernel. Some things, such as 'ipfw show',
can be done holding this lock in read mode, whereas insert and
delete require IPFW_UH_WLOCK.
2. introduce a mapping structure to keep rules together. This replaces
the 'next' chain currently used in ipfw rules. At the moment
the map is a simple array (sorted by rule number and then rule_id),
so we can find a rule quickly instead of having to scan the list.
This reduces many expensive lookups from O(N) to O(log N).
3. when an expensive operation (such as insert or delete) is done
by userland, we grab IPFW_UH_WLOCK, create a new copy of the map
without blocking the bottom half of the kernel, then acquire
IPFW_WLOCK and quickly update pointers to the map and related info.
After dropping IPFW_LOCK we can then continue the cleanup protected
by IPFW_UH_LOCK. So userland still costs O(N) but the kernel side
is only blocked for O(1).
4. do not pass pointers to rules through dummynet, netgraph, divert etc,
but rather pass a <slot, chain_id, rulenum, rule_id> tuple.
We validate the slot index (in the array of #2) with chain_id,
and if successful do a O(1) dereference; otherwise, we can find
the rule in O(log N) through <rulenum, rule_id>
All the above does not change the userland/kernel ABI, though there
are some disgusting casts between pointers and uint32_t
Operation costs now are as follows:
Function Old Now Planned
-------------------------------------------------------------------
+ skipto X, non cached O(N) O(log N)
+ skipto X, cached O(1) O(1)
XXX dynamic rule lookup O(1) O(log N) O(1)
+ skipto tablearg O(N) O(1)
+ reinject, non cached O(N) O(log N)
+ reinject, cached O(1) O(1)
+ kernel blocked during setsockopt() O(N) O(1)
-------------------------------------------------------------------
The only (very small) regression is on dynamic rule lookup and this will
be fixed in a day or two, without changing the userland/kernel ABI
Supported by: Valeria Paoli
MFC after: 1 month
2009-12-22 19:01:47 +00:00
|
|
|
if (V_fw_one_pass == 0 || args.slot == 0) {
|
2009-04-27 17:37:36 +00:00
|
|
|
ipfw = ipfw_chk(&args);
|
|
|
|
*m0 = args.m;
|
|
|
|
} else
|
|
|
|
ipfw = IP_FW_PASS;
|
|
|
|
|
2005-01-14 09:00:46 +00:00
|
|
|
KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
|
|
|
|
__func__));
|
2004-08-17 22:05:54 +00:00
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
/* breaking out of the switch means drop */
|
|
|
|
ret = 0; /* default return value for pass */
|
2005-01-14 09:00:46 +00:00
|
|
|
switch (ipfw) {
|
|
|
|
case IP_FW_PASS:
|
2009-12-28 10:47:04 +00:00
|
|
|
/* next_hop may be set by ipfw_chk */
|
2005-01-14 09:00:46 +00:00
|
|
|
if (args.next_hop == NULL)
|
2009-12-28 10:47:04 +00:00
|
|
|
break; /* pass */
|
|
|
|
#ifndef IPFIREWALL_FORWARD
|
|
|
|
ret = EACCES;
|
|
|
|
#else
|
|
|
|
/* Incoming packets should not be tagged so we do not
|
|
|
|
* m_tag_find. Outgoing packets may be tagged, so we
|
|
|
|
* reuse the tag if present.
|
|
|
|
*/
|
|
|
|
fwd_tag = (dir == DIR_IN) ? NULL :
|
|
|
|
m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
|
|
|
|
if (fwd_tag != NULL) {
|
|
|
|
m_tag_unlink(*m0, fwd_tag);
|
|
|
|
} else {
|
2004-08-17 22:05:54 +00:00
|
|
|
fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
|
|
|
|
sizeof(struct sockaddr_in), M_NOWAIT);
|
2009-12-28 10:47:04 +00:00
|
|
|
if (fwd_tag == NULL) {
|
|
|
|
ret = EACCES;
|
|
|
|
break; /* i.e. drop */
|
|
|
|
}
|
|
|
|
}
|
2004-08-17 22:05:54 +00:00
|
|
|
bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
|
|
|
|
m_tag_prepend(*m0, fwd_tag);
|
|
|
|
|
|
|
|
if (in_localip(args.next_hop->sin_addr))
|
|
|
|
(*m0)->m_flags |= M_FASTFWD_OURS;
|
|
|
|
#endif
|
2009-12-28 10:47:04 +00:00
|
|
|
break;
|
2005-01-14 09:00:46 +00:00
|
|
|
|
|
|
|
case IP_FW_DENY:
|
2009-12-28 10:47:04 +00:00
|
|
|
ret = EACCES;
|
|
|
|
break; /* i.e. drop */
|
2005-01-14 09:00:46 +00:00
|
|
|
|
|
|
|
case IP_FW_DUMMYNET:
|
2009-12-28 10:47:04 +00:00
|
|
|
ret = EACCES;
|
2009-06-05 13:44:30 +00:00
|
|
|
if (ip_dn_io_ptr == NULL)
|
2009-12-28 10:47:04 +00:00
|
|
|
break; /* i.e. drop */
|
2005-04-18 18:35:05 +00:00
|
|
|
if (mtod(*m0, struct ip *)->ip_v == 4)
|
2009-12-28 10:47:04 +00:00
|
|
|
ret = ip_dn_io_ptr(m0, dir, &args);
|
2005-04-18 18:35:05 +00:00
|
|
|
else if (mtod(*m0, struct ip *)->ip_v == 6)
|
2009-12-28 10:47:04 +00:00
|
|
|
ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
|
|
|
|
else
|
|
|
|
break; /* drop it */
|
|
|
|
/*
|
|
|
|
* XXX should read the return value.
|
|
|
|
* dummynet normally eats the packet and sets *m0=NULL
|
|
|
|
* unless the packet can be sent immediately. In this
|
|
|
|
* case args is updated and we should re-run the
|
|
|
|
* check without clearing args.
|
|
|
|
*/
|
2007-11-06 23:01:42 +00:00
|
|
|
if (*m0 != NULL)
|
|
|
|
goto again;
|
2005-01-14 09:00:46 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case IP_FW_TEE:
|
|
|
|
case IP_FW_DIVERT:
|
2009-12-28 10:47:04 +00:00
|
|
|
if (ip_divert_ptr == NULL) {
|
|
|
|
ret = EACCES;
|
|
|
|
break; /* i.e. drop */
|
|
|
|
}
|
|
|
|
ipfw_divert(m0, dir, (ipfw == IP_FW_TEE) ? 1 : 0);
|
|
|
|
if (*m0) {
|
|
|
|
/* continue processing for this one. We set
|
|
|
|
* args.slot=0, but the divert tag is processed
|
|
|
|
* in ipfw_chk to jump to the right place.
|
|
|
|
*/
|
merge code from ipfw3-head to reduce contention on the ipfw lock
and remove all O(N) sequences from kernel critical sections in ipfw.
In detail:
1. introduce a IPFW_UH_LOCK to arbitrate requests from
the upper half of the kernel. Some things, such as 'ipfw show',
can be done holding this lock in read mode, whereas insert and
delete require IPFW_UH_WLOCK.
2. introduce a mapping structure to keep rules together. This replaces
the 'next' chain currently used in ipfw rules. At the moment
the map is a simple array (sorted by rule number and then rule_id),
so we can find a rule quickly instead of having to scan the list.
This reduces many expensive lookups from O(N) to O(log N).
3. when an expensive operation (such as insert or delete) is done
by userland, we grab IPFW_UH_WLOCK, create a new copy of the map
without blocking the bottom half of the kernel, then acquire
IPFW_WLOCK and quickly update pointers to the map and related info.
After dropping IPFW_LOCK we can then continue the cleanup protected
by IPFW_UH_LOCK. So userland still costs O(N) but the kernel side
is only blocked for O(1).
4. do not pass pointers to rules through dummynet, netgraph, divert etc,
but rather pass a <slot, chain_id, rulenum, rule_id> tuple.
We validate the slot index (in the array of #2) with chain_id,
and if successful do a O(1) dereference; otherwise, we can find
the rule in O(log N) through <rulenum, rule_id>
All the above does not change the userland/kernel ABI, though there
are some disgusting casts between pointers and uint32_t
Operation costs now are as follows:
Function Old Now Planned
-------------------------------------------------------------------
+ skipto X, non cached O(N) O(log N)
+ skipto X, cached O(1) O(1)
XXX dynamic rule lookup O(1) O(log N) O(1)
+ skipto tablearg O(N) O(1)
+ reinject, non cached O(N) O(log N)
+ reinject, cached O(1) O(1)
+ kernel blocked during setsockopt() O(N) O(1)
-------------------------------------------------------------------
The only (very small) regression is on dynamic rule lookup and this will
be fixed in a day or two, without changing the userland/kernel ABI
Supported by: Valeria Paoli
MFC after: 1 month
2009-12-22 19:01:47 +00:00
|
|
|
args.slot = 0;
|
2005-01-14 09:00:46 +00:00
|
|
|
goto again; /* continue with packet */
|
2005-04-06 14:00:33 +00:00
|
|
|
}
|
2009-12-28 10:47:04 +00:00
|
|
|
break;
|
2005-01-14 09:00:46 +00:00
|
|
|
|
2005-02-05 12:06:33 +00:00
|
|
|
case IP_FW_NGTEE:
|
|
|
|
case IP_FW_NETGRAPH:
|
2009-12-28 10:47:04 +00:00
|
|
|
if (!NG_IPFW_LOADED) {
|
|
|
|
ret = EACCES;
|
|
|
|
break; /* i.e. drop */
|
|
|
|
}
|
|
|
|
ret = ng_ipfw_input_p(m0, dir, &args,
|
|
|
|
(ipfw == IP_FW_NGTEE) ? 1 : 0);
|
|
|
|
if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
|
|
|
|
goto again; /* continue with packet */
|
|
|
|
break;
|
2005-02-05 12:06:33 +00:00
|
|
|
|
2006-12-29 21:59:17 +00:00
|
|
|
case IP_FW_NAT:
|
2009-04-01 20:23:47 +00:00
|
|
|
case IP_FW_REASS:
|
2009-12-28 10:47:04 +00:00
|
|
|
goto again; /* continue with packet */
|
2009-04-01 20:23:47 +00:00
|
|
|
|
2005-01-14 09:00:46 +00:00
|
|
|
default:
|
|
|
|
KASSERT(0, ("%s: unknown retval", __func__));
|
|
|
|
}
|
2004-08-17 22:05:54 +00:00
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
if (ret != 0) {
|
|
|
|
if (*m0)
|
|
|
|
m_freem(*m0);
|
|
|
|
*m0 = NULL;
|
|
|
|
}
|
|
|
|
return ret;
|
2004-08-17 22:05:54 +00:00
|
|
|
}
|
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
static void
|
|
|
|
ipfw_divert(struct mbuf **m0, int incoming, int tee)
|
2004-08-17 22:05:54 +00:00
|
|
|
{
|
|
|
|
/*
|
2004-09-13 16:46:05 +00:00
|
|
|
* ipfw_chk() has already tagged the packet with the divert tag.
|
2004-08-17 22:05:54 +00:00
|
|
|
* If tee is set, copy packet and return original.
|
|
|
|
* If not tee, consume packet and send it to divert socket.
|
|
|
|
*/
|
2009-12-28 10:47:04 +00:00
|
|
|
struct mbuf *clone;
|
2004-08-17 22:05:54 +00:00
|
|
|
struct ip *ip;
|
2004-10-19 21:14:57 +00:00
|
|
|
|
2004-08-17 22:05:54 +00:00
|
|
|
/* Cloning needed for tee? */
|
2009-12-28 10:47:04 +00:00
|
|
|
if (tee == 0) {
|
|
|
|
clone = *m0; /* use the original mbuf */
|
|
|
|
*m0 = NULL;
|
|
|
|
} else {
|
|
|
|
clone = m_dup(*m0, M_DONTWAIT);
|
|
|
|
/* If we cannot duplicate the mbuf, we sacrifice the divert
|
|
|
|
* chain and continue with the tee-ed packet.
|
|
|
|
*/
|
|
|
|
if (clone == NULL)
|
|
|
|
return;
|
|
|
|
}
|
2004-08-17 22:05:54 +00:00
|
|
|
|
|
|
|
/*
|
2009-12-28 10:47:04 +00:00
|
|
|
* Divert listeners can normally handle non-fragmented packets,
|
|
|
|
* but we can only reass in the non-tee case.
|
|
|
|
* This means that listeners on a tee rule may get fragments,
|
|
|
|
* and have to live with that.
|
|
|
|
* Note that we now have the 'reass' ipfw option so if we care
|
|
|
|
* we can do it before a 'tee'.
|
2004-08-17 22:05:54 +00:00
|
|
|
*/
|
|
|
|
ip = mtod(clone, struct ip *);
|
|
|
|
if (!tee && ip->ip_off & (IP_MF | IP_OFFMASK)) {
|
2009-12-28 10:47:04 +00:00
|
|
|
int hlen;
|
|
|
|
struct mbuf *reass;
|
2004-08-17 22:05:54 +00:00
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
reass = ip_reass(clone); /* Reassemble packet. */
|
|
|
|
if (reass == NULL)
|
|
|
|
return;
|
|
|
|
/* if reass = NULL then it was consumed by ip_reass */
|
2004-08-17 22:05:54 +00:00
|
|
|
/*
|
|
|
|
* IP header checksum fixup after reassembly and leave header
|
|
|
|
* in network byte order.
|
|
|
|
*/
|
2009-12-28 10:47:04 +00:00
|
|
|
ip = mtod(reass, struct ip *);
|
|
|
|
hlen = ip->ip_hl << 2;
|
|
|
|
SET_NET_IPLEN(ip);
|
|
|
|
ip->ip_sum = 0;
|
|
|
|
if (hlen == sizeof(struct ip))
|
|
|
|
ip->ip_sum = in_cksum_hdr(ip);
|
|
|
|
else
|
|
|
|
ip->ip_sum = in_cksum(reass, hlen);
|
|
|
|
clone = reass;
|
2004-08-17 22:05:54 +00:00
|
|
|
} else {
|
|
|
|
/* Convert header to network byte order. */
|
2009-12-28 10:47:04 +00:00
|
|
|
SET_NET_IPLEN(ip);
|
2004-08-17 22:05:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Do the dirty job... */
|
2009-12-28 10:47:04 +00:00
|
|
|
ip_divert_ptr(clone, incoming);
|
2006-05-12 04:41:27 +00:00
|
|
|
}
|
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
/*
|
|
|
|
* attach or detach hooks for a given protocol family
|
|
|
|
*/
|
2009-12-16 10:48:40 +00:00
|
|
|
static int
|
2009-12-28 10:47:04 +00:00
|
|
|
ipfw_hook(int onoff, int pf)
|
2006-05-12 04:41:27 +00:00
|
|
|
{
|
2009-12-28 10:47:04 +00:00
|
|
|
const int arg = PFIL_IN | PFIL_OUT | PFIL_WAITOK;
|
|
|
|
struct pfil_head *pfh;
|
2006-05-12 04:41:27 +00:00
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
pfh = pfil_head_get(PFIL_TYPE_AF, pf);
|
|
|
|
if (pfh == NULL)
|
2005-04-18 18:35:05 +00:00
|
|
|
return ENOENT;
|
2004-08-17 22:05:54 +00:00
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
if (onoff)
|
|
|
|
(void)pfil_add_hook(ipfw_check_hook, NULL, arg, pfh);
|
|
|
|
else
|
|
|
|
(void)pfil_remove_hook(ipfw_check_hook, NULL, arg, pfh);
|
2004-08-17 22:05:54 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2006-05-12 04:41:27 +00:00
|
|
|
|
2009-12-16 10:48:40 +00:00
|
|
|
int
|
2009-12-28 10:47:04 +00:00
|
|
|
ipfw_attach_hooks(int arg)
|
2009-12-16 10:48:40 +00:00
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
if (arg == 0) /* detach */
|
|
|
|
ipfw_hook(0, AF_INET);
|
|
|
|
else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
|
2009-12-16 10:48:40 +00:00
|
|
|
error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
|
|
|
|
printf("ipfw_hook() error\n");
|
|
|
|
}
|
|
|
|
#ifdef INET6
|
2009-12-28 10:47:04 +00:00
|
|
|
if (arg == 0) /* detach */
|
|
|
|
ipfw_hook(0, AF_INET6);
|
|
|
|
else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
|
2009-12-16 10:48:40 +00:00
|
|
|
error = ENOENT;
|
|
|
|
printf("ipfw6_hook() error\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2006-05-12 04:41:27 +00:00
|
|
|
int
|
|
|
|
ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
2009-10-11 05:59:43 +00:00
|
|
|
int enable;
|
|
|
|
int oldenable;
|
2006-05-12 04:41:27 +00:00
|
|
|
int error;
|
2009-12-28 10:47:04 +00:00
|
|
|
int af;
|
2006-05-12 04:41:27 +00:00
|
|
|
|
2009-10-11 05:59:43 +00:00
|
|
|
if (arg1 == &VNET_NAME(fw_enable)) {
|
|
|
|
enable = V_fw_enable;
|
2009-12-28 10:47:04 +00:00
|
|
|
af = AF_INET;
|
2009-10-11 05:59:43 +00:00
|
|
|
}
|
|
|
|
#ifdef INET6
|
|
|
|
else if (arg1 == &VNET_NAME(fw6_enable)) {
|
|
|
|
enable = V_fw6_enable;
|
2009-12-28 10:47:04 +00:00
|
|
|
af = AF_INET6;
|
2009-10-11 05:59:43 +00:00
|
|
|
}
|
2009-08-21 11:20:10 +00:00
|
|
|
#endif
|
2009-10-11 05:59:43 +00:00
|
|
|
else
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
oldenable = enable;
|
|
|
|
|
2006-05-12 04:41:27 +00:00
|
|
|
error = sysctl_handle_int(oidp, &enable, 0, req);
|
2009-10-11 05:59:43 +00:00
|
|
|
|
2006-05-12 04:41:27 +00:00
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
enable = (enable) ? 1 : 0;
|
|
|
|
|
2009-10-11 05:59:43 +00:00
|
|
|
if (enable == oldenable)
|
2006-05-12 04:41:27 +00:00
|
|
|
return (0);
|
|
|
|
|
2009-12-28 10:47:04 +00:00
|
|
|
error = ipfw_hook(enable, af);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
if (af == AF_INET)
|
2009-10-11 05:59:43 +00:00
|
|
|
V_fw_enable = enable;
|
2006-05-12 04:41:27 +00:00
|
|
|
#ifdef INET6
|
2009-12-28 10:47:04 +00:00
|
|
|
else if (af == AF_INET6)
|
2009-10-11 05:59:43 +00:00
|
|
|
V_fw6_enable = enable;
|
2006-05-12 04:41:27 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
/* end of file */