freebsd-dev/sys/netinet/ip_fw.c

/*
 * Copyright (c) 1993 Daniel Boulet
 * Copyright (c) 1994 Ugen J.S.Antsilevich
 * Copyright (c) 1996 Alex Nash
 * Copyright (c) 2000-2002 Luigi Rizzo
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
 *
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * $FreeBSD$
 */

#define        DEB(x)
#define        DDB(x) x

/*
 * Implement IP packet firewall
 */

#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdn.h"
#include "opt_ipdivert.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#include <netinet/if_ether.h> /* XXX ethertype_ip */

static int fw_debug = 1;
#ifdef IPFIREWALL_VERBOSE
static int fw_verbose = 1;
#else
static int fw_verbose = 0;
#endif
int fw_one_pass = 1 ;
#ifdef IPFIREWALL_VERBOSE_LIMIT
static int fw_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#else
static int fw_verbose_limit = 0;
#endif
static int fw_permanent_rules = 0;

/*
 * Right now, two fields in the IP header are changed to host format
 * by the IP layer before calling the firewall. Ideally, we would like
 * to have them in network format so that the packet can be
 * used as it comes from the device driver (and is thus readonly).
 */

static u_int64_t counter;	/* counter for ipfw_report(NULL...) */

/*
 * All-ones u_short widened to u_int.
 * NOTE(review): presumably the rule number reserved for the default
 * rule -- the users of this macro are outside this part of the file.
 */
#define	IPFW_DEFAULT_RULE	((u_int)(u_short)~0)

/* Head of the list of static rules (struct ip_fw, linked through 'next'). */
LIST_HEAD (ip_fw_head, ip_fw) ip_fw_chain_head;

/* malloc(9) type used for firewall allocations, dynamic rules included. */
MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");

#ifdef SYSCTL_NODE
/*
 * net.inet.ip.fw sysctl subtree and its read/write knobs.
 * Each entry exports one of the global tunables; the description
 * string of every entry states what the knob controls.
 */
SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW,
    &fw_enable, 0, "Enable ipfw");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW,
    &fw_one_pass, 0,
    "Only do a single pass through ipfw when using dummynet(4)");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
    &fw_debug, 0, "Enable printing of debug ip_fw statements");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
    &fw_verbose, 0, "Log matches to ipfw rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
    &fw_verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, permanent_rules, CTLFLAG_RW,
    &fw_permanent_rules, 0, "Set rule number, below which rules are permanent");

/*
 * Extension for stateful ipfw.
 *
 * Dynamic rules are stored in lists accessed through a hash table
 * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
 * be modified through the sysctl variable dyn_buckets which is
 * updated when the table becomes empty.
 *
 * XXX currently there is only one list, ipfw_dyn.
 *
 * When a packet is received, it is first hashed, then matched
 * against the entries in the corresponding list.
 * Matching occurs according to the rule type. The default is to
 * match the four fields and the protocol, and rules are bidirectional.
 *
 * For a busy proxy/web server we will have lots of connections to
 * the server. We could decide for a rule type where we ignore
 * ports (different hashing) and avoid special SYN/RST/FIN handling.
 *
 * XXX when we decide to support more than one rule type, we should
 * repeat the hashing multiple times using only the useful fields.
 * Or, we could run the various tests in parallel, because the
 * 'move to front' technique should shorten the average search.
 *
 * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * The total number of dynamic rules is stored in dyn_count.
 * The max number of dynamic rules is dyn_max. When we reach
 * the maximum number of rules we do not create any more. This is
 * done to avoid consuming too much memory, but also too much
 * time when searching on each packet (ideally, we should try instead
 * to put a limit on the length of the list on each bucket...).
 *
 * Each dynamic rule holds a pointer to the parent ipfw rule so
 * we know what action to perform. Dynamic rules are removed when
 * the parent rule is deleted.
 * There are some limitations with dynamic rules -- we do not
 * obey the 'randomized match', and we do not do multiple
 * passes through the firewall.
 * XXX check the latter!!!
 */
/*
 * Hash table of dynamic rules: an array of curr_dyn_buckets list heads,
 * allocated lazily by add_dyn_rule().  dyn_buckets is the size requested
 * through sysctl; it is adopted as curr_dyn_buckets only when the table
 * is (re)allocated.
 */
static struct ipfw_dyn_rule **ipfw_dyn_v = NULL ;
static u_int32_t dyn_buckets = 256 ; /* must be power of 2 */
static u_int32_t curr_dyn_buckets = 256 ; /* must be power of 2 */

/*
 * timeouts for various events in handling dynamic rules.
 * Values are added to time_second, i.e. they are in seconds.
 */
static u_int32_t dyn_ack_lifetime = 300 ;
static u_int32_t dyn_syn_lifetime = 20 ;
static u_int32_t dyn_fin_lifetime = 1 ;
static u_int32_t dyn_rst_lifetime = 1 ;
static u_int32_t dyn_udp_lifetime = 10 ;
static u_int32_t dyn_short_lifetime = 5 ;

/*
 * after reaching 0, dynamic rules are considered still valid for
 * an additional grace time, unless there is lack of resources.
 */
static u_int32_t dyn_grace_time = 10 ;

static u_int32_t static_count = 0 ;	/* # of static rules */
static u_int32_t dyn_count = 0 ;	/* # of dynamic rules */
static u_int32_t dyn_max = 1000 ;	/* max # of dynamic rules */

/*
 * Sysctl knobs for the dynamic-rule machinery.  Entries flagged
 * CTLFLAG_RD (curr_dyn_buckets, dyn_count, static_count, dyn_grace_time)
 * are read-only; the others can be changed at run time.
 * NOTE(review): the exported variables are u_int32_t while the macro
 * used is SYSCTL_INT -- confirm whether SYSCTL_UINT is preferable.
 */
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
    &dyn_buckets, 0, "Number of dyn. buckets");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
    &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
    &dyn_count, 0, "Number of dyn. rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
    &dyn_max, 0, "Max number of dyn. rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_grace_time, CTLFLAG_RD,
    &dyn_grace_time, 0, "Grace time for dyn. rules");

#endif /* SYSCTL_NODE */

/*
 * dprintf((fmt, args...)) -- printf() guarded by fw_debug.  The argument
 * must be a complete, parenthesized printf argument list because it is
 * pasted directly after the word 'printf'.
 */
#define dprintf(a)	do {						\
				if (fw_debug)				\
					printf a;			\
			} while (0)
/*
 * SNPARGS(buf, len) expands to the first TWO arguments of snprintf():
 * a pointer 'len' bytes into buf, and the space left in buf after that
 * offset (0 when len is at or past the end).  'buf' must be a real array,
 * since sizeof(buf) is used for its capacity.  Arguments are not
 * parenthesized, so pass simple expressions only.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0

/*
 * Forward declarations for file-local helpers.  The matching helpers
 * (iface_match .. icmptype_match) and ipfw_report are defined below;
 * the remaining ones are defined elsewhere in this file.
 */
static int	add_entry (struct ip_fw_head *chainptr, struct ip_fw *frwl);
static int	del_entry (struct ip_fw_head *chainptr, u_short number);
static int	zero_entry (struct ip_fw *, int);
static int	check_ipfw_struct (struct ip_fw *m);
static int	iface_match (struct ifnet *ifp, union ip_fw_if *ifu,
				 int byname);
static int	ipopts_match (struct ip *ip, struct ip_fw *f);
static int	iptos_match (struct ip *ip, struct ip_fw *f);
static __inline int
		port_match (u_short *portptr, int nports, u_short port,
				int range_flag, int mask);
static int	tcpflg_match (struct tcphdr *tcp, struct ip_fw *f);
static int	icmptype_match (struct icmp *  icmp, struct ip_fw * f);
static void	ipfw_report (struct ip_fw *f, struct ip *ip, int ip_off,
				int ip_len, struct ifnet *rif,
				struct ifnet *oif);

static void flush_rule_ptrs(void);

/* Packet-check entry point and sockopt control handler. */
static ip_fw_chk_t	ip_fw_chk;
static int	ip_fw_ctl (struct sockopt *sopt);

/* Hook pointer, NULL by default (not static, so visible to other files).
 * NOTE(review): presumably set by dummynet to be told about deleted
 * rules -- confirm against ip_dummynet. */
ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL;

/* Prefix string for messages; the name suggests use by ip_fw_ctl(). */
static char err_prefix[] = "ip_fw_ctl:";

/*
 * Test 'port' against a rule's port vector.
 *
 * portptr	vector of nports entries, laid out as:
 *		  [value, mask]   present only when 'mask' is nonzero;
 *		  [low, high]     present only when 'range_flag' is nonzero;
 *		  remaining entries are individual port numbers.
 * nports	total number of entries; 0 means "any port".
 * port		the port being tested.
 *
 * Returns 1 if the port is matched by the vector, 0 otherwise.
 */
static __inline int
port_match(u_short *portptr, int nports, u_short port, int range_flag, int mask)
{
	int idx;

	/* An empty vector places no constraint on the port. */
	if (nports == 0)
		return 1;

	/* Optional leading value/mask pair. */
	if (mask != 0) {
		if (((portptr[0] ^ port) & portptr[1]) == 0)
			return 1;
		portptr += 2;
		nports -= 2;
	}

	/* Optional inclusive low..high range. */
	if (range_flag != 0) {
		if (port >= portptr[0] && port <= portptr[1])
			return 1;
		portptr += 2;
		nports -= 2;
	}

	/* Whatever is left is a plain list of ports. */
	for (idx = 0; idx < nports; idx++) {
		if (portptr[idx] == port)
			return 1;
	}
	return 0;
}

/*
 * Match the TCP flags of a segment against rule 'f'.
 *
 * Returns 1 when every flag in f->fw_tcpf is set in the segment and no
 * flag in f->fw_tcpnf is set; 0 otherwise.  When the rule carries
 * IP_FW_IF_TCPEST ("established"), a segment whose RST/ACK/SYN bits
 * consist of SYN alone is rejected up front.
 */
static int
tcpflg_match(struct tcphdr *tcp, struct ip_fw *f)
{
	u_char		wanted_seen, forbidden_seen;

	/* "established" excludes a bare SYN (SYN set, RST and ACK clear). */
	if (f->fw_ipflg & IP_FW_IF_TCPEST) {
		if ((tcp->th_flags & (TH_RST | TH_ACK | TH_SYN)) == TH_SYN)
			return 0;
	}

	wanted_seen = tcp->th_flags & f->fw_tcpf;
	forbidden_seen = tcp->th_flags & f->fw_tcpnf;

	/* All required flags present, none of the forbidden ones. */
	return (wanted_seen == f->fw_tcpf && forbidden_seen == 0) ? 1 : 0;
}

static int
icmptype_match(struct icmp *icmp, struct ip_fw *f)
{
	int type;

	if (!(f->fw_flg & IP_FW_F_ICMPBIT))
		return(1);

	type = icmp->icmp_type;

	/* check for matching type in the bitmap */
	if (type < IP_FW_ICMPTYPES_MAX &&
	    (f->fw_uar.fw_icmptypes[type / (sizeof(unsigned) * NBBY)] &
	    (1U << (type % (sizeof(unsigned) * NBBY)))))
		return(1);

	return(0); /* no match */
}

/*
 * Return 1 if the ICMP message following the IP header is a query
 * (echo, router solicitation, timestamp, information or address mask
 * request), 0 for every other type.  The ICMP header is located using
 * ip->ip_hl, so the caller must make sure it is present in the buffer.
 */
static int
is_icmp_query(struct ip *ip)
{
	const struct icmp *icmp;

	icmp = (struct icmp *)((u_int32_t *)ip + ip->ip_hl);

	switch (icmp->icmp_type) {
	case ICMP_ECHO:
	case ICMP_ROUTERSOLICIT:
	case ICMP_TSTAMP:
	case ICMP_IREQ:
	case ICMP_MASKREQ:
		return(1);
	default:
		return(0);
	}
}

static int
ipopts_match(struct ip *ip, struct ip_fw *f)
{
	register u_char *cp;
	int opt, optlen, cnt;
	u_char	opts, nopts, nopts_sve;

	cp = (u_char *)(ip + 1);
	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
	opts = f->fw_ipopt;
	nopts = nopts_sve = f->fw_ipnopt;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP)
			optlen = 1;
		else {
			optlen = cp[IPOPT_OLEN];
			if (optlen <= 0 || optlen > cnt) {
				return 0; /*XXX*/
			}
		}
		switch (opt) {

		default:
			break;

		case IPOPT_LSRR:
			opts &= ~IP_FW_IPOPT_LSRR;
			nopts &= ~IP_FW_IPOPT_LSRR;
			break;

		case IPOPT_SSRR:
			opts &= ~IP_FW_IPOPT_SSRR;
			nopts &= ~IP_FW_IPOPT_SSRR;
			break;

		case IPOPT_RR:
			opts &= ~IP_FW_IPOPT_RR;
			nopts &= ~IP_FW_IPOPT_RR;
			break;
		case IPOPT_TS:
			opts &= ~IP_FW_IPOPT_TS;
			nopts &= ~IP_FW_IPOPT_TS;
			break;
		}
		if (opts == nopts)
			break;
	}
	if (opts == 0 && nopts == nopts_sve)
		return 1;
	else
		return 0;
}

static int
iptos_match(struct ip *ip, struct ip_fw *f)
{

	u_int flags = (ip->ip_tos & 0x1f);
	u_char opts, nopts, nopts_sve;

	opts = (f->fw_iptos & 0x1f);
	nopts = nopts_sve = f->fw_ipntos;

	while (flags != 0) {
		u_int flag;

		flag = 1 << (ffs(flags) -1);
		opts &= ~flag;
		nopts &= ~flag;
		flags &= ~flag;
	}

	if (opts == 0 && nopts == nopts_sve)
		return 1;
	else
		return 0;

}


/*
 * Match the TCP options of a segment against rule 'f'.
 *
 * f->fw_tcpopt holds the options that must be present, f->fw_tcpnopt the
 * options that must be absent.  Each recognized option found in the
 * header (MSS, window scale, SACK/SACK-permitted, timestamp, CC family)
 * clears its bit in both working copies.  The segment matches when all
 * required bits were cleared and the "must be absent" set is untouched.
 *
 * The option area is bounded by th_off.  Parsing stops, without
 * accounting for the offending option, when an option is malformed:
 * no room left for its length byte, a length below 2 (kind + length),
 * or a length running past the end of the option area.  This keeps the
 * parser from reading beyond the TCP header.
 *
 * Returns 1 on match, 0 otherwise.
 */
static int
tcpopts_match(struct tcphdr *tcp, struct ip_fw *f)
{
	register u_char *cp;
	int opt, optlen, cnt;
	u_char	opts, nopts, nopts_sve;

	cp = (u_char *)(tcp + 1);
	cnt = (tcp->th_off << 2) - sizeof (struct tcphdr);
	opts = f->fw_tcpopt;
	nopts = nopts_sve = f->fw_tcpnopt;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			/* the length byte itself must lie inside the header */
			if (cnt < 2)
				break;
			optlen = cp[1];
			/* kind + length take 2 bytes; must fit in what is left */
			if (optlen < 2 || optlen > cnt)
				break;
		}

		switch (opt) {

		default:
			break;

		case TCPOPT_MAXSEG:
			opts &= ~IP_FW_TCPOPT_MSS;
			nopts &= ~IP_FW_TCPOPT_MSS;
			break;

		case TCPOPT_WINDOW:
			opts &= ~IP_FW_TCPOPT_WINDOW;
			nopts &= ~IP_FW_TCPOPT_WINDOW;
			break;

		case TCPOPT_SACK_PERMITTED:
		case TCPOPT_SACK:
			opts &= ~IP_FW_TCPOPT_SACK;
			nopts &= ~IP_FW_TCPOPT_SACK;
			break;

		case TCPOPT_TIMESTAMP:
			opts &= ~IP_FW_TCPOPT_TS;
			nopts &= ~IP_FW_TCPOPT_TS;
			break;

		case TCPOPT_CC:
		case TCPOPT_CCNEW:
		case TCPOPT_CCECHO:
			opts &= ~IP_FW_TCPOPT_CC;
			nopts &= ~IP_FW_TCPOPT_CC;
			break;
		}
		/* stop scanning as soon as both working sets coincide */
		if (opts == nopts)
			break;
	}
	if (opts == 0 && nopts == nopts_sve)
		return 1;
	else
		return 0;
}

/*
 * Match interface 'ifp' against the interface specification 'ifu'.
 *
 * byname != 0: compare the unit number (-1 in the rule is a wildcard)
 *		and the first FW_IFNLEN characters of the name.
 * byname == 0: compare by IPv4 address; a zero address in the rule is
 *		a wildcard, otherwise any AF_INET address configured on
 *		the interface may match.
 *
 * Returns 1 on match, 0 otherwise.
 */
static int
iface_match(struct ifnet *ifp, union ip_fw_if *ifu, int byname)
{
	struct ifaddr *ia;

	if (byname) {
		/* unit first, unless the rule holds the -1 wildcard */
		if (ifu->fu_via_if.unit != -1 &&
		    ifp->if_unit != ifu->fu_via_if.unit)
			return(0);
		/* then the name */
		if (strncmp(ifp->if_name, ifu->fu_via_if.name, FW_IFNLEN) != 0)
			return(0);
		return(1);
	}

	/* by address: zero means "any interface" */
	if (ifu->fu_via_ip.s_addr == 0)
		return(1);

	TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
		if (ia->ifa_addr != NULL &&
		    ia->ifa_addr->sa_family == AF_INET &&
		    ((struct sockaddr_in *)(ia->ifa_addr))->sin_addr.s_addr ==
		    ifu->fu_via_ip.s_addr)
			return(1);
	}
	return(0);
}

/*
 * Log one packet through log(9) at LOG_SECURITY | LOG_INFO.
 *
 *	f	matching rule, or NULL when the packet was refused without
 *		a rule (logged as rule -1, action "Refuse").
 *	ip	the IP header; the transport header is located via ip_hl.
 *	ip_off	fragment offset/flags word, used here without byte
 *		swapping (so it must already be in host order).
 *	ip_len	total length, used the same way.
 *	rif	receive interface, may be NULL.
 *	oif	output interface, may be NULL; takes precedence over rif.
 *
 * Rate limiting: with a rule, nothing is logged once f->fw_pcnt exceeds
 * f->fw_loghighest (when f->fw_logamount != 0); without a rule, the
 * file-global 'counter' is compared against fw_verbose_limit.  When the
 * limit is hit exactly, a final "limit reached" notice is emitted.
 *
 * Ports are printed only for unfragmented packets or first fragments
 * (offset == 0), since later fragments carry no transport header.
 *
 * Buffer sizes are the worst-case string lengths plus the NUL:
 *	action2: "Forward to 255.255.255.255:65535"			(32+1)
 *	proto:   "TCP 255.255.255.255:65535 255.255.255.255:65535"	(47+1)
 */
static void
ipfw_report(struct ip_fw *f, struct ip *ip, int ip_off, int ip_len,
	struct ifnet *rif, struct ifnet *oif)
{
    struct tcphdr *const tcp = (struct tcphdr *) ((u_int32_t *) ip+ ip->ip_hl);
    struct udphdr *const udp = (struct udphdr *) ((u_int32_t *) ip+ ip->ip_hl);
    struct icmp *const icmp = (struct icmp *) ((u_int32_t *) ip + ip->ip_hl);
    u_int64_t count;
    char *action;
    char action2[33], proto[48], name[18], fragment[27];
    int len;
    int offset = ip_off & IP_OFFMASK;

    /* per-rule packet count, or the global count for rule-less reports */
    count = f ? f->fw_pcnt : ++counter;
    if ((f == NULL && fw_verbose_limit != 0 && count > fw_verbose_limit) ||
	(f && f->fw_logamount != 0 && count > f->fw_loghighest))
	    return;

    /* Print command name */
    snprintf(SNPARGS(name, 0), "ipfw: %d", f ? f->fw_number : -1);

    /* actions with a numeric argument are formatted into action2 */
    action = action2;
    if (!f)
	    action = "Refuse";
    else {
	    switch (f->fw_flg & IP_FW_F_COMMAND) {
	    case IP_FW_F_DENY:
		    action = "Deny";
		    break;
	    case IP_FW_F_REJECT:
		    if (f->fw_reject_code == IP_FW_REJECT_RST)
			    action = "Reset";
		    else
			    action = "Unreach";
		    break;
	    case IP_FW_F_ACCEPT:
		    action = "Accept";
		    break;
	    case IP_FW_F_COUNT:
		    action = "Count";
		    break;
#ifdef IPDIVERT
	    case IP_FW_F_DIVERT:
		    snprintf(SNPARGS(action2, 0), "Divert %d",
			f->fw_divert_port);
		    break;
	    case IP_FW_F_TEE:
		    snprintf(SNPARGS(action2, 0), "Tee %d",
			f->fw_divert_port);
		    break;
#endif
	    case IP_FW_F_SKIPTO:
		    snprintf(SNPARGS(action2, 0), "SkipTo %d",
			f->fw_skipto_rule);
		    break;
	    case IP_FW_F_PIPE:
		    snprintf(SNPARGS(action2, 0), "Pipe %d",
			f->fw_skipto_rule);
		    break;
	    case IP_FW_F_QUEUE:
		    snprintf(SNPARGS(action2, 0), "Queue %d",
			f->fw_skipto_rule);
		    break;

	    case IP_FW_F_FWD:
		    if (f->fw_fwd_ip.sin_port)
			    snprintf(SNPARGS(action2, 0),
				"Forward to %s:%d",
				inet_ntoa(f->fw_fwd_ip.sin_addr),
				f->fw_fwd_ip.sin_port);
		    else
			    snprintf(SNPARGS(action2, 0), "Forward to %s",
				inet_ntoa(f->fw_fwd_ip.sin_addr));
		    break;

	    default:
		    action = "UNKNOWN";
		    break;
	    }
    }

    /*
     * Protocol, addresses and (when available) ports.  'len' tracks the
     * would-be length so far; SNPARGS clamps the remaining space to 0
     * should it ever exceed the buffer.
     */
    switch (ip->ip_p) {
    case IPPROTO_TCP:
	    len = snprintf(SNPARGS(proto, 0), "TCP %s",
		inet_ntoa(ip->ip_src));
	    if (offset == 0)
		    len += snprintf(SNPARGS(proto, len), ":%d ",
			ntohs(tcp->th_sport));
	    else
		    len += snprintf(SNPARGS(proto, len), " ");
	    len += snprintf(SNPARGS(proto, len), "%s",
		inet_ntoa(ip->ip_dst));
	    if (offset == 0)
		    snprintf(SNPARGS(proto, len), ":%d",
			ntohs(tcp->th_dport));
	    break;
    case IPPROTO_UDP:
	    len = snprintf(SNPARGS(proto, 0), "UDP %s",
		inet_ntoa(ip->ip_src));
	    if (offset == 0)
		    len += snprintf(SNPARGS(proto, len), ":%d ",
			ntohs(udp->uh_sport));
	    else
		    len += snprintf(SNPARGS(proto, len), " ");
	    len += snprintf(SNPARGS(proto, len), "%s",
		inet_ntoa(ip->ip_dst));
	    if (offset == 0)
		    snprintf(SNPARGS(proto, len), ":%d",
			ntohs(udp->uh_dport));
	    break;
    case IPPROTO_ICMP:
	    if (offset == 0)
		    len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ",
			icmp->icmp_type, icmp->icmp_code);
	    else
		    len = snprintf(SNPARGS(proto, 0), "ICMP ");
	    len += snprintf(SNPARGS(proto, len), "%s",
		inet_ntoa(ip->ip_src));
	    snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst));
	    break;
    default:
	    len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
		inet_ntoa(ip->ip_src));
	    snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst));
	    break;
    }

    /* fragment details: id, payload length, byte offset, "+" if more follow */
    if (ip_off & (IP_MF | IP_OFFMASK))
	    snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
		     ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
		     offset << 3,
		     (ip_off & IP_MF) ? "+" : "");
    else
	    fragment[0] = '\0';
    if (oif)
	    log(LOG_SECURITY | LOG_INFO, "%s %s %s out via %s%d%s\n",
		name, action, proto, oif->if_name, oif->if_unit, fragment);
    else if (rif)
	    log(LOG_SECURITY | LOG_INFO, "%s %s %s in via %s%d%s\n", name,
		action, proto, rif->if_name, rif->if_unit, fragment);
    else
	    log(LOG_SECURITY | LOG_INFO, "%s %s %s%s\n", name, action,
		proto, fragment);
    /* announce that this was the last entry that will be logged */
    if ((f ? f->fw_logamount != 0 : 1) &&
	count == (f ? f->fw_loghighest : fw_verbose_limit))
	    log(LOG_SECURITY | LOG_NOTICE,
		"ipfw: limit %d reached on entry %d\n",
		f ? f->fw_logamount : fw_verbose_limit,
		f ? f->fw_number : -1);
}

/*
 * Map a flow id to a bucket index of the dynamic-rule hash table.
 * Both addresses and both ports are XORed together, so the two
 * directions of a flow land in the same bucket.  The result is reduced
 * with (curr_dyn_buckets - 1), which is why the bucket count must be a
 * power of 2.
 */
static __inline int
hash_packet(struct ipfw_flow_id *id)
{
    u_int32_t bucket ;

    bucket = id->dst_ip ;
    bucket ^= id->src_ip ;
    bucket ^= id->dst_port ;
    bucket ^= id->src_port ;
    bucket &= (curr_dyn_buckets - 1) ;
    return bucket ;
}

/**
 * Unlink dynamic rule q from its hash chain and free it.
 *
 * prev is a pointer to the previous rule in the chain (NULL when q is
 * the first one) and q is the rule to delete.  On exit q has been
 * advanced to the rule that followed the deleted one, so a caller that
 * is walking the chain must not advance q again.
 *
 * Caveats visible in the expansion:
 * - the 'head' argument is not used; when prev is NULL the macro writes
 *   ipfw_dyn_v[i], so a variable 'i' holding the bucket index must be
 *   in scope at the point of use;
 * - for DYN_LIMIT rules the parent's count is decremented first;
 * - dyn_count is decremented and the memory is returned to M_IPFW.
 */
#define UNLINK_DYN_RULE(prev, head, q) {				\
	struct ipfw_dyn_rule *old_q = q;				\
									\
	/* remove a refcount to the parent */				\
	if (q->dyn_type == DYN_LIMIT)					\
		q->parent->count--;					\
	DEB(printf("-- unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",	\
		(q->id.src_ip), (q->id.src_port),			\
		(q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); )	\
	if (prev != NULL)						\
		prev->next = q = q->next ;				\
	else								\
		ipfw_dyn_v[i] = q = q->next ;				\
	dyn_count-- ;							\
	free(old_q, M_IPFW); }

/*
 * "a <= b" for time stamps, evaluated on the signed difference so that
 * the comparison keeps working when the counters wrap around.
 */
#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
/**
 * Remove all dynamic rules pointing to a given rule, or all
 * rules if rule == NULL. Second parameter is 1 if we want to
 * delete unconditionally, otherwise only expired rules are removed.
 *
 * Non-forced calls are rate limited to one table scan per second
 * (tracked in the static last_remove); forced calls always scan.
 * Nothing is done when the table does not exist or is empty.
 */
static void
remove_dyn_rule(struct ip_fw *rule, int force)
{
    struct ipfw_dyn_rule *prev, *q;
    int i, pass, max_pass ;
    static u_int32_t last_remove = 0 ;

    if (ipfw_dyn_v == NULL || dyn_count == 0)
	return ;
    /* do not expire more than once per second, it is useless */
    if (force == 0 && last_remove == time_second)
	return ;
    last_remove = time_second ;

    /*
     * because DYN_LIMIT refer to parent rules, during the first pass only
     * remove child and mark any pending LIMIT_PARENT, and remove
     * them in a second pass.
     */
  /* max_pass starts at 0 and is raised to 1 as soon as a parent is seen */
  for (pass = max_pass = 0; pass <= max_pass ; pass++ ) {
    for (i = 0 ; i < curr_dyn_buckets ; i++) {
	/* no increment clause: UNLINK_DYN_RULE advances q by itself */
	for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) {
	    /*
	     * logic can become complex here, so we split tests.
	     * First, test if we match any rule,
	     * then make sure the rule is expired or we want to kill it,
	     * and possibly more in the future.
	     */
	    int zap = ( rule == NULL || rule == q->rule);
	    if (zap)
		zap = force || TIME_LEQ( q->expire , time_second );
	    /* do not zap parent in first pass, record we need a second pass */
	    if (q->dyn_type == DYN_LIMIT_PARENT) {
		max_pass = 1; /* we need a second pass */
		/* a parent survives pass 0, and any pass while still referenced */
		if (zap == 1 && (pass == 0 || q->count != 0) ) {
		    zap = 0 ;
		    if (pass == 1) /* should not happen */
			printf("OUCH! cannot remove rule, count %d\n",
				q->count);
		}
	    }
	    if (zap) {
		/* uses 'i' for the bucket; leaves q on the next entry */
		UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
	    } else {
		prev = q ;
		q = q->next ;
	    }
	}
    }
  }
}

/*
 * Convenience wrappers around remove_dyn_rule():
 *   EXPIRE_* remove only entries whose lifetime has run out (and are
 *            subject to the once-per-second rate limit);
 *   DELETE_* remove entries unconditionally.
 * The *_CHAIN(rule) forms act on dynamic rules created by 'rule';
 * the *_CHAINS() forms act on the whole table.
 */
#define EXPIRE_DYN_CHAIN(rule) remove_dyn_rule(rule, 0 /* expired ones */)
#define EXPIRE_DYN_CHAINS() remove_dyn_rule(NULL, 0 /* expired ones */)
#define DELETE_DYN_CHAIN(rule) remove_dyn_rule(rule, 1 /* force removal */)
#define DELETE_DYN_CHAINS() remove_dyn_rule(NULL, 1 /* force removal */)

/**
 * lookup a dynamic rule.
 *
 * pkt is the flow id of the packet.  The bucket selected by
 * hash_packet() is scanned for an entry with the same protocol whose
 * addresses and ports match either as stored (forward) or swapped
 * (reverse).  DYN_LIMIT_PARENT entries are skipped; expired entries
 * met along the way are unlinked and freed.
 *
 * On a hit the entry is moved to the front of its bucket, its TCP
 * state and expiry time are refreshed, *match_direction (if not NULL)
 * is set to MATCH_FORWARD or 0, and the entry is returned.
 * Returns NULL when nothing matches or the table does not exist.
 */
static struct ipfw_dyn_rule *
lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction)
{
    /*
     * stateful ipfw extensions.
     * Lookup into dynamic session queue
     */
    struct ipfw_dyn_rule *prev, *q ;
    int i, dir = 0;
/* direction value for "packet goes the same way as the stored flow id";
 * the macro stays defined for the code that follows this function */
#define MATCH_FORWARD 1

    if (ipfw_dyn_v == NULL)
	return NULL ;
    i = hash_packet( pkt );
    for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) {
	if (q->dyn_type == DYN_LIMIT_PARENT)
	    goto next;
	if (TIME_LEQ( q->expire , time_second ) ) { /* expire entry */
	    /* the macro already moved q to the next entry */
	    UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
            continue;
	}
	if ( pkt->proto == q->id.proto) {
	    if (pkt->src_ip == q->id.src_ip &&
		    pkt->dst_ip == q->id.dst_ip &&
		    pkt->src_port == q->id.src_port &&
		    pkt->dst_port == q->id.dst_port ) {
		dir = MATCH_FORWARD ;
		goto found ;
	    }
	    if (pkt->src_ip == q->id.dst_ip &&
		    pkt->dst_ip == q->id.src_ip &&
		    pkt->src_port == q->id.dst_port &&
		    pkt->dst_port == q->id.src_port ) {
		dir = 0 ; /* reverse match */
		goto found ;
	    }
	}
next:
	prev = q ;
	q = q->next ;
    }
    return NULL ; /* clearly not found */
found:
    /* move-to-front, so busy flows are found quickly next time */
    if ( prev != NULL) { /* found and not in front */
	prev->next = q->next ;
	q->next = ipfw_dyn_v[i] ;
	ipfw_dyn_v[i] = q ;
    }
    if (pkt->proto == IPPROTO_TCP) {
	/* update state according to flags */
	u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
	/*
	 * q->state accumulates the FIN/SYN/RST flags seen so far:
	 * forward-direction flags as they are, reverse-direction
	 * flags shifted left by 8 bits.
	 */
	q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
	switch (q->state) {
	case TH_SYN :
	    /* opening */
	    q->expire = time_second + dyn_syn_lifetime ;
	    break ;
	case TH_SYN | (TH_SYN << 8) :
	    /* move to established */
	    q->expire = time_second + dyn_ack_lifetime ;
	    break ;
	case TH_SYN | (TH_SYN << 8) | TH_FIN :
	case TH_SYN | (TH_SYN << 8) | (TH_FIN << 8) :
	    /* one side tries to close */
	    q->expire = time_second + dyn_ack_lifetime ;
	    break ;
	case TH_SYN | (TH_SYN << 8) | TH_FIN | (TH_FIN << 8) :
	    /* both sides closed */
	    q->expire = time_second + dyn_fin_lifetime ;
	    break ;
	default:
#if 0
	    /*
	     * reset or some invalid combination, but can also
	     * occur if we use keep-state the wrong way.
	     */
	    if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
		printf("invalid state: 0x%x\n", q->state);
#endif
	    q->expire = time_second + dyn_rst_lifetime ;
	    break ;
	}
    } else if (pkt->proto == IPPROTO_UDP) {
	q->expire = time_second + dyn_udp_lifetime ;
    } else {
	/* other protocols */
	q->expire = time_second + dyn_short_lifetime ;
    }
    /* the direction is optional output */
    if (match_direction)
	*match_direction = dir ;
    return q ;
}

/**
 * Install state of type 'dyn_type' for a dynamic session.
 * The hash table contains three kinds of rules:
 * - regular rules (DYN_KEEP_STATE)
 * - rules for sessions with limited number of sess per user
 *   (DYN_LIMIT). When they are created, the parent is
 *   increased by 1, and decreased on delete. In this case,
 *   the third parameter is the parent rule and not the chain.
 * - "parent" rules for the above (DYN_LIMIT_PARENT).
 *
 * The hash table is allocated on first use, and resized to the value
 * of the dyn_buckets sysctl when the table is empty and that value has
 * changed.  A dyn_buckets value that is not a power of 2 is discarded
 * (reset to curr_dyn_buckets); if no table exists yet, one is still
 * allocated with the current, valid, size.  The new table is allocated
 * before the old one is released, so a failed allocation leaves the
 * existing table and curr_dyn_buckets untouched.
 *
 * The new entry is given the initial lifetime dyn_syn_lifetime and is
 * inserted at the head of its bucket.
 *
 * Returns the new entry, or NULL if memory could not be allocated.
 * Allocations use M_NOWAIT and therefore never sleep.
 */

static struct ipfw_dyn_rule *
add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
{
    struct ipfw_dyn_rule *r ;
    int i ;

    if (ipfw_dyn_v == NULL ||
		(dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) {
	u_int32_t nbuckets = dyn_buckets ;

	/* a power of 2 has exactly one bit set */
	if (nbuckets == 0 || (nbuckets & (nbuckets - 1)) != 0)
	    nbuckets = dyn_buckets = curr_dyn_buckets ; /* reset */

	/* (re)allocate if there is no table yet or the size changes */
	if (ipfw_dyn_v == NULL || nbuckets != curr_dyn_buckets) {
	    struct ipfw_dyn_rule **new_v ;

	    new_v = malloc(nbuckets * sizeof *new_v,
                   M_IPFW, M_NOWAIT | M_ZERO);
	    if (new_v == NULL)
		return NULL; /* failed ! */
	    if (ipfw_dyn_v != NULL)
		free(ipfw_dyn_v, M_IPFW);
	    ipfw_dyn_v = new_v ;
	    /* hash_packet() depends on this, update before hashing */
	    curr_dyn_buckets = nbuckets ;
	}
    }
    i = hash_packet(id);

    r = malloc(sizeof *r, M_IPFW, M_NOWAIT | M_ZERO);
    if (r == NULL) {
	printf ("sorry cannot allocate state\n");
	return NULL ;
    }

    /* increase refcount on parent, and set pointer */
    if (dyn_type == DYN_LIMIT) {
	/* for DYN_LIMIT the caller passes the parent entry as 'rule' */
	struct ipfw_dyn_rule *parent = (struct ipfw_dyn_rule *)rule;
	if ( parent->dyn_type != DYN_LIMIT_PARENT)
	    panic("invalid parent");
	parent->count++ ;
	r->parent = parent ;
	rule = parent->rule;
    }

    r->id = *id ;
    r->expire = time_second + dyn_syn_lifetime ;
    r->rule = rule ;
    r->dyn_type = dyn_type ;
    r->pcnt = r->bcnt = 0 ;
    r->count = 0 ;

    /* link at the head of the bucket */
    r->bucket = i ;
    r->next = ipfw_dyn_v[i] ;
    ipfw_dyn_v[i] = r ;
    dyn_count++ ;
    DEB(printf("-- add entry 0x%08x %d -> 0x%08x %d, total %d\n",
       (r->id.src_ip), (r->id.src_port),
       (r->id.dst_ip), (r->id.dst_port),
       dyn_count ); )
    return r;
}

/**
 * Find the DYN_LIMIT_PARENT entry that belongs to 'rule' and whose
 * flow id equals 'pkt' exactly (protocol, both addresses, both ports).
 * A hit gets its lifetime refreshed to dyn_short_lifetime.
 * When no such entry exists (or there is no table yet) a new parent is
 * created through add_dyn_rule(), whose result -- possibly NULL -- is
 * returned.
 */
static struct ipfw_dyn_rule *
lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
{
    struct ipfw_dyn_rule *cand = NULL;

    /* without a table the chain is simply empty */
    if (ipfw_dyn_v)
	cand = ipfw_dyn_v[hash_packet( pkt )];

    for ( ; cand != NULL ; cand = cand->next) {
	if (cand->dyn_type != DYN_LIMIT_PARENT || cand->rule != rule)
	    continue;
	if (cand->id.proto != pkt->proto)
	    continue;
	if (cand->id.src_ip != pkt->src_ip ||
		cand->id.dst_ip != pkt->dst_ip)
	    continue;
	if (cand->id.src_port != pkt->src_port ||
		cand->id.dst_port != pkt->dst_port)
	    continue;

	/* found it: keep it alive a little longer */
	cand->expire = time_second + dyn_short_lifetime ;
	DEB(printf("lookup_dyn_parent found 0x%p\n", cand);)
	return cand;
    }
    return add_dyn_rule(pkt, DYN_LIMIT_PARENT, rule);
}

/*
 * Install dynamic state.
 * There are different types of dynamic rules which can be installed.
 * The type is in rule->dyn_type.
 * Type 0 (default) is a bidirectional rule
 *
 * rule is the static rule that asked for state, args->f_id the flow id
 * of the packet being processed.
 *
 * Returns 0 if the state was installed (or was already present), and
 * 1 (failure) if state is not installed because of errors -- the table
 * is full, memory could not be allocated, the type is unknown -- or
 * because session limitations are enforced.
 *
 * Diagnostics for "already present" and "too many rules" are limited
 * to one message per second through the static last_log.
 */
static int
install_state(struct ip_fw *rule, struct ip_fw_args *args)
{
    struct ipfw_dyn_rule *q ;
    static int last_log ;

    u_int8_t type = rule->dyn_type ;

    DEB(printf("-- install state type %d 0x%08x %u -> 0x%08x %u\n",
       type,
       (args->f_id.src_ip), (args->f_id.src_port),
       (args->f_id.dst_ip), (args->f_id.dst_port) );)

    q = lookup_dyn_rule(&args->f_id, NULL) ;
    if (q != NULL) { /* should never occur */
	if (last_log != time_second) {
	    last_log = time_second ;
	    printf(" entry already present, done\n");
	}
	return 0 ;
    }
    if (dyn_count >= dyn_max) /* try remove old ones... */
	EXPIRE_DYN_CHAINS();
    if (dyn_count >= dyn_max) {
	if (last_log != time_second) {
	    last_log = time_second ;
	    printf(" Too many dynamic rules, sorry\n");
	}
	return 1; /* cannot install, notify caller */
    }

    switch (type) {
    case DYN_KEEP_STATE: /* bidir rule */
	if (add_dyn_rule(&args->f_id, DYN_KEEP_STATE, rule) == NULL)
	    return 1 ; /* no memory, state not installed */
	break ;
    case DYN_LIMIT: /* limit number of sessions */
	{
	u_int16_t limit_mask = rule->limit_mask ;
	u_int16_t conn_limit = rule->conn_limit ;
	struct ipfw_flow_id id;
	struct ipfw_dyn_rule *parent;

	DEB(printf("installing dyn-limit rule %d\n", conn_limit);)

	/*
	 * Build the flow id of the parent: only the fields selected by
	 * limit_mask are taken from the packet, the rest stay zero.
	 * flags is cleared too, because the whole structure is copied
	 * into the parent entry.
	 */
	id.dst_ip = id.src_ip = 0;
	id.dst_port = id.src_port = 0 ;
	id.proto = args->f_id.proto ;
	id.flags = 0 ;

	if (limit_mask & DYN_SRC_ADDR)
	    id.src_ip = args->f_id.src_ip;
	if (limit_mask & DYN_DST_ADDR)
	    id.dst_ip = args->f_id.dst_ip;
	if (limit_mask & DYN_SRC_PORT)
	    id.src_port = args->f_id.src_port;
	if (limit_mask & DYN_DST_PORT)
	    id.dst_port = args->f_id.dst_port;
	parent = lookup_dyn_parent(&id, rule);
	if (parent == NULL) {
	    printf("add parent failed\n");
	    return 1;
	}
	if (parent->count >= conn_limit) {
	    EXPIRE_DYN_CHAIN(rule); /* try to expire some */
	    if (parent->count >= conn_limit) {
		printf("drop session, too many entries\n");
		return 1;
	    }
	}
	/* for DYN_LIMIT, add_dyn_rule() expects the parent entry here */
	if (add_dyn_rule(&args->f_id, DYN_LIMIT,
		(struct ip_fw *)parent) == NULL)
	    return 1 ; /* no memory, state not installed */
	}
	break ;
    default:
	printf("unknown dynamic rule type %u\n", type);
	return 1 ;
    }
    lookup_dyn_rule(&args->f_id, NULL) ; /* XXX just set the lifetime */
    return 0;
}

/*
 * given an ip_fw *, lookup_next_rule will return a pointer
 * of the same type to the next one. This can be either the jump
 * target (for skipto instructions) or the next one in the list (in
 * all other cases including a missing jump target).
 * Backward jumps are not allowed, so start looking from the next
 * rule...
 */
static struct ip_fw * lookup_next_rule(struct ip_fw *me);

static struct ip_fw *
lookup_next_rule(struct ip_fw *me)
{
    struct ip_fw *rule ;
    int rulenum = me->fw_skipto_rule ; /* guess... */

    if ( (me->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_SKIPTO )
	for (rule = LIST_NEXT(me,next); rule ; rule = LIST_NEXT(rule,next))
	    if (rule->fw_number >= rulenum)
		return rule ;
    return LIST_NEXT(me,next) ; /* failure or not a skipto */
}

/*
 * Parameters:
 *
 *	*m	The packet; we set to NULL when/if we nuke it.
 *	oif	Outgoing interface, or NULL if packet is incoming
 *	*cookie Skip up to the first rule past this rule number;
 *		upon return, non-zero port number for divert or tee.
 *		Special case: cookie == NULL on input for bridging.
 *	*flow_id pointer to the last matching rule (in/out)
 *	*next_hop socket we are forwarding to (in/out).
 *		For bridged packets, this is a pointer to the MAC header.
 *
 * Return value:
 *
 *	IP_FW_PORT_DENY_FLAG	the packet must be dropped.
 *	0	The packet is to be accepted and routed normally OR
 *      	the packet was denied/rejected and has been dropped;
 *		in the latter case, *m is equal to NULL upon return.
 *	port	Divert the packet to port, with these caveats:
 *
 *		- If IP_FW_PORT_TEE_FLAG is set, tee the packet instead
 *		  of diverting it (ie, 'ipfw tee').
 *
 *		- If IP_FW_PORT_DYNT_FLAG is set, interpret the lower
 *		  16 bits as a dummynet pipe number instead of diverting
 */

static int
ip_fw_chk(struct ip_fw_args *args)
#if 0 /* the old interface was this: */
	struct mbuf **m, struct ifnet *oif, u_int16_t *cookie,
	struct ip_fw **flow_id, struct sockaddr_in **next_hop)
#endif
{
	/*
	 * Overview of the function body:
	 *  1. pull the interesting header fields into locals (and into
	 *     args->f_id, which the dynamic-rule code uses as flow key);
	 *  2. pick the rule to start from (packet re-injected with a rule
	 *     tag, divert cookie, or simply the head of the chain);
	 *  3. walk the chain until a rule matches;
	 *  4. perform the action of the matching rule;
	 *  5. for deny/reject, optionally send a reject notice and return
	 *     IP_FW_PORT_DENY_FLAG.
	 */

	/*
	 * grab things into variables to minimize diffs.
	 * XXX this has to be cleaned up later.
	 */
	struct mbuf **m = &(args->m);
	struct ifnet *oif = args->oif;
	u_int16_t *cookie = &(args->divert_rule);
	struct ip_fw **flow_id = &(args->rule);
	struct sockaddr_in **next_hop = &(args->next_hop);

	struct ip_fw *f = NULL;		/* matching rule */
	struct ip *ip = mtod(*m, struct ip *);
	struct ifnet *const rif = (*m)->m_pkthdr.rcvif;
	struct ifnet *tif;
	/*
	 * hlen == 0 is the marker for "not an IP packet" used throughout
	 * this function, so it must start at 0 and only be set once we
	 * know the payload really is IP (see the BRIDGED test below).
	 */
	u_int hlen = 0;
	struct ether_header * eh = NULL;

	u_short ip_off=0, offset = 0 ;
	/* local copy of addresses for faster matching */
	u_short src_port = 0, dst_port = 0;
	struct in_addr src_ip, dst_ip;
	u_int8_t proto= 0, flags = 0;

	u_int16_t skipto;
	u_int16_t ip_len=0;

	int dyn_checked = 0 ; /* set after dyn.rules have been checked. */
	int direction = MATCH_FORWARD ; /* dirty trick... */
	struct ipfw_dyn_rule *q = NULL ;

	/*
	 * Non-IP bridged frames never fill these in, yet they are copied
	 * into args->f_id below; start from a known value instead of
	 * whatever happens to be on the stack.
	 */
	src_ip.s_addr = dst_ip.s_addr = 0;

#define BRIDGED	(args->eh != NULL)
	if (BRIDGED) {	/* this is a bridged packet */
	    eh = args->eh;
	    /* only trust the IP header if the frame says it carries IP */
	    if ( (*m)->m_pkthdr.len >= sizeof(struct ip) &&
			ntohs(eh->ether_type) == ETHERTYPE_IP)
		hlen = ip->ip_hl << 2;
	} else
	    hlen = ip->ip_hl << 2;

	/* Grab and reset cookie */
	skipto = *cookie;
	*cookie = 0;

	/*
	 * Collect parameters into local variables for faster matching.
	 */
	if (hlen > 0) { /* this is an IP packet */
	    proto = ip->ip_p;
	    src_ip = ip->ip_src;
	    dst_ip = ip->ip_dst;
	    if (BRIDGED) { /* bridged packets are as on the wire */
		ip_off = ntohs(ip->ip_off);
		ip_len = ntohs(ip->ip_len);
	    } else {
		ip_off = ip->ip_off;
		ip_len = ip->ip_len;
	    }
	    offset = ip_off & IP_OFFMASK;
	    /* transport header is only present in the first fragment */
	    if (offset == 0) {

/*
 * Make sure the first 'len' bytes are contiguous in the first mbuf.
 * m_pullup() may replace the mbuf, so 'ip' is reloaded afterwards.
 */
#define PULLUP_TO(len)						\
		do {						\
			if ((*m)->m_len < (len)) {		\
			    *m = m_pullup(*m, (len));		\
			    if (*m == 0)			\
				goto bogusfrag;			\
			    ip = mtod(*m, struct ip *);		\
			}					\
		} while (0)

		switch (proto) {
		case IPPROTO_TCP : {
		    struct tcphdr *tcp;

		    PULLUP_TO(hlen + sizeof(struct tcphdr));
		    tcp =(struct tcphdr *)((u_int32_t *)ip + ip->ip_hl);
		    dst_port = tcp->th_dport ;
		    src_port = tcp->th_sport ;
		    flags = tcp->th_flags ;
		    }
		    break ;

		case IPPROTO_UDP : {
		    struct udphdr *udp;

		    PULLUP_TO(hlen + sizeof(struct udphdr));
		    udp =(struct udphdr *)((u_int32_t *)ip + ip->ip_hl);
		    dst_port = udp->uh_dport ;
		    src_port = udp->uh_sport ;
		    }
		    break;

		case IPPROTO_ICMP:
		    PULLUP_TO(hlen + 4);	/* type, code and checksum. */
		    flags = ((struct icmp *)
			    ((u_int32_t *)ip + ip->ip_hl))->icmp_type ;
		    break ;

		default :
		    break;
		}
#undef PULLUP_TO
	    }
	}
	/* flow id is kept in host byte order */
	args->f_id.src_ip = ntohl(src_ip.s_addr);
	args->f_id.dst_ip = ntohl(dst_ip.s_addr);
	args->f_id.proto = proto;
	args->f_id.src_port = ntohs(src_port);
	args->f_id.dst_port = ntohs(dst_port);
	args->f_id.flags = flags;

	if (*flow_id) {
	    /*
	     * Packet has already been tagged. Look for the next rule
	     * to restart processing.
	     */
	    if (fw_one_pass) /* just accept if fw_one_pass is set */
		return 0;

	    /* next_rule_ptr is a lazily filled cache, see flush_rule_ptrs() */
	    f = (*flow_id)->next_rule_ptr ;
	    if (f == NULL)
		f = (*flow_id)->next_rule_ptr = lookup_next_rule(*flow_id);
	    if (f == NULL)
		goto dropit;
	} else {
	    /*
	     * Go down the list, looking for enlightenment.
	     * If we've been asked to start at a given rule, do so.
	     */
	    f = LIST_FIRST(&ip_fw_chain_head);
	    if (skipto != 0) {
		if (skipto >= IPFW_DEFAULT_RULE)
		    goto dropit;
		while (f && f->fw_number <= skipto)
		    f = LIST_NEXT(f, next);
		if (f == NULL)
		    goto dropit;
	    }
	}

	for (; f; f = LIST_NEXT(f, next)) {
again:
		/* the default rule matches everything */
		if (f->fw_number == IPFW_DEFAULT_RULE)
		    goto got_match ;

		/* Check if rule only valid for bridged packets */
		if ((f->fw_flg & IP_FW_BRIDGED) != 0 && !(BRIDGED))
		    continue;

		if (oif) {
		    /* Check direction outbound */
		    if (!(f->fw_flg & IP_FW_F_OUT))
			continue;
		} else {
		    /* Check direction inbound */
		    if (!(f->fw_flg & IP_FW_F_IN))
			continue;
		}

		if (f->fw_flg & IP_FW_F_MAC) {
		    u_int32_t *want, *mask, *hdr;

		    if (eh == NULL) /* header not available */
			continue;

		    /*
		     * Compare the first 12 bytes of the MAC header
		     * (three 32-bit words) under the rule's mask.
		     */
		    want = (void *)&(f->fw_mac_hdr);
		    mask = (void *)&(f->fw_mac_mask);
		    hdr = (void *)eh;

		    if ( want[0] != (hdr[0] & mask[0]) )
			continue;
		    if ( want[1] != (hdr[1] & mask[1]) )
			continue;
		    if ( want[2] != (hdr[2] & mask[2]) )
			continue;
		    if (f->fw_flg & IP_FW_F_SRNG) {
			/* ethertype given as a [low, high] range */
			u_int16_t type = ntohs(eh->ether_type);
			if (type < (u_int16_t)(f->fw_mac_type) ||
				type > (u_int16_t)(f->fw_mac_mask_type) )
			    continue;
		    } else {
			/* ethertype given as value/mask */
			if ((u_int16_t)(f->fw_mac_type) != (eh->ether_type &
				(u_int16_t)(f->fw_mac_mask_type)) )
			    continue;
		    }
		}

		/* Interface check */
		if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) {
			struct ifnet *const iface = oif ? oif : rif;

			/* Backwards compatibility hack for "via" */
			if (!iface || !iface_match(iface,
			    &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME))
				continue;
		} else {
			/* Check receive interface */
			if ((f->fw_flg & IP_FW_F_IIFACE)
			    && (!rif || !iface_match(rif,
			      &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME)))
				continue;
			/* Check outgoing interface */
			if ((f->fw_flg & IP_FW_F_OIFACE)
			    && (!oif || !iface_match(oif,
			      &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME)))
				continue;
		}

		/*
		 * For packets which matched the MAC check, we do not need
		 * to continue, this is a valid match.
		 * For not-ip packets, the rule does not apply.
		 */
		if (f->fw_flg & IP_FW_F_MAC)
			goto rnd_then_got_match;

		if (hlen == 0)
			continue;

		/*
		 * dynamic rules are checked at the first keep-state or
		 * check-state occurrence.
		 */
		if (f->fw_flg & (IP_FW_F_KEEP_S|IP_FW_F_CHECK_S) &&
			 dyn_checked == 0 ) {
		    dyn_checked = 1 ;
		    q = lookup_dyn_rule(&args->f_id, &direction);
		    if (q != NULL) {
			DEB(printf("-- dynamic match 0x%08x %d %s 0x%08x %d\n",
			    (q->id.src_ip), (q->id.src_port),
			    (direction == MATCH_FORWARD ? "-->" : "<--"),
			    (q->id.dst_ip), (q->id.dst_port) ); )
			/* continue with the static rule that created q */
			f = q->rule ;
			q->pcnt++ ;
			q->bcnt += ip_len;
			goto got_match ; /* random not allowed here */
		    }
		    /* if this was a check-only rule, continue with next */
		    if (f->fw_flg & IP_FW_F_CHECK_S)
			continue ;
		}

		/* Fragments */
		if ((f->fw_flg & IP_FW_F_FRAG) && offset == 0 )
			continue;

		/*
		 * For matching addresses, tif != NULL means we matched
		 * the address we requested (either "me" or addr/mask).
		 * Then the check for "xxx" or "not xxx" can be done
		 * with an XOR.
		 */

		/* source address --	mandatory */
		if (f->fw_flg & IP_FW_F_SME) {
			INADDR_TO_IFP(src_ip, tif);
		} else
			(int)tif = f->fw_src.s_addr ==
			    (src_ip.s_addr & f->fw_smsk.s_addr);
		if ( ((f->fw_flg & IP_FW_F_INVSRC) != 0) ^ (tif == NULL) )
			continue;

		/* dst address --	mandatory */
		if (f->fw_flg & IP_FW_F_DME) {
			INADDR_TO_IFP(dst_ip, tif);
		} else
			(int)tif = f->fw_dst.s_addr ==
			    (dst_ip.s_addr & f->fw_dmsk.s_addr);
		if ( ((f->fw_flg & IP_FW_F_INVDST) != 0) ^ (tif == NULL) )
			continue;

		/* Check IP header values */
		if (f->fw_ipflg & IP_FW_IF_IPOPT && !ipopts_match(ip, f))
			continue;
		if (f->fw_ipflg & IP_FW_IF_IPLEN && f->fw_iplen != ip_len)
			continue;
		if (f->fw_ipflg & IP_FW_IF_IPID && f->fw_ipid != ntohs(ip->ip_id))
			continue;
		if (f->fw_ipflg & IP_FW_IF_IPPRE &&
		     (f->fw_iptos & 0xe0) != (ip->ip_tos & 0xe0))
			continue;
		if (f->fw_ipflg & IP_FW_IF_IPTOS && !iptos_match(ip, f))
			continue;
		if (f->fw_ipflg & IP_FW_IF_IPTTL && f->fw_ipttl != ip->ip_ttl)
			continue;
		if (f->fw_ipflg & IP_FW_IF_IPVER && f->fw_ipver != ip->ip_v)
			continue;

		/* Check protocol; if wildcard, and no [ug]id, match */
		if (f->fw_prot == IPPROTO_IP) {
			if (!(f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)))
				goto rnd_then_got_match;
		} else
		    /* If different, don't match */
		    if (proto != f->fw_prot)
			    continue;

		/* Protocol specific checks for uid only */
		if (f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)) {
		    switch (proto) {
		    case IPPROTO_TCP:
			{
			    struct inpcb *P;

			    if (offset == 1)	/* cf. RFC 1858 */
				    goto bogusfrag;
			    if (offset != 0)
				    continue;

			    /*
			     * Find the local socket: for outgoing packets
			     * the local end is the source, so the lookup
			     * arguments are swapped.
			     */
			    if (oif)
				P = in_pcblookup_hash(&tcbinfo, dst_ip,
				   dst_port, src_ip, src_port, 0,
				   oif);
			    else
				P = in_pcblookup_hash(&tcbinfo, src_ip,
				   src_port, dst_ip, dst_port, 0,
				   NULL);

			    if (P && P->inp_socket) {
				if (f->fw_flg & IP_FW_F_UID) {
					if (socheckuid(P->inp_socket, f->fw_uid))
						continue;
				} else if (!groupmember(f->fw_gid,
					    P->inp_socket->so_cred))
						continue;
			    } else
				continue;
			    break;
			}

		    case IPPROTO_UDP:
			{
			    struct inpcb *P;

			    if (offset != 0)
				continue;

			    if (oif)
				P = in_pcblookup_hash(&udbinfo, dst_ip,
				   dst_port, src_ip, src_port, 1,
				   oif);
			    else
				P = in_pcblookup_hash(&udbinfo, src_ip,
				   src_port, dst_ip, dst_port, 1,
				   NULL);

			    if (P && P->inp_socket) {
				if (f->fw_flg & IP_FW_F_UID) {
					if (socheckuid(P->inp_socket, f->fw_uid))
						continue;
				} else if (!groupmember(f->fw_gid,
					    P->inp_socket->so_cred))
						continue;
			    } else
				continue;
			    break;
			}

		    default:
			    continue;
		    }
		}

		/* Protocol specific checks */
		switch (proto) {
		case IPPROTO_TCP:
		    {
			struct tcphdr *tcp;

			if (offset == 1)	/* cf. RFC 1858 */
				goto bogusfrag;
			if (offset != 0) {
				/*
				 * TCP flags and ports aren't available in this
				 * packet -- if this rule specified either one,
				 * we consider the rule a non-match.
				 */
				if (IP_FW_HAVEPORTS(f) != 0 ||
				    f->fw_ipflg & IP_FW_IF_TCPMSK)
					continue;

				break;
			}
			tcp = (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl);

			if (f->fw_ipflg & IP_FW_IF_TCPOPT && !tcpopts_match(tcp, f))
				continue;
			if (((f->fw_ipflg & IP_FW_IF_TCPFLG) ||
			    (f->fw_ipflg & IP_FW_IF_TCPEST)) &&
			    !tcpflg_match(tcp, f))
				continue;
			if (f->fw_ipflg & IP_FW_IF_TCPSEQ && tcp->th_seq != f->fw_tcpseq)
				continue;
			if (f->fw_ipflg & IP_FW_IF_TCPACK && tcp->th_ack != f->fw_tcpack)
				continue;
			if (f->fw_ipflg & IP_FW_IF_TCPWIN && tcp->th_win != f->fw_tcpwin)
				continue;
			goto check_ports;
		    }

		case IPPROTO_UDP:
			if (offset != 0) {
				/*
				 * Port specification is unavailable -- if this
				 * rule specifies a port, we consider the rule
				 * a non-match.
				 */
				if (IP_FW_HAVEPORTS(f) )
					continue;

				break;
			}
check_ports:
			/* shared by TCP and UDP: source ports, then dest ports */
			if (!port_match(&f->fw_uar.fw_pts[0],
			    IP_FW_GETNSRCP(f), ntohs(src_port),
			    f->fw_flg & IP_FW_F_SRNG,
			    f->fw_flg & IP_FW_F_SMSK))
				continue;
			if (!port_match(&f->fw_uar.fw_pts[IP_FW_GETNSRCP(f)],
			    IP_FW_GETNDSTP(f), ntohs(dst_port),
			    f->fw_flg & IP_FW_F_DRNG,
			    f->fw_flg & IP_FW_F_DMSK))
				continue;
			break;

		case IPPROTO_ICMP:
		    {
			struct icmp *icmp;

			if (offset != 0)	/* Type isn't valid */
				break;
			icmp = (struct icmp *) ((u_int32_t *)ip + ip->ip_hl);
			if (!icmptype_match(icmp, f))
				continue;
			break;
		    }

		default:
			break;

		/*
		 * Only reachable through 'goto bogusfrag' (failed pullup
		 * or a TCP fragment with offset 1): log and drop.
		 */
bogusfrag:
		if (fw_verbose) {
			if (*m != NULL)
				ipfw_report(NULL, ip, ip_off, ip_len, rif, oif);
			else
				printf("pullup failed\n");
		}
		goto dropit;

		}

rnd_then_got_match:
		/* probabilistic match: skip the rule with the given probability */
		if ( f->dont_match_prob && random() < f->dont_match_prob )
			continue ;
got_match:
		/*
		 * If not a dynamic match (q == NULL) and keep-state, install
		 * a new dynamic entry.
		 */
		if (q == NULL && f->fw_flg & IP_FW_F_KEEP_S) {
		    if (install_state(f, args)) /* error or limit violation */
			goto dropit;
		}
		/* Update statistics */
		f->fw_pcnt += 1;
		f->fw_bcnt += ip_len;
		f->timestamp = time_second;

		/* Log to console if desired */
		if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose && hlen >0)
			ipfw_report(f, ip, ip_off, ip_len, rif, oif);

		/* Take appropriate action */
		switch (f->fw_flg & IP_FW_F_COMMAND) {
		case IP_FW_F_ACCEPT:
			return(0);
		case IP_FW_F_COUNT:
			continue;
#ifdef IPDIVERT
		case IP_FW_F_DIVERT:
			*cookie = f->fw_number;
			return(f->fw_divert_port);
		case IP_FW_F_TEE:
			*cookie = f->fw_number;
			return(f->fw_divert_port | IP_FW_PORT_TEE_FLAG);
#endif
		case IP_FW_F_SKIPTO: /* XXX check */
			if (f->next_rule_ptr == NULL)
			    f->next_rule_ptr = lookup_next_rule(f) ;
			f = f->next_rule_ptr;
			if (!f)
			    goto dropit;
			goto again ;

		case IP_FW_F_PIPE:
		case IP_FW_F_QUEUE:
			*flow_id = f; /* XXX set flow id */
			return(f->fw_pipe_nr | IP_FW_PORT_DYNT_FLAG);

		case IP_FW_F_FWD:
			/* Change the next-hop address for this packet.
			 * Initially we'll only worry about directly
			 * reachable next-hop's, but ultimately
			 * we will work out for next-hops that aren't
			 * direct the route we would take for it. We
			 * [cs]ould leave this latter problem to
			 * ip_output.c. We hope to high [name the abode of
			 * your favourite deity] that ip_output doesn't modify
			 * the new value of next_hop (which is dst there)
			 * XXX warning-- there is a dangerous reference here
			 * from next_hop to a field within the rule. If the
			 * rule is deleted, weird things might occur.
			 */
			if (next_hop != NULL /* Make sure, first... */
			    && (q == NULL || direction == MATCH_FORWARD) )
				*next_hop = &(f->fw_fwd_ip);
			return(0); /* Allow the packet */

		}

		/* Deny/reject this packet using this rule */
		break;
	}

	/* Rule IPFW_DEFAULT_RULE should always be there and match */
	KASSERT(f != NULL, ("ip_fw: no chain"));

	/*
	 * At this point, we're going to drop the packet.
	 * Send a reject notice if all of the following are true:
	 *
	 * - The packet matched a reject rule
	 * - The packet is not an ICMP packet, or is an ICMP query packet
	 * - The packet is not a multicast or broadcast packet
	 *
	 * NOTE(review): a MAC rule can get here with hlen == 0 (non-IP
	 * frame); the tests below still look at the IP header in that
	 * case -- confirm whether such rules may carry 'reject'.
	 */
	if ((f->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT
	    && (proto != IPPROTO_ICMP || is_icmp_query(ip))
	    && !((*m)->m_flags & (M_BCAST|M_MCAST))
	    && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
		/* Must convert to host order for icmp_error() etc. */
		if (BRIDGED) {
			ip->ip_len = ntohs(ip->ip_len);
			ip->ip_off = ntohs(ip->ip_off);
		}
		switch (f->fw_reject_code) {
		case IP_FW_REJECT_RST:
		    {
			/* XXX warning, this code writes into the mbuf */
			struct tcphdr *const tcp =
				(struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl);
			struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip;

			/* never answer a fragment or a RST with a RST */
			if (offset != 0 || (tcp->th_flags & TH_RST))
				break;
			ti.ti_i = *((struct ipovly *) ip);
			ti.ti_t = *tcp;
			bcopy(&ti, ip, sizeof(ti));
			tip->ti_seq = ntohl(tip->ti_seq);
			tip->ti_ack = ntohl(tip->ti_ack);
			tip->ti_len = ip_len - hlen - (tip->ti_off << 2);
			if (tcp->th_flags & TH_ACK) {
				tcp_respond(NULL, (void *)ip, tcp, *m,
				    (tcp_seq)0, tcp->th_ack, TH_RST);
			} else {
				if (tcp->th_flags & TH_SYN)
					tip->ti_len++;
				tcp_respond(NULL, (void *)ip, tcp, *m,
				    tip->ti_seq + tip->ti_len,
				    (tcp_seq)0, TH_RST|TH_ACK);
			}
			/* mbuf was handed to tcp_respond() */
			*m = NULL;
			break;
		    }
		default:	/* Send an ICMP unreachable using code */
			icmp_error(*m, ICMP_UNREACH,
			    f->fw_reject_code, 0L, 0);
			/* mbuf was handed to icmp_error() */
			*m = NULL;
			break;
		}
	}

dropit:
	/*
	 * Finally, drop the packet.
	 */
	return(IP_FW_PORT_DENY_FLAG);
#undef BRIDGED
}

/*
 * when a rule is added/deleted, zero the direct pointers within
 * all firewall rules. These will be reconstructed on the fly
 * as packets are matched.
 * Must be called at splimp().
 */
static void
flush_rule_ptrs()
{
    struct ip_fw *fcp ;

    LIST_FOREACH(fcp, &ip_fw_chain_head, next) {
	fcp->next_rule_ptr = NULL ;
    }
}

static int
add_entry(struct ip_fw_head *head, struct ip_fw *rule)
{
	struct ip_fw *ftmp, *fcp, *fcpl;
	u_short nbr = 0;
	int s;

	ftmp = malloc(sizeof *ftmp, M_IPFW, M_NOWAIT | M_ZERO);
	if (!ftmp)
		return (ENOSPC);
	bcopy(rule, ftmp, sizeof(*ftmp));

	ftmp->fw_in_if.fu_via_if.name[FW_IFNLEN - 1] = '\0';
	ftmp->fw_pcnt = 0L;
	ftmp->fw_bcnt = 0L;
	ftmp->next_rule_ptr = NULL ;
	ftmp->pipe_ptr = NULL ;

	s = splimp();

	if (LIST_FIRST(head) == 0) {
		LIST_INSERT_HEAD(head, ftmp, next);
		goto done;
        }

	/* If entry number is 0, find highest numbered rule and add 100 */
	if (ftmp->fw_number == 0) {
		LIST_FOREACH(fcp, head, next) {
			if (fcp->fw_number != IPFW_DEFAULT_RULE)
				nbr = fcp->fw_number;
			else
				break;
		}
		if (nbr < IPFW_DEFAULT_RULE - 100)
			nbr += 100;
		ftmp->fw_number = rule->fw_number = nbr;
	}

	/* Got a valid number; now insert it, keeping the list ordered */
	fcpl = NULL ;
	LIST_FOREACH(fcp, head, next) {
		if (fcp->fw_number > ftmp->fw_number) {
			if (fcpl) {
				LIST_INSERT_AFTER(fcpl, ftmp, next);
			} else {
				LIST_INSERT_HEAD(head, ftmp, next);
			}
			break;
		} else {
			fcpl = fcp;
		}
	}
	flush_rule_ptrs();
done:
	static_count++;
	splx(s);
	DEB(printf("++ installed rule %d, static count now %d\n",
		ftmp->fw_number, static_count);)
	return (0);
}

/**
 * Unlink one static rule from the chain and release it, together with
 * the dynamic rules that depend on it.  Cached rule pointers are
 * zeroed so that nothing keeps pointing at the freed entry.
 * @return the entry that followed fcp in the chain (possibly NULL).
 * Must be called at splimp() and with a non-null argument.
 */
static struct ip_fw *
free_chain(struct ip_fw *fcp)
{
    /* remember the successor before fcp is unlinked */
    struct ip_fw *successor = LIST_NEXT(fcp, next);

    DELETE_DYN_CHAIN(fcp);
    LIST_REMOVE(fcp, next);
    static_count--;
    if (DUMMYNET_LOADED) {
	/* let dummynet drop its references to this rule */
	ip_dn_ruledel_ptr(fcp);
    }
    flush_rule_ptrs(); /* more efficient to do outside the loop */
    free(fcp, M_IPFW);
    return (successor);
}

/**
 * Remove every rule carrying the given number from the chain.
 * The default rule cannot be removed.
 * @return 0 if at least one rule was removed, EINVAL otherwise.
 */
static int
del_entry(struct ip_fw_head *chainptr, u_short number)
{
    struct ip_fw *cur;
    int s;

    if (number == IPFW_DEFAULT_RULE)
	return (EINVAL);

    for (cur = LIST_FIRST(chainptr); cur != NULL;
	cur = LIST_NEXT(cur, next)) {
	if (cur->fw_number != number)
	    continue;

	/*
	 * Found the first one. The chain is sorted, so rules with the
	 * same number are adjacent: free them all in one go.
	 */
	s = splimp(); /* prevent access to rules while removing */
	while (cur != NULL && cur->fw_number == number)
	    cur = free_chain(cur);
	/* XXX could move flush_rule_ptrs() here */
	splx(s);
	return (0);
    }
    return (EINVAL);
}

/**
 * Reset some or all counters on firewall rules.
 * @arg frwl is null to clear all entries, or contains a specific
 * rule number.
 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
 */

static int
zero_entry(struct ip_fw *frwl, int log_only)
{
    struct ip_fw *rule;
    int s;
    u_short number = 0 ;
    char *msg ;

    if (frwl == 0) {
	s = splimp();
	LIST_FOREACH(rule, &ip_fw_chain_head, next) {
	    if (log_only == 0) {
		rule->fw_bcnt = rule->fw_pcnt = 0;
		rule->timestamp = 0;
	    }
	    rule->fw_loghighest = rule->fw_pcnt+rule->fw_logamount;
	}
	splx(s);
	msg = log_only ? "ipfw: All logging counts cleared.\n" :
			"ipfw: Accounting cleared.\n";
    } else {
	int cleared = 0;
	number = frwl->fw_number ;
	/*
	 * It is possible to insert multiple chain entries with the
	 * same number, so we don't stop after finding the first
	 * match if zeroing a specific entry.
	 */
	LIST_FOREACH(rule, &ip_fw_chain_head, next)
	    if (number == rule->fw_number) {
		s = splimp();
		while (rule && number == rule->fw_number) {
		    if (log_only == 0) {
			rule->fw_bcnt = rule->fw_pcnt = 0;
			rule->timestamp = 0;
		    }
		    rule->fw_loghighest = rule->fw_pcnt+ rule->fw_logamount;
		    rule = LIST_NEXT(rule, next);
		}
		splx(s);
		cleared = 1;
		break;
	    }
	if (!cleared)	/* we did not find any matching rules */
	    return (EINVAL);
	msg = log_only ? "ipfw: Entry %d logging count reset.\n" :
			"ipfw: Entry %d cleared.\n";
    }
    if (fw_verbose)
	log(LOG_SECURITY | LOG_NOTICE, msg, number);
    return (0);
}

/*
 * Sanity-check a rule before it is installed.
 * Returns 0 if the rule is acceptable, EINVAL otherwise; in the latter
 * case a description of the problem is handed to dprintf().
 * Rules with IP_FW_F_MAC set, and pure check-state rules, are accepted
 * as soon as the flag word has been validated.
 */
static int
check_ipfw_struct(struct ip_fw *frwl)
{
	const int nsrc = IP_FW_GETNSRCP(frwl);
	const int ndst = IP_FW_GETNDSTP(frwl);
	const int is_tcp = (frwl->fw_prot == IPPROTO_TCP);
	const int is_udp = (frwl->fw_prot == IPPROTO_UDP);

	/* Check for invalid flag bits */
	if ((frwl->fw_flg & ~IP_FW_F_MASK) != 0) {
		dprintf(("%s undefined flag bits set (flags=%x)\n",
		    err_prefix, frwl->fw_flg));
		return (EINVAL);
	}

	/* MAC address match, or bare check-state: nothing more to check */
	if ((frwl->fw_flg & IP_FW_F_MAC) || frwl->fw_flg == IP_FW_F_CHECK_S)
		return (0);

	/* Must apply to incoming or outgoing (or both) */
	if ((frwl->fw_flg & (IP_FW_F_IN | IP_FW_F_OUT)) == 0) {
		dprintf(("%s neither in nor out\n", err_prefix));
		return (EINVAL);
	}

	/* Empty interface name is no good */
	if (((frwl->fw_flg & IP_FW_F_IIFNAME) &&
	    frwl->fw_in_if.fu_via_if.name[0] == '\0') ||
	    ((frwl->fw_flg & IP_FW_F_OIFNAME) &&
	    frwl->fw_out_if.fu_via_if.name[0] == '\0')) {
		dprintf(("%s empty interface name\n", err_prefix));
		return (EINVAL);
	}

	/*
	 * Sanity check interface matching; the "via" backwards
	 * compatibility combination is exempt from this test.
	 */
	if ((frwl->fw_flg & IF_FW_F_VIAHACK) != IF_FW_F_VIAHACK &&
	    (frwl->fw_flg & IP_FW_F_IN) &&
	    (frwl->fw_flg & IP_FW_F_OIFACE)) {
		dprintf(("%s outgoing interface check on incoming\n",
		    err_prefix));
		return (EINVAL);
	}

	/* Sanity check port ranges: a range needs two endpoints */
	if ((frwl->fw_flg & IP_FW_F_SRNG) && nsrc < 2) {
		dprintf(("%s src range set but n_src_p=%d\n",
		    err_prefix, nsrc));
		return (EINVAL);
	}
	if ((frwl->fw_flg & IP_FW_F_DRNG) && ndst < 2) {
		dprintf(("%s dst range set but n_dst_p=%d\n",
		    err_prefix, ndst));
		return (EINVAL);
	}
	if (nsrc + ndst > IP_FW_MAX_PORTS) {
		dprintf(("%s too many ports (%d+%d)\n",
		    err_prefix, nsrc, ndst));
		return (EINVAL);
	}

	/*
	 *	Protocols other than TCP/UDP don't use port range
	 */
	if (!is_tcp && !is_udp && (nsrc || ndst)) {
		dprintf(("%s port(s) specified for non TCP/UDP rule\n",
		    err_prefix));
		return (EINVAL);
	}

	/*
	 *	An address with bits set outside its mask can never match.
	 *	Rather than modify the entry to make such entries work,
	 *	we reject this rule and require user level utilities
	 *	to enforce whatever policy they deem appropriate.
	 */
	if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) ||
	    (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) {
		dprintf(("%s rule never matches\n", err_prefix));
		return (EINVAL);
	}

	/* 'frag' cannot be combined with ports or TCP flags */
	if ((frwl->fw_flg & IP_FW_F_FRAG) && (is_udp || is_tcp)) {
		if (IP_FW_HAVEPORTS(frwl)) {
			dprintf(("%s cannot mix 'frag' and ports\n", err_prefix));
			return (EINVAL);
		}
		if (is_tcp && frwl->fw_tcpf != frwl->fw_tcpnf) {
			dprintf(("%s cannot mix 'frag' and TCP flags\n", err_prefix));
			return (EINVAL);
		}
	}

	/* uid/gid matching is restricted to TCP, UDP and the wildcard */
	if ((frwl->fw_flg & (IP_FW_F_UID | IP_FW_F_GID)) &&
	    !is_tcp && !is_udp && frwl->fw_prot != IPPROTO_IP) {
		dprintf(("%s cannot use uid/gid logic on non-TCP/UDP\n", err_prefix));
		return (EINVAL);
	}

	/* Check command specific stuff */
	switch (frwl->fw_flg & IP_FW_F_COMMAND) {
	case IP_FW_F_REJECT:
		/* codes below 0x100 are fine; above that only TCP RST is */
		if (frwl->fw_reject_code >= 0x100 &&
		    !(is_tcp && frwl->fw_reject_code == IP_FW_REJECT_RST)) {
			dprintf(("%s unknown reject code\n", err_prefix));
			return (EINVAL);
		}
		break;
#ifdef IPDIVERT
	case IP_FW_F_DIVERT:		/* Diverting to port zero is invalid */
	case IP_FW_F_TEE:
#endif
	case IP_FW_F_PIPE:              /* pipe 0 is invalid */
	case IP_FW_F_QUEUE:             /* queue 0 is invalid */
		if (frwl->fw_divert_port == 0) {
			dprintf(("%s 0 is an invalid argument\n", err_prefix));
			return (EINVAL);
		}
		break;
	case IP_FW_F_DENY:
	case IP_FW_F_ACCEPT:
	case IP_FW_F_COUNT:
	case IP_FW_F_SKIPTO:
	case IP_FW_F_FWD:
		break;
	default:
		dprintf(("%s invalid command\n", err_prefix));
		return (EINVAL);
	}

	return (0);
}

static int
ip_fw_ctl(struct sockopt *sopt)
{
	int error, s;
	size_t size;
	struct ip_fw *fcp;
	struct ip_fw frwl, *bp , *buf;

	/*
	 * Disallow modifications in really-really secure mode, but still allow
	 * the logging counters to be reset.
	 */
	if (sopt->sopt_name == IP_FW_ADD ||
	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
		if (error)
			return (error);
	}

	error = 0;

	switch (sopt->sopt_name) {
	case IP_FW_GET:
		/*
		 * pass up a copy of the current rules. Static rules
		 * come first (the last of which has number 65535),
		 * followed by a possibly empty list of dynamic rule.
		 * The last dynamic rule has NULL in the "next" field.
		 */
		s = splimp();
		/* size of static rules */
		size = static_count * sizeof(struct ip_fw) ;
		if (ipfw_dyn_v)		/* add size of dyn.rules */
		    size += (dyn_count * sizeof(struct ipfw_dyn_rule));

		/*
		 * XXX todo: if the user passes a short length to know how
		 * much room is needed, do not
		 * bother filling up the buffer, just jump to the
		 * sooptcopyout.
		 */
		buf = malloc(size, M_TEMP, M_WAITOK);
		if (buf == 0) {
		    splx(s);
		    error = ENOBUFS;
		    break;
		}

		bp = buf ;
		LIST_FOREACH(fcp, &ip_fw_chain_head, next) {
		    bcopy(fcp, bp, sizeof *fcp);
		    bp++;
		}
		if (ipfw_dyn_v) {
		    int i ;
		    struct ipfw_dyn_rule *p, *dst, *last = NULL ;

		    dst = (struct ipfw_dyn_rule *)bp ;
		    for (i = 0 ; i < curr_dyn_buckets ; i++ )
			for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next, dst++ ) {
			    bcopy(p, dst, sizeof *p);
                            (int)dst->rule = p->rule->fw_number ;
			    /*
			     * store a non-null value in "next". The userland
			     * code will interpret a NULL here as a marker
			     * for the last dynamic rule.
			     */
			    dst->next = dst ;
			    last = dst ;
			    if (TIME_LEQ(dst->expire, time_second) )
				dst->expire = 0 ;
			    else
				dst->expire -= time_second ;
			    }
		    if (last != NULL)
			last->next = NULL ;	/* mark last dynamic rule */
		}
		splx(s);

		error = sooptcopyout(sopt, buf, size);
		free(buf, M_TEMP);
		break;

	case IP_FW_FLUSH:
		/*
		 * Normally we cannot release the lock on each iteration.
		 * We could do it here only because we start from the head all
		 * the times so there is no risk of missing some entries.
		 * On the other hand, the risk is that we end up with
		 * a very inconsistent ruleset, so better keep the lock
		 * around the whole cycle.
		 *
		 * XXX this code can be improved by resetting the head of
		 * the list to point to the default rule, and then freeing
		 * the old list without the need for a lock.
		 */

		s = splimp();
		while ( (fcp = LIST_FIRST(&ip_fw_chain_head)) &&
			fcp->fw_number != IPFW_DEFAULT_RULE )
		    free_chain(fcp);
		splx(s);
		break;

	case IP_FW_ADD:
		error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl);
		if (error || (error = check_ipfw_struct(&frwl)))
			break;

		if (frwl.fw_number == IPFW_DEFAULT_RULE) {
			dprintf(("%s can't add rule %u\n", err_prefix,
				 (unsigned)IPFW_DEFAULT_RULE));
			error = EINVAL;
		} else {
			error = add_entry(&ip_fw_chain_head, &frwl);
			if (!error && sopt->sopt_dir == SOPT_GET)
				error = sooptcopyout(sopt, &frwl, sizeof frwl);
		}
		break;

	case IP_FW_DEL:
		error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl);
		if (error)
			break;

		if (frwl.fw_number == IPFW_DEFAULT_RULE) {
			dprintf(("%s can't delete rule %u\n", err_prefix,
				 (unsigned)IPFW_DEFAULT_RULE));
			error = EINVAL;
		} else {
			error = del_entry(&ip_fw_chain_head, frwl.fw_number);
		}
		break;

	case IP_FW_ZERO:
	case IP_FW_RESETLOG:
	    {
		int cmd = (sopt->sopt_name == IP_FW_RESETLOG );
		void *arg = NULL ;

		if (sopt->sopt_val != 0) {
		    error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl);
		    if (error)
			break;
		    arg = &frwl ;
		}
		error = zero_entry(arg, cmd);
	    }
	    break;

	default:
		printf("ip_fw_ctl invalid option %d\n", sopt->sopt_name);
		error = EINVAL ;
	}

	return (error);
}

/**
 * dummynet needs a reference to the default rule, because rules can
 * be deleted while packets hold a reference to them (e.g. to resume
 * processing at the next rule). When this happens, dummynet changes
 * the reference to the default rule. (It could probably just as well
 * be a NULL pointer, but this way we do not need to check for the
 * special case, and here we also have info on the default behaviour.)
 */
struct ip_fw *ip_fw_default_rule ;

/*
 * Bring up the firewall: hook the check and control entry points,
 * start with an empty chain, install the default rule (accept or deny
 * depending on IPFIREWALL_DEFAULT_TO_ACCEPT) and print a summary of
 * the compiled-in configuration.  Panics if the default rule cannot
 * be installed.
 */
void
ip_fw_init(void)
{
	struct ip_fw default_rule;

	ip_fw_chk_ptr = ip_fw_chk;
	ip_fw_ctl_ptr = ip_fw_ctl;
	LIST_INIT(&ip_fw_chain_head);

	/* Build the catch-all rule: any protocol, both directions. */
	bzero(&default_rule, sizeof default_rule);
	default_rule.fw_number = IPFW_DEFAULT_RULE;
	default_rule.fw_prot = IPPROTO_IP;
	default_rule.fw_flg = IP_FW_F_IN | IP_FW_F_OUT;
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	default_rule.fw_flg |= IP_FW_F_ACCEPT;
#else
	default_rule.fw_flg |= IP_FW_F_DENY;
#endif

	if (check_ipfw_struct(&default_rule) != 0)
		panic("ip_fw_init");
	if (add_entry(&ip_fw_chain_head, &default_rule))
		panic("ip_fw_init");

	/* the chain was empty, so the head is the rule just added */
	ip_fw_default_rule = LIST_FIRST(&ip_fw_chain_head);

	printf("IP packet filtering initialized, "
#ifdef IPDIVERT
		"divert enabled, "
#else
		"divert disabled, "
#endif
		"rule-based forwarding enabled, "
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
		"default to accept, ");
#else
		"default to deny, " );
#endif

#ifdef IPFIREWALL_VERBOSE
	if (fw_verbose_limit != 0)
		printf("logging limited to %d packets/entry by default\n",
		    fw_verbose_limit);
	else
		printf("unlimited logging\n");
#else
	printf("logging disabled\n");
#endif
}

/*
 * Module event handler.
 * MOD_LOAD initializes the firewall unless it is already loaded
 * (EEXIST).  MOD_UNLOAD is refused with EBUSY when the firewall is
 * compiled into the kernel; when built as a KLD it unhooks the entry
 * points and frees the whole chain.  Other events are ignored.
 */
static int
ipfw_modevent(module_t mod, int type, void *unused)
{
	int err = 0;
	int s;
#if defined(KLD_MODULE)
	struct ip_fw *rule;
#endif

	if (type == MOD_LOAD) {
		s = splimp();
		if (!(IPFW_LOADED)) {
			ip_fw_init();
			splx(s);
		} else {
			splx(s);
			printf("IP firewall already loaded\n");
			err = EEXIST;
		}
	} else if (type == MOD_UNLOAD) {
#if defined(KLD_MODULE)
		s = splimp();
		/* unhook first, then tear down every rule incl. the default */
		ip_fw_chk_ptr = NULL;
		ip_fw_ctl_ptr = NULL;
		for (rule = LIST_FIRST(&ip_fw_chain_head); rule != NULL;
		    rule = LIST_FIRST(&ip_fw_chain_head))
			free_chain(rule);
		splx(s);
		printf("IP firewall unloaded\n");
#else
		printf("ipfw statically compiled, cannot unload\n");
		err = EBUSY;
#endif
	}

	return (err);
}

/*
 * Module glue: the module is named "ipfw", its events are handled by
 * ipfw_modevent(), and no extra argument is passed to the handler.
 */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
/* Declared at SI_SUB_PSEUDO / SI_ORDER_ANY, module version 1. */
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);