2005-01-07 01:45:51 +00:00
|
|
|
/*-
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1988, 1990, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
|
|
|
|
*/
|
|
|
|
|
2007-10-07 20:44:24 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
1998-07-06 05:04:33 +00:00
|
|
|
#include "opt_ipfw.h"
|
1999-12-22 19:13:38 +00:00
|
|
|
#include "opt_ipsec.h"
|
2009-02-27 14:12:05 +00:00
|
|
|
#include "opt_route.h"
|
2003-04-12 06:11:46 +00:00
|
|
|
#include "opt_mbuf_stress_test.h"
|
This patch provides the back end support for equal-cost multi-path
(ECMP) for both IPv4 and IPv6. Previously, multipath route insertion
is disallowed. For example,
route add -net 192.103.54.0/24 10.9.44.1
route add -net 192.103.54.0/24 10.9.44.2
The second route insertion will trigger an error message of
"add net 192.103.54.0/24: gateway 10.2.5.2: route already in table"
Multiple default routes can also be inserted. Here is the netstat
output:
default 10.2.5.1 UGS 0 3074 bge0 =>
default 10.2.5.2 UGS 0 0 bge0
When multipath routes exist, the "route delete" command requires
a specific gateway to be specified or else an error message would
be displayed. For example,
route delete default
would fail and trigger the following error message:
"route: writing to routing socket: No such process"
"delete net default: not in table"
On the other hand,
route delete default 10.2.5.2
would be successful: "delete net default: gateway 10.2.5.2"
One does not have to specify a gateway if there is only a single
route for a particular destination.
I need to perform more testing on address aliases and multiple
interfaces that have the same IP prefixes. This patch as it
stands today is not yet ready for prime time. Therefore, the ECMP
code fragments are fully guarded by the RADIX_MPATH macro.
Include the "options RADIX_MPATH" in the kernel configuration
to enable this feature.
Reviewed by: robert, sam, gnn, julian, kmacy
2008-04-13 05:45:14 +00:00
|
|
|
#include "opt_mpath.h"
|
2009-02-03 11:00:43 +00:00
|
|
|
#include "opt_sctp.h"
|
1997-11-05 20:17:23 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
1994-05-25 09:21:21 +00:00
|
|
|
#include <sys/systm.h>
|
1998-12-21 21:36:40 +00:00
|
|
|
#include <sys/kernel.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/malloc.h>
|
|
|
|
#include <sys/mbuf.h>
|
2006-11-06 13:42:10 +00:00
|
|
|
#include <sys/priv.h>
|
2008-02-02 14:11:31 +00:00
|
|
|
#include <sys/proc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/protosw.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
2003-03-25 05:45:05 +00:00
|
|
|
#include <sys/sysctl.h>
|
2008-02-02 14:11:31 +00:00
|
|
|
#include <sys/ucred.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <net/if.h>
|
2009-08-14 23:44:59 +00:00
|
|
|
#include <net/if_llatbl.h>
|
2004-08-17 22:05:54 +00:00
|
|
|
#include <net/netisr.h>
|
2004-08-27 15:16:24 +00:00
|
|
|
#include <net/pfil.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <net/route.h>
|
2009-04-19 04:44:05 +00:00
|
|
|
#include <net/flowtable.h>
|
This patch provides the back end support for equal-cost multi-path
(ECMP) for both IPv4 and IPv6. Previously, multipath route insertion
is disallowed. For example,
route add -net 192.103.54.0/24 10.9.44.1
route add -net 192.103.54.0/24 10.9.44.2
The second route insertion will trigger an error message of
"add net 192.103.54.0/24: gateway 10.2.5.2: route already in table"
Multiple default routes can also be inserted. Here is the netstat
output:
default 10.2.5.1 UGS 0 3074 bge0 =>
default 10.2.5.2 UGS 0 0 bge0
When multipath routes exist, the "route delete" command requires
a specific gateway to be specified or else an error message would
be displayed. For example,
route delete default
would fail and trigger the following error message:
"route: writing to routing socket: No such process"
"delete net default: not in table"
On the other hand,
route delete default 10.2.5.2
would be successful: "delete net default: gateway 10.2.5.2"
One does not have to specify a gateway if there is only a single
route for a particular destination.
I need to perform more testing on address aliases and multiple
interfaces that have the same IP prefixes. This patch as it
stands today is not yet ready for prime time. Therefore, the ECMP
code fragments are fully guarded by the RADIX_MPATH macro.
Include the "options RADIX_MPATH" in the kernel configuration
to enable this feature.
Reviewed by: robert, sam, gnn, julian, kmacy
2008-04-13 05:45:14 +00:00
|
|
|
#ifdef RADIX_MPATH
|
|
|
|
#include <net/radix_mpath.h>
|
|
|
|
#endif
|
2008-12-02 21:37:28 +00:00
|
|
|
#include <net/vnet.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <netinet/in_systm.h>
|
|
|
|
#include <netinet/ip.h>
|
|
|
|
#include <netinet/in_pcb.h>
|
|
|
|
#include <netinet/in_var.h>
|
|
|
|
#include <netinet/ip_var.h>
|
2005-11-18 20:12:40 +00:00
|
|
|
#include <netinet/ip_options.h>
|
2009-02-03 11:00:43 +00:00
|
|
|
#ifdef SCTP
|
|
|
|
#include <netinet/sctp.h>
|
|
|
|
#include <netinet/sctp_crc32.h>
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2007-07-03 12:13:45 +00:00
|
|
|
#ifdef IPSEC
|
2007-07-01 11:41:27 +00:00
|
|
|
#include <netinet/ip_ipsec.h>
|
2002-10-16 02:25:05 +00:00
|
|
|
#include <netipsec/ipsec.h>
|
2007-07-03 12:13:45 +00:00
|
|
|
#endif /* IPSEC*/
|
2006-02-01 13:55:03 +00:00
|
|
|
|
|
|
|
#include <machine/in_cksum.h>
|
|
|
|
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
Remove (almost all) global variables that were used to hold
packet forwarding state ("annotations") during ip processing.
The code is considerably cleaner now.
The variables removed by this change are:
ip_divert_cookie used by divert sockets
ip_fw_fwd_addr used for transparent ip redirection
last_pkt used by dynamic pipes in dummynet
Removal of the first two has been done by carrying the annotations
into volatile structs prepended to the mbuf chains, and adding
appropriate code to add/remove annotations in the routines which
make use of them, i.e. ip_input(), ip_output(), tcp_input(),
bdg_forward(), ether_demux(), ether_output_frame(), div_output().
On passing, remove a bug in divert handling of fragmented packet.
Now it is the fragment at offset 0 which sets the divert status of
the whole packet, whereas formerly it was the last incoming fragment
to decide.
Removal of last_pkt required a change in the interface of ip_fw_chk()
and dummynet_io(). On passing, use the same mechanism for dummynet
annotations and for divert/forward annotations.
option IPFIREWALL_FORWARD is effectively useless, the code to
implement it is very small and is now in by default to avoid the
obfuscation of conditionally compiled code.
NOTES:
* there is at least one global variable left, sro_fwd, in ip_output().
I am not sure if/how this can be removed.
* I have deliberately avoided gratuitous style changes in this commit
to avoid cluttering the diffs. Minor style cleanup will likely be
necessary
* this commit only focused on the IP layer. I am sure there is a
number of global variables used in the TCP and maybe UDP stack.
* despite the number of files touched, there are absolutely no API's
or data structures changed by this commit (except the interfaces of
ip_fw_chk() and dummynet_io(), which are internal anyways), so
an MFC is quite safe and unintrusive (and desirable, given the
improved readability of the code).
MFC after: 10 days
2002-06-22 11:51:02 +00:00
|
|
|
/*
 * Debug helper: printf()s the string x, a space, the IPv4 address held in
 * a (read from a.s_addr and converted with ntohl()) as a dotted quad, and
 * then the string y.
 * Note: a is expanded four times without parentheses, and the expansion
 * already ends in ';'.
 */
#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
|
|
|
|
x, (ntohl(a.s_addr)>>24)&0xFF,\
|
|
|
|
(ntohl(a.s_addr)>>16)&0xFF,\
|
|
|
|
(ntohl(a.s_addr)>>8)&0xFF,\
|
|
|
|
(ntohl(a.s_addr))&0xFF, y);
|
1998-07-06 03:20:19 +00:00
|
|
|
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing them in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
VNET_DEFINE(u_short, ip_id);
|
1994-08-18 22:36:09 +00:00
|
|
|
|
2003-04-12 06:11:46 +00:00
|
|
|
/*
 * Only built with MBUF_STRESS_TEST: mbuf_frag_size starts at 0 and is
 * exported read-write (CTLFLAG_RW) through SYSCTL_INT under _net_inet_ip.
 */
#ifdef MBUF_STRESS_TEST
|
2003-03-25 05:45:05 +00:00
|
|
|
int mbuf_frag_size = 0;
|
|
|
|
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
|
|
|
|
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
|
|
|
|
#endif
|
|
|
|
|
1995-11-14 20:34:56 +00:00
|
|
|
static void ip_mloopback
|
2002-03-19 21:25:46 +00:00
|
|
|
(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
|
1997-02-10 11:45:37 +00:00
|
|
|
|
|
|
|
|
2009-03-04 03:45:34 +00:00
|
|
|
extern int in_mcast_loop;
|
1996-07-10 19:44:30 +00:00
|
|
|
extern struct protosw inetsw[];
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* IP output. The packet in mbuf chain m contains a skeletal IP
|
|
|
|
* header (with len, off, ttl, proto, tos, src, dst).
|
|
|
|
* The mbuf chain containing the packet will be freed.
|
|
|
|
* The mbuf opt, if present, will not be freed.
|
2003-11-03 18:03:05 +00:00
|
|
|
* In the IP forwarding case, the packet will arrive with options already
|
|
|
|
* inserted, so must have a NULL opt pointer.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
int
|
2007-05-10 15:58:48 +00:00
|
|
|
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
|
|
|
|
struct ip_moptions *imo, struct inpcb *inp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-08-07 18:16:59 +00:00
|
|
|
struct ip *ip;
|
Remove (almost all) global variables that were used to hold
packet forwarding state ("annotations") during ip processing.
The code is considerably cleaner now.
The variables removed by this change are:
ip_divert_cookie used by divert sockets
ip_fw_fwd_addr used for transparent ip redirection
last_pkt used by dynamic pipes in dummynet
Removal of the first two has been done by carrying the annotations
into volatile structs prepended to the mbuf chains, and adding
appropriate code to add/remove annotations in the routines which
make use of them, i.e. ip_input(), ip_output(), tcp_input(),
bdg_forward(), ether_demux(), ether_output_frame(), div_output().
On passing, remove a bug in divert handling of fragmented packet.
Now it is the fragment at offset 0 which sets the divert status of
the whole packet, whereas formerly it was the last incoming fragment
to decide.
Removal of last_pkt required a change in the interface of ip_fw_chk()
and dummynet_io(). On passing, use the same mechanism for dummynet
annotations and for divert/forward annotations.
option IPFIREWALL_FORWARD is effectively useless, the code to
implement it is very small and is now in by default to avoid the
obfuscation of conditionally compiled code.
NOTES:
* there is at least one global variable left, sro_fwd, in ip_output().
I am not sure if/how this can be removed.
* I have deliberately avoided gratuitous style changes in this commit
to avoid cluttering the diffs. Minor style cleanup will likely be
necessary
* this commit only focused on the IP layer. I am sure there is a
number of global variables used in the TCP and maybe UDP stack.
* despite the number of files touched, there are absolutely no API's
or data structures changed by this commit (except the interfaces of
ip_fw_chk() and dummynet_io(), which are internal anyways), so
an MFC is quite safe and unintrusive (and desirable, given the
improved readability of the code).
MFC after: 10 days
2002-06-22 11:51:02 +00:00
|
|
|
struct ifnet *ifp = NULL; /* keep compiler happy */
|
2004-02-25 19:55:29 +00:00
|
|
|
struct mbuf *m0;
|
1996-04-03 13:52:20 +00:00
|
|
|
int hlen = sizeof (struct ip);
|
2006-09-10 17:49:09 +00:00
|
|
|
int mtu;
|
2004-08-17 22:05:54 +00:00
|
|
|
int len, error = 0;
|
2009-04-19 04:44:05 +00:00
|
|
|
int nortfree = 0;
|
Remove (almost all) global variables that were used to hold
packet forwarding state ("annotations") during ip processing.
The code is considerably cleaner now.
The variables removed by this change are:
ip_divert_cookie used by divert sockets
ip_fw_fwd_addr used for transparent ip redirection
last_pkt used by dynamic pipes in dummynet
Removal of the first two has been done by carrying the annotations
into volatile structs prepended to the mbuf chains, and adding
appropriate code to add/remove annotations in the routines which
make use of them, i.e. ip_input(), ip_output(), tcp_input(),
bdg_forward(), ether_demux(), ether_output_frame(), div_output().
On passing, remove a bug in divert handling of fragmented packet.
Now it is the fragment at offset 0 which sets the divert status of
the whole packet, whereas formerly it was the last incoming fragment
to decide.
Removal of last_pkt required a change in the interface of ip_fw_chk()
and dummynet_io(). On passing, use the same mechanism for dummynet
annotations and for divert/forward annotations.
option IPFIREWALL_FORWARD is effectively useless, the code to
implement it is very small and is now in by default to avoid the
obfuscation of conditionally compiled code.
NOTES:
* there is at least one global variable left, sro_fwd, in ip_output().
I am not sure if/how this can be removed.
* I have deliberately avoided gratuitous style changes in this commit
to avoid cluttering the diffs. Minor style cleanup will likely be
necessary
* this commit only focused on the IP layer. I am sure there is a
number of global variables used in the TCP and maybe UDP stack.
* despite the number of files touched, there are absolutely no API's
or data structures changed by this commit (except the interfaces of
ip_fw_chk() and dummynet_io(), which are internal anyways), so
an MFC is quite safe and unintrusive (and desirable, given the
improved readability of the code).
MFC after: 10 days
2002-06-22 11:51:02 +00:00
|
|
|
struct sockaddr_in *dst = NULL; /* keep compiler happy */
|
2002-07-12 22:08:47 +00:00
|
|
|
struct in_ifaddr *ia = NULL;
|
2000-03-27 19:14:27 +00:00
|
|
|
int isbroadcast, sw_csum;
|
2002-03-22 16:45:54 +00:00
|
|
|
struct route iproute;
|
2004-08-17 22:05:54 +00:00
|
|
|
struct in_addr odst;
|
|
|
|
#ifdef IPFIREWALL_FORWARD
|
|
|
|
struct m_tag *fwd_tag = NULL;
|
2009-04-28 11:10:33 +00:00
|
|
|
#endif
|
|
|
|
#ifdef IPSEC
|
|
|
|
int no_route_but_check_spd = 0;
|
2004-08-17 22:05:54 +00:00
|
|
|
#endif
|
2003-04-08 14:25:47 +00:00
|
|
|
M_ASSERTPKTHDR(m);
|
2005-09-26 20:25:16 +00:00
|
|
|
|
2008-11-19 19:19:30 +00:00
|
|
|
if (inp != NULL) {
|
2008-04-19 14:35:17 +00:00
|
|
|
INP_LOCK_ASSERT(inp);
|
2009-05-21 09:45:47 +00:00
|
|
|
M_SETFIB(m, inp->inp_inc.inc_fibnum);
|
2009-04-10 06:16:14 +00:00
|
|
|
if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
|
|
|
|
m->m_pkthdr.flowid = inp->inp_flowid;
|
|
|
|
m->m_flags |= M_FLOWID;
|
|
|
|
}
|
2008-11-19 19:19:30 +00:00
|
|
|
}
|
2009-05-21 09:45:47 +00:00
|
|
|
|
|
|
|
if (ro == NULL) {
|
|
|
|
ro = &iproute;
|
|
|
|
bzero(ro, sizeof (*ro));
|
|
|
|
|
2009-06-12 20:46:36 +00:00
|
|
|
#ifdef FLOWTABLE
|
2009-05-21 09:45:47 +00:00
|
|
|
/*
|
|
|
|
* The flow table returns route entries valid for up to 30
|
|
|
|
* seconds; we rely on the remainder of ip_output() taking no
|
|
|
|
* longer than that long for the stability of ro_rt. The
|
|
|
|
* flow ID assignment must have happened before this point.
|
|
|
|
*/
|
2009-06-22 21:19:24 +00:00
|
|
|
if (flowtable_lookup(V_ip_ft, m, ro) == 0)
|
2009-04-19 04:44:05 +00:00
|
|
|
nortfree = 1;
|
2009-06-12 20:46:36 +00:00
|
|
|
#endif
|
2009-04-19 04:44:05 +00:00
|
|
|
}
|
2002-05-21 18:52:24 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
if (opt) {
|
2002-09-23 08:56:24 +00:00
|
|
|
len = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
m = ip_insertoptions(m, opt, &len);
|
2002-09-23 08:56:24 +00:00
|
|
|
if (len != 0)
|
2002-09-17 11:13:04 +00:00
|
|
|
hlen = len;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
ip = mtod(m, struct ip *);
|
2001-12-28 21:21:57 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2003-05-31 17:55:21 +00:00
|
|
|
* Fill in IP header. If we are not allowing fragmentation,
|
2004-01-08 11:13:40 +00:00
|
|
|
* then the ip_id field is meaningless, but we don't set it
|
|
|
|
* to zero. Doing so causes various problems when devices along
|
|
|
|
* the path (routers, load balancers, firewalls, etc.) illegally
|
|
|
|
* disable DF on our packet. Note that a 16-bit counter
|
2003-05-31 17:55:21 +00:00
|
|
|
* will wrap around in less than 10 seconds at 100 Mbit/s on a
|
|
|
|
* medium with MTU 1500. See Steven M. Bellovin, "A Technique
|
|
|
|
* for Counting NATted Hosts", Proc. IMW'02, available at
|
2006-06-29 13:38:36 +00:00
|
|
|
* <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
|
2002-10-20 22:52:07 +00:00
|
|
|
ip->ip_v = IPVERSION;
|
|
|
|
ip->ip_hl = hlen >> 2;
|
2004-08-14 15:32:40 +00:00
|
|
|
ip->ip_id = ip_newid();
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_localout);
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
2002-10-20 22:52:07 +00:00
|
|
|
hlen = ip->ip_hl << 2;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1996-04-18 15:49:06 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
dst = (struct sockaddr_in *)&ro->ro_dst;
|
2004-08-17 22:05:54 +00:00
|
|
|
again:
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* If there is a cached route,
|
|
|
|
* check that it is to the same destination
|
|
|
|
* and is still up. If not, free it and try again.
|
2002-01-21 20:04:22 +00:00
|
|
|
* The address family should also be checked in case of sharing the
|
|
|
|
* cache with IPv6.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
|
2002-01-21 20:04:22 +00:00
|
|
|
dst->sin_family != AF_INET ||
|
2004-08-17 22:05:54 +00:00
|
|
|
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
|
2009-08-14 23:44:59 +00:00
|
|
|
if (!nortfree) {
|
2009-04-19 04:44:05 +00:00
|
|
|
RTFREE(ro->ro_rt);
|
2009-08-14 23:44:59 +00:00
|
|
|
LLE_FREE(ro->ro_lle);
|
|
|
|
}
|
2006-09-10 17:49:09 +00:00
|
|
|
ro->ro_rt = (struct rtentry *)NULL;
|
2009-08-14 23:44:59 +00:00
|
|
|
ro->ro_lle = (struct llentry *)NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2004-08-17 22:05:54 +00:00
|
|
|
#ifdef IPFIREWALL_FORWARD
|
|
|
|
if (ro->ro_rt == NULL && fwd_tag == NULL) {
|
|
|
|
#else
|
2004-08-11 10:46:15 +00:00
|
|
|
if (ro->ro_rt == NULL) {
|
2004-08-17 22:05:54 +00:00
|
|
|
#endif
|
2002-01-21 20:04:22 +00:00
|
|
|
bzero(dst, sizeof(*dst));
|
1994-05-24 10:09:53 +00:00
|
|
|
dst->sin_family = AF_INET;
|
|
|
|
dst->sin_len = sizeof(*dst);
|
2004-08-17 22:05:54 +00:00
|
|
|
dst->sin_addr = ip->ip_dst;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
2007-03-01 13:29:30 +00:00
|
|
|
* If routing to interface only, short circuit routing lookup.
|
|
|
|
* The use of an all-ones broadcast address implies this; an
|
|
|
|
* interface is specified by the broadcast address of an interface,
|
|
|
|
* or the destination address of a ptp interface.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2007-03-01 13:29:30 +00:00
|
|
|
if (flags & IP_SENDONES) {
|
|
|
|
if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
|
|
|
|
(ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_noroute);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENETUNREACH;
|
|
|
|
goto bad;
|
|
|
|
}
|
2007-03-01 13:29:30 +00:00
|
|
|
ip->ip_dst.s_addr = INADDR_BROADCAST;
|
|
|
|
dst->sin_addr = ip->ip_dst;
|
1994-05-24 10:09:53 +00:00
|
|
|
ifp = ia->ia_ifp;
|
|
|
|
ip->ip_ttl = 1;
|
2007-03-01 13:29:30 +00:00
|
|
|
isbroadcast = 1;
|
|
|
|
} else if (flags & IP_ROUTETOIF) {
|
|
|
|
if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
|
|
|
|
(ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_noroute);
|
2006-09-06 17:12:10 +00:00
|
|
|
error = ENETUNREACH;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
ifp = ia->ia_ifp;
|
|
|
|
ip->ip_ttl = 1;
|
2007-03-01 13:29:30 +00:00
|
|
|
isbroadcast = in_broadcast(dst->sin_addr, ifp);
|
2001-07-17 18:47:48 +00:00
|
|
|
} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
|
2001-07-23 16:50:01 +00:00
|
|
|
imo != NULL && imo->imo_multicast_ifp != NULL) {
|
2001-07-17 18:47:48 +00:00
|
|
|
/*
|
2001-07-23 16:50:01 +00:00
|
|
|
* Bypass the normal routing lookup for multicast
|
|
|
|
* packets if the interface is specified.
|
2001-07-17 18:47:48 +00:00
|
|
|
*/
|
2001-07-23 16:50:01 +00:00
|
|
|
ifp = imo->imo_multicast_ifp;
|
|
|
|
IFP_TO_IA(ifp, ia);
|
|
|
|
isbroadcast = 0; /* fool gcc */
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
1994-12-13 23:08:12 +00:00
|
|
|
/*
|
2003-11-20 20:07:39 +00:00
|
|
|
* We want to do any cloning requested by the link layer,
|
|
|
|
* as this is probably required in all cases for correct
|
|
|
|
* operation (as it is for ARP).
|
1994-12-13 23:08:12 +00:00
|
|
|
*/
|
2004-08-11 10:46:15 +00:00
|
|
|
if (ro->ro_rt == NULL)
|
This patch provides the back end support for equal-cost multi-path
(ECMP) for both IPv4 and IPv6. Previously, multipath route insertion
is disallowed. For example,
route add -net 192.103.54.0/24 10.9.44.1
route add -net 192.103.54.0/24 10.9.44.2
The second route insertion will trigger an error message of
"add net 192.103.54.0/24: gateway 10.2.5.2: route already in table"
Multiple default routes can also be inserted. Here is the netstat
output:
default 10.2.5.1 UGS 0 3074 bge0 =>
default 10.2.5.2 UGS 0 0 bge0
When multipath routes exist, the "route delete" command requires
a specific gateway to be specified or else an error message would
be displayed. For example,
route delete default
would fail and trigger the following error message:
"route: writing to routing socket: No such process"
"delete net default: not in table"
On the other hand,
route delete default 10.2.5.2
would be successful: "delete net default: gateway 10.2.5.2"
One does not have to specify a gateway if there is only a single
route for a particular destination.
I need to perform more testing on address aliases and multiple
interfaces that have the same IP prefixes. This patch as it
stands today is not yet ready for prime time. Therefore, the ECMP
code fragments are fully guarded by the RADIX_MPATH macro.
Include the "options RADIX_MPATH" in the kernel configuration
to enable this feature.
Reviewed by: robert, sam, gnn, julian, kmacy
2008-04-13 05:45:14 +00:00
|
|
|
#ifdef RADIX_MPATH
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
rtalloc_mpath_fib(ro,
|
|
|
|
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
|
|
|
|
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
|
This patch provides the back end support for equal-cost multi-path
(ECMP) for both IPv4 and IPv6. Previously, multipath route insertion
was disallowed. For example,
route add -net 192.103.54.0/24 10.9.44.1
route add -net 192.103.54.0/24 10.9.44.2
The second route insertion will trigger an error message of
"add net 192.103.54.0/24: gateway 10.2.5.2: route already in table"
Multiple default routes can also be inserted. Here is the netstat
output:
default 10.2.5.1 UGS 0 3074 bge0 =>
default 10.2.5.2 UGS 0 0 bge0
When multipath routes exist, the "route delete" command requires
a specific gateway to be specified or else an error message would
be displayed. For example,
route delete default
would fail and trigger the following error message:
"route: writing to routing socket: No such process"
"delete net default: not in table"
On the other hand,
route delete default 10.2.5.2
would be successful: "delete net default: gateway 10.2.5.2"
One does not have to specify a gateway if there is only a single
route for a particular destination.
I need to perform more testings on address aliases and multiple
interfaces that have the same IP prefixes. This patch as it
stands today is not yet ready for prime time. Therefore, the ECMP
code fragments are fully guarded by the RADIX_MPATH macro.
Include the "options RADIX_MPATH" in the kernel configuration
to enable this feature.
Reviewed by: robert, sam, gnn, julian, kmacy
2008-04-13 05:45:14 +00:00
|
|
|
#else
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as an EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in the timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X], where for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. If nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
in_rtalloc_ign(ro, 0,
|
|
|
|
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
|
This patch provides the back end support for equal-cost multi-path
(ECMP) for both IPv4 and IPv6. Previously, multipath route insertion
was disallowed. For example,
route add -net 192.103.54.0/24 10.9.44.1
route add -net 192.103.54.0/24 10.9.44.2
The second route insertion will trigger an error message of
"add net 192.103.54.0/24: gateway 10.2.5.2: route already in table"
Multiple default routes can also be inserted. Here is the netstat
output:
default 10.2.5.1 UGS 0 3074 bge0 =>
default 10.2.5.2 UGS 0 0 bge0
When multipath routes exist, the "route delete" command requires
a specific gateway to be specified or else an error message would
be displayed. For example,
route delete default
would fail and trigger the following error message:
"route: writing to routing socket: No such process"
"delete net default: not in table"
On the other hand,
route delete default 10.2.5.2
would be successful: "delete net default: gateway 10.2.5.2"
One does not have to specify a gateway if there is only a single
route for a particular destination.
I need to perform more testings on address aliases and multiple
interfaces that have the same IP prefixes. This patch as it
stands today is not yet ready for prime time. Therefore, the ECMP
code fragments are fully guarded by the RADIX_MPATH macro.
Include the "options RADIX_MPATH" in the kernel configuration
to enable this feature.
Reviewed by: robert, sam, gnn, julian, kmacy
2008-04-13 05:45:14 +00:00
|
|
|
#endif
|
2004-08-11 10:46:15 +00:00
|
|
|
if (ro->ro_rt == NULL) {
|
2009-04-28 11:10:33 +00:00
|
|
|
#ifdef IPSEC
|
|
|
|
/*
|
|
|
|
* There is no route for this packet, but it is
|
|
|
|
* possible that a matching SPD entry exists.
|
|
|
|
*/
|
|
|
|
no_route_but_check_spd = 1;
|
|
|
|
mtu = 0; /* Silence GCC warning. */
|
|
|
|
goto sendit;
|
|
|
|
#endif
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_noroute);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = EHOSTUNREACH;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
ia = ifatoia(ro->ro_rt->rt_ifa);
|
2009-06-23 20:19:09 +00:00
|
|
|
ifa_ref(&ia->ia_ifa);
|
1994-05-24 10:09:53 +00:00
|
|
|
ifp = ro->ro_rt->rt_ifp;
|
2003-11-20 20:07:39 +00:00
|
|
|
ro->ro_rt->rt_rmx.rmx_pksent++;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
|
2001-07-19 07:10:30 +00:00
|
|
|
dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
|
1996-05-06 17:42:13 +00:00
|
|
|
if (ro->ro_rt->rt_flags & RTF_HOST)
|
2001-07-19 07:10:30 +00:00
|
|
|
isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
|
1996-05-06 17:42:13 +00:00
|
|
|
else
|
|
|
|
isbroadcast = in_broadcast(dst->sin_addr, ifp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2006-09-10 17:49:09 +00:00
|
|
|
/*
|
|
|
|
* Calculate MTU. If we have a route that is up, use that,
|
|
|
|
* otherwise use the interface's MTU.
|
|
|
|
*/
|
2006-09-11 19:56:10 +00:00
|
|
|
if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
|
2006-09-10 17:49:09 +00:00
|
|
|
/*
|
|
|
|
* This case can happen if the user changed the MTU
|
|
|
|
* of an interface after enabling IP on it. Because
|
|
|
|
* most netifs don't keep track of routes pointing to
|
|
|
|
* them, there is no way for one to update all its
|
|
|
|
* routes when the MTU is changed.
|
|
|
|
*/
|
|
|
|
if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
|
|
|
|
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
|
|
|
|
mtu = ro->ro_rt->rt_rmx.rmx_mtu;
|
|
|
|
} else {
|
|
|
|
mtu = ifp->if_mtu;
|
|
|
|
}
|
2004-08-17 22:05:54 +00:00
|
|
|
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
|
1994-05-24 10:09:53 +00:00
|
|
|
m->m_flags |= M_MCAST;
|
|
|
|
/*
|
|
|
|
* IP destination address is multicast. Make sure "dst"
|
|
|
|
* still points to the address in "ro". (It may have been
|
|
|
|
* changed to point to a gateway address, above.)
|
|
|
|
*/
|
|
|
|
dst = (struct sockaddr_in *)&ro->ro_dst;
|
|
|
|
/*
|
|
|
|
* See if the caller provided any multicast options
|
|
|
|
*/
|
|
|
|
if (imo != NULL) {
|
|
|
|
ip->ip_ttl = imo->imo_multicast_ttl;
|
1995-06-13 17:51:16 +00:00
|
|
|
if (imo->imo_multicast_vif != -1)
|
|
|
|
ip->ip_src.s_addr =
|
Massive cleanup of the ip_mroute code.
No functional changes, but:
+ the mrouting module now should behave the same as the compiled-in
version (it did not before, some of the rsvp code was not loaded
properly);
+ netinet/ip_mroute.c is now truly optional;
+ removed some redundant/unused code;
+ changed many instances of '0' to NULL and INADDR_ANY as appropriate;
+ removed several static variables to make the code more SMP-friendly;
+ fixed some minor bugs in the mrouting code (mostly, incorrect return
values from functions).
This commit is also a prerequisite to the addition of support for PIM,
which i would like to put in before DP2 (it does not change any of
the existing APIs, anyways).
Note, in the process we found out that some device drivers fail to
properly handle changes in IFF_ALLMULTI, leading to interesting
behaviour when a multicast router is started. This bug is not
corrected by this commit, and will be fixed with a separate commit.
Detailed changes:
--------------------
netinet/ip_mroute.c all the above.
conf/files make ip_mroute.c optional
net/route.c fix mrt_ioctl hook
netinet/ip_input.c fix ip_mforward hook, move rsvp_input() here
together with other rsvp code, and a couple
of indentation fixes.
netinet/ip_output.c fix ip_mforward and ip_mcast_src hooks
netinet/ip_var.h rsvp function hooks
netinet/raw_ip.c hooks for mrouting and rsvp functions, plus
interface cleanup.
netinet/ip_mroute.h remove an unused and optional field from a struct
Most of the code is from Pavlin Radoslavov and the XORP project
Reviewed by: sam
MFC after: 1 week
2002-11-15 22:53:53 +00:00
|
|
|
ip_mcast_src ?
|
|
|
|
ip_mcast_src(imo->imo_multicast_vif) :
|
|
|
|
INADDR_ANY;
|
1994-05-24 10:09:53 +00:00
|
|
|
} else
|
|
|
|
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
|
|
|
|
/*
|
|
|
|
* Confirm that the outgoing interface supports multicast.
|
|
|
|
*/
|
1995-06-13 17:51:16 +00:00
|
|
|
if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
|
|
|
|
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_noroute);
|
1995-06-13 17:51:16 +00:00
|
|
|
error = ENETUNREACH;
|
|
|
|
goto bad;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If source address not specified yet, use address
|
|
|
|
* of outgoing interface.
|
|
|
|
*/
|
|
|
|
if (ip->ip_src.s_addr == INADDR_ANY) {
|
2001-07-23 16:50:01 +00:00
|
|
|
/* Interface may have no addresses. */
|
|
|
|
if (ia != NULL)
|
|
|
|
ip->ip_src = IA_SIN(ia)->sin_addr;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2009-03-04 03:45:34 +00:00
|
|
|
if ((imo == NULL && in_mcast_loop) ||
|
|
|
|
(imo && imo->imo_multicast_loop)) {
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2009-03-04 03:45:34 +00:00
|
|
|
* Loop back multicast datagram if not expressly
|
|
|
|
* forbidden to do so, even if we are not a member
|
|
|
|
* of the group; ip_input() will filter it later,
|
|
|
|
* thus deferring a hash lookup and mutex acquisition
|
|
|
|
* at the expense of a cheap copy using m_copym().
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1997-05-06 21:22:04 +00:00
|
|
|
ip_mloopback(ifp, m, dst, hlen);
|
2009-03-04 03:45:34 +00:00
|
|
|
} else {
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* If we are acting as a multicast router, perform
|
|
|
|
* multicast forwarding as if the packet had just
|
|
|
|
* arrived on the interface to which we are about
|
|
|
|
* to send. The multicast forwarding function
|
|
|
|
* recursively calls this function, using the
|
|
|
|
* IP_FORWARDING flag to prevent infinite recursion.
|
|
|
|
*
|
|
|
|
* Multicasts that are looped back by ip_mloopback(),
|
|
|
|
* above, will be forwarded by the ip_input() routine,
|
|
|
|
* if necessary.
|
|
|
|
*/
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
|
1994-09-06 22:42:31 +00:00
|
|
|
/*
|
Massive cleanup of the ip_mroute code.
No functional changes, but:
+ the mrouting module now should behave the same as the compiled-in
version (it did not before, some of the rsvp code was not loaded
properly);
+ netinet/ip_mroute.c is now truly optional;
+ removed some redundant/unused code;
+ changed many instances of '0' to NULL and INADDR_ANY as appropriate;
+ removed several static variables to make the code more SMP-friendly;
+ fixed some minor bugs in the mrouting code (mostly, incorrect return
values from functions).
This commit is also a prerequisite to the addition of support for PIM,
which i would like to put in before DP2 (it does not change any of
the existing APIs, anyways).
Note, in the process we found out that some device drivers fail to
properly handle changes in IFF_ALLMULTI, leading to interesting
behaviour when a multicast router is started. This bug is not
corrected by this commit, and will be fixed with a separate commit.
Detailed changes:
--------------------
netinet/ip_mroute.c all the above.
conf/files make ip_mroute.c optional
net/route.c fix mrt_ioctl hook
netinet/ip_input.c fix ip_mforward hook, move rsvp_input() here
together with other rsvp code, and a couple
of indentation fixes.
netinet/ip_output.c fix ip_mforward and ip_mcast_src hooks
netinet/ip_var.h rsvp function hooks
netinet/raw_ip.c hooks for mrouting and rsvp functions, plus
interface cleanup.
netinet/ip_mroute.h remove an unused and optional field from a struct
Most of the code is from Pavlin Radoslavov and the XORP project
Reviewed by: sam
MFC after: 1 week
2002-11-15 22:53:53 +00:00
|
|
|
* If rsvp daemon is not running, do not
|
1994-09-06 22:42:31 +00:00
|
|
|
* set ip_moptions. This ensures that the packet
|
|
|
|
* is multicast and not just sent down one link
|
|
|
|
* as prescribed by rsvpd.
|
|
|
|
*/
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
if (!V_rsvp_on)
|
Massive cleanup of the ip_mroute code.
No functional changes, but:
+ the mrouting module now should behave the same as the compiled-in
version (it did not before, some of the rsvp code was not loaded
properly);
+ netinet/ip_mroute.c is now truly optional;
+ removed some redundant/unused code;
+ changed many instances of '0' to NULL and INADDR_ANY as appropriate;
+ removed several static variables to make the code more SMP-friendly;
+ fixed some minor bugs in the mrouting code (mostly, incorrect return
values from functions).
This commit is also a prerequisite to the addition of support for PIM,
which i would like to put in before DP2 (it does not change any of
the existing APIs, anyways).
Note, in the process we found out that some device drivers fail to
properly handle changes in IFF_ALLMULTI, leading to interesting
behaviour when a multicast router is started. This bug is not
corrected by this commit, and will be fixed with a separate commit.
Detailed changes:
--------------------
netinet/ip_mroute.c all the above.
conf/files make ip_mroute.c optional
net/route.c fix mrt_ioctl hook
netinet/ip_input.c fix ip_mforward hook, move rsvp_input() here
together with other rsvp code, and a couple
of indentation fixes.
netinet/ip_output.c fix ip_mforward and ip_mcast_src hooks
netinet/ip_var.h rsvp function hooks
netinet/raw_ip.c hooks for mrouting and rsvp functions, plus
interface cleanup.
netinet/ip_mroute.h remove an unused and optional field from a struct
Most of the code is from Pavlin Radoslavov and the XORP project
Reviewed by: sam
MFC after: 1 week
2002-11-15 22:53:53 +00:00
|
|
|
imo = NULL;
|
|
|
|
if (ip_mforward &&
|
|
|
|
ip_mforward(ip, ifp, m, imo) != 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
m_freem(m);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
1994-09-14 03:10:15 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Multicasts with a time-to-live of zero may be looped-
|
|
|
|
* back, above, but must not be transmitted on a network.
|
|
|
|
* Also, multicasts addressed to the loopback interface
|
|
|
|
* are not sent -- the above call to ip_mloopback() will
|
2009-03-04 03:45:34 +00:00
|
|
|
* loop back a copy. ip_input() will drop the copy if
|
|
|
|
* this host does not belong to the destination group on
|
|
|
|
* the loopback interface.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1995-04-26 18:10:58 +00:00
|
|
|
if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
|
1994-05-24 10:09:53 +00:00
|
|
|
m_freem(m);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
goto sendit;
|
|
|
|
}
|
2006-09-29 16:44:45 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
Remove (almost all) global variables that were used to hold
packet forwarding state ("annotations") during ip processing.
The code is considerably cleaner now.
The variables removed by this change are:
ip_divert_cookie used by divert sockets
ip_fw_fwd_addr used for transparent ip redirection
last_pkt used by dynamic pipes in dummynet
Removal of the first two has been done by carrying the annotations
into volatile structs prepended to the mbuf chains, and adding
appropriate code to add/remove annotations in the routines which
make use of them, i.e. ip_input(), ip_output(), tcp_input(),
bdg_forward(), ether_demux(), ether_output_frame(), div_output().
On passing, remove a bug in divert handling of fragmented packet.
Now it is the fragment at offset 0 which sets the divert status of
the whole packet, whereas formerly it was the last incoming fragment
to decide.
Removal of last_pkt required a change in the interface of ip_fw_chk()
and dummynet_io(). On passing, use the same mechanism for dummynet
annotations and for divert/forward annotations.
option IPFIREWALL_FORWARD is effectively useless, the code to
implement it is very small and is now in by default to avoid the
obfuscation of conditionally compiled code.
NOTES:
* there is at least one global variable left, sro_fwd, in ip_output().
I am not sure if/how this can be removed.
* I have deliberately avoided gratuitous style changes in this commit
to avoid cluttering the diffs. Minor style cleanup will likely be
necessary
* this commit only focused on the IP layer. I am sure there is a
number of global variables used in the TCP and maybe UDP stack.
* despite the number of files touched, there are absolutely no API's
or data structures changed by this commit (except the interfaces of
ip_fw_chk() and dummynet_io(), which are internal anyways), so
an MFC is quite safe and unintrusive (and desirable, given the
improved readability of the code).
MFC after: 10 days
2002-06-22 11:51:02 +00:00
|
|
|
* If the source address is not specified yet, use the address
|
2004-09-06 15:48:38 +00:00
|
|
|
* of the outgoing interface.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1998-07-06 03:20:19 +00:00
|
|
|
if (ip->ip_src.s_addr == INADDR_ANY) {
|
2001-07-23 16:50:01 +00:00
|
|
|
/* Interface may have no addresses. */
|
|
|
|
if (ia != NULL) {
|
|
|
|
ip->ip_src = IA_SIN(ia)->sin_addr;
|
|
|
|
}
|
1998-07-06 03:20:19 +00:00
|
|
|
}
|
2006-09-29 16:44:45 +00:00
|
|
|
|
1994-08-01 12:01:45 +00:00
|
|
|
/*
|
2004-08-22 16:42:28 +00:00
|
|
|
* Verify that we have any chance at all of being able to queue the
|
|
|
|
* packet or packet fragments, unless ALTQ is enabled on the given
|
|
|
|
* interface in which case packetdrop should be done by queueing.
|
1994-08-01 12:01:45 +00:00
|
|
|
*/
|
2004-08-22 16:42:28 +00:00
|
|
|
#ifdef ALTQ
|
|
|
|
if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
|
2006-09-10 17:49:09 +00:00
|
|
|
((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
|
2004-08-22 16:42:28 +00:00
|
|
|
ifp->if_snd.ifq_maxlen))
|
|
|
|
#else
|
2006-09-10 17:49:09 +00:00
|
|
|
if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
|
2004-08-22 16:42:28 +00:00
|
|
|
ifp->if_snd.ifq_maxlen)
|
|
|
|
#endif /* ALTQ */
|
|
|
|
{
|
|
|
|
error = ENOBUFS;
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_odropped);
|
2005-12-06 11:16:11 +00:00
|
|
|
ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
|
2004-08-22 16:42:28 +00:00
|
|
|
goto bad;
|
1994-08-01 12:01:45 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Look for broadcast address and
|
2002-01-21 13:59:42 +00:00
|
|
|
* verify user is allowed to send
|
1994-05-24 10:09:53 +00:00
|
|
|
* such a packet.
|
|
|
|
*/
|
1996-05-06 17:42:13 +00:00
|
|
|
if (isbroadcast) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
|
|
|
|
error = EADDRNOTAVAIL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
if ((flags & IP_ALLOWBROADCAST) == 0) {
|
|
|
|
error = EACCES;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
/* don't allow broadcast messages to be fragmented */
|
2006-09-10 17:49:09 +00:00
|
|
|
if (ip->ip_len > mtu) {
|
1994-05-24 10:09:53 +00:00
|
|
|
error = EMSGSIZE;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
m->m_flags |= M_BCAST;
|
1996-05-06 17:42:13 +00:00
|
|
|
} else {
|
1994-05-24 10:09:53 +00:00
|
|
|
m->m_flags &= ~M_BCAST;
|
1996-05-06 17:42:13 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1997-02-19 14:02:27 +00:00
|
|
|
sendit:
|
2007-07-03 12:13:45 +00:00
|
|
|
#ifdef IPSEC
|
2009-06-05 23:53:17 +00:00
|
|
|
switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) {
|
2006-02-01 13:55:03 +00:00
|
|
|
case 1:
|
2001-06-11 12:39:29 +00:00
|
|
|
goto bad;
|
2006-02-01 13:55:03 +00:00
|
|
|
case -1:
|
|
|
|
goto done;
|
|
|
|
case 0:
|
2001-06-11 12:39:29 +00:00
|
|
|
default:
|
2006-02-01 13:55:03 +00:00
|
|
|
break; /* Continue with packet processing. */
|
2001-06-11 12:39:29 +00:00
|
|
|
}
|
2009-04-28 11:10:33 +00:00
|
|
|
/*
|
|
|
|
* Check if there was a route for this packet; return error if not.
|
|
|
|
*/
|
|
|
|
if (no_route_but_check_spd) {
|
|
|
|
IPSTAT_INC(ips_noroute);
|
|
|
|
error = EHOSTUNREACH;
|
|
|
|
goto bad;
|
|
|
|
}
|
2006-02-01 13:55:03 +00:00
|
|
|
/* Update variables that are affected by ipsec4_output(). */
|
2001-06-11 12:39:29 +00:00
|
|
|
ip = mtod(m, struct ip *);
|
|
|
|
hlen = ip->ip_hl << 2;
|
2007-07-03 12:13:45 +00:00
|
|
|
#endif /* IPSEC */
|
2001-06-11 12:39:29 +00:00
|
|
|
|
2004-08-27 15:16:24 +00:00
|
|
|
/* Jump over all PFIL processing if hooks are not active. */
|
2006-02-02 03:13:16 +00:00
|
|
|
if (!PFIL_HOOKED(&inet_pfil_hook))
|
2004-08-27 15:16:24 +00:00
|
|
|
goto passout;
|
|
|
|
|
|
|
|
/* Run through list of hooks for output packets. */
|
2004-08-17 22:05:54 +00:00
|
|
|
odst.s_addr = ip->ip_dst.s_addr;
|
2004-09-29 04:54:33 +00:00
|
|
|
error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
|
2003-09-23 17:54:04 +00:00
|
|
|
if (error != 0 || m == NULL)
|
|
|
|
goto done;
|
1996-08-21 21:37:07 +00:00
|
|
|
|
2004-08-17 22:05:54 +00:00
|
|
|
ip = mtod(m, struct ip *);
|
1998-12-14 18:09:13 +00:00
|
|
|
|
2004-08-17 22:05:54 +00:00
|
|
|
/* See if destination IP address was changed by packet filter. */
|
|
|
|
if (odst.s_addr != ip->ip_dst.s_addr) {
|
|
|
|
m->m_flags |= M_SKIP_FIREWALL;
|
2004-09-13 17:09:06 +00:00
|
|
|
/* If destination is now ourself drop to ip_input(). */
|
2004-08-17 22:05:54 +00:00
|
|
|
if (in_localip(ip->ip_dst)) {
|
|
|
|
m->m_flags |= M_FASTFWD_OURS;
|
|
|
|
if (m->m_pkthdr.rcvif == NULL)
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
m->m_pkthdr.rcvif = V_loif;
|
2004-08-17 22:05:54 +00:00
|
|
|
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
|
|
|
|
m->m_pkthdr.csum_flags |=
|
|
|
|
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
|
|
|
|
m->m_pkthdr.csum_data = 0xffff;
|
|
|
|
}
|
|
|
|
m->m_pkthdr.csum_flags |=
|
|
|
|
CSUM_IP_CHECKED | CSUM_IP_VALID;
|
2009-02-03 11:00:43 +00:00
|
|
|
#ifdef SCTP
|
|
|
|
if (m->m_pkthdr.csum_flags & CSUM_SCTP)
|
|
|
|
m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
|
|
|
|
#endif
|
2004-08-17 22:05:54 +00:00
|
|
|
error = netisr_queue(NETISR_IP, m);
|
2001-12-14 19:34:11 +00:00
|
|
|
goto done;
|
2004-08-17 22:05:54 +00:00
|
|
|
} else
|
2004-09-13 17:09:06 +00:00
|
|
|
goto again; /* Redo the routing table lookup. */
|
2004-08-17 22:05:54 +00:00
|
|
|
}
|
Remove (almost all) global variables that were used to hold
packet forwarding state ("annotations") during ip processing.
The code is considerably cleaner now.
The variables removed by this change are:
ip_divert_cookie used by divert sockets
ip_fw_fwd_addr used for transparent ip redirection
last_pkt used by dynamic pipes in dummynet
Removal of the first two has been done by carrying the annotations
into volatile structs prepended to the mbuf chains, and adding
appropriate code to add/remove annotations in the routines which
make use of them, i.e. ip_input(), ip_output(), tcp_input(),
bdg_forward(), ether_demux(), ether_output_frame(), div_output().
On passing, remove a bug in divert handling of fragmented packet.
Now it is the fragment at offset 0 which sets the divert status of
the whole packet, whereas formerly it was the last incoming fragment
to decide.
Removal of last_pkt required a change in the interface of ip_fw_chk()
and dummynet_io(). On passing, use the same mechanism for dummynet
annotations and for divert/forward annotations.
option IPFIREWALL_FORWARD is effectively useless, the code to
implement it is very small and is now in by default to avoid the
obfuscation of conditionally compiled code.
NOTES:
* there is at least one global variable left, sro_fwd, in ip_output().
I am not sure if/how this can be removed.
* I have deliberately avoided gratuitous style changes in this commit
to avoid cluttering the diffs. Minor style cleanup will likely be
necessary
* this commit only focused on the IP layer. I am sure there is a
number of global variables used in the TCP and maybe UDP stack.
* despite the number of files touched, there are absolutely no API's
or data structures changed by this commit (except the interfaces of
ip_fw_chk() and dummynet_io(), which are internal anyways), so
an MFC is quite safe and unintrusive (and desirable, given the
improved readability of the code).
MFC after: 10 days
2002-06-22 11:51:02 +00:00
|
|
|
|
2004-08-17 22:05:54 +00:00
|
|
|
#ifdef IPFIREWALL_FORWARD
|
|
|
|
/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
|
|
|
|
if (m->m_flags & M_FASTFWD_OURS) {
|
|
|
|
if (m->m_pkthdr.rcvif == NULL)
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
m->m_pkthdr.rcvif = V_loif;
|
2004-08-17 22:05:54 +00:00
|
|
|
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
|
|
|
|
m->m_pkthdr.csum_flags |=
|
|
|
|
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
|
|
|
|
m->m_pkthdr.csum_data = 0xffff;
|
1997-06-02 05:02:37 +00:00
|
|
|
}
|
2009-02-03 11:00:43 +00:00
|
|
|
#ifdef SCTP
|
|
|
|
if (m->m_pkthdr.csum_flags & CSUM_SCTP)
|
|
|
|
m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
|
|
|
|
#endif
|
2004-08-17 22:05:54 +00:00
|
|
|
m->m_pkthdr.csum_flags |=
|
|
|
|
CSUM_IP_CHECKED | CSUM_IP_VALID;
|
1999-12-06 00:43:07 +00:00
|
|
|
|
2004-08-17 22:05:54 +00:00
|
|
|
error = netisr_queue(NETISR_IP, m);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
/* Or forward to some other address? */
|
|
|
|
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
|
|
|
|
if (fwd_tag) {
|
2006-08-17 00:37:03 +00:00
|
|
|
dst = (struct sockaddr_in *)&ro->ro_dst;
|
|
|
|
bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
|
|
|
|
m->m_flags |= M_SKIP_FIREWALL;
|
|
|
|
m_tag_delete(m, fwd_tag);
|
|
|
|
goto again;
|
2005-02-22 17:40:40 +00:00
|
|
|
}
|
|
|
|
#endif /* IPFIREWALL_FORWARD */
|
1998-12-14 18:09:13 +00:00
|
|
|
|
2004-08-27 15:16:24 +00:00
|
|
|
passout:
|
2002-02-15 12:19:03 +00:00
|
|
|
/* 127/8 must not appear on wire - RFC1122. */
|
|
|
|
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
|
|
|
|
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
|
|
|
|
if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_badaddr);
|
2002-02-15 12:19:03 +00:00
|
|
|
error = EADDRNOTAVAIL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
RFC768 (UDP) requires that "if the computed checksum is zero, it
is transmitted as all ones". This got broken after introduction
of delayed checksums as follows. Some guys (including Jonathan)
think that it is allowed to transmit all ones in place of a zero
checksum for TCP the same way as for UDP. (The discussion still
takes place on -net.) Thus, the 0 -> 0xffff checksum fixup was
first moved from udp_output() (see udp_usrreq.c, 1.64 -> 1.65)
to in_cksum_skip() (see sys/i386/i386/in_cksum.c, 1.17 -> 1.18,
INVERT expression). Besides that I disagree that it is valid for
TCP, there was no real problem until in_cksum.c,v 1.20, where the
in_cksum() was made just a special version of in_cksum_skip().
The side effect was that now every incoming IP datagram failed to
pass the checksum test (in_cksum() returned 0xffff when it should
actually return zero). It was fixed next day in revision 1.21,
by removing the INVERT expression. The latter also broke the
0 -> 0xffff fixup for UDP checksums.
Before this change:
: tcpdump: listening on lo0
: 127.0.0.1.33005 > 127.0.0.1.33006: udp 0 (ttl 64, id 1)
: 4500 001c 0001 0000 4011 7cce 7f00 0001
: 7f00 0001 80ed 80ee 0008 0000
After this change:
: tcpdump: listening on lo0
: 127.0.0.1.33005 > 127.0.0.1.33006: udp 0 (ttl 64, id 1)
: 4500 001c 0001 0000 4011 7cce 7f00 0001
: 7f00 0001 80ed 80ee 0008 ffff
2001-03-13 17:07:06 +00:00
|
|
|
m->m_pkthdr.csum_flags |= CSUM_IP;
|
|
|
|
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
|
2000-03-27 19:14:27 +00:00
|
|
|
if (sw_csum & CSUM_DELAY_DATA) {
|
|
|
|
in_delayed_cksum(m);
|
|
|
|
sw_csum &= ~CSUM_DELAY_DATA;
|
|
|
|
}
|
2009-02-03 11:00:43 +00:00
|
|
|
#ifdef SCTP
|
|
|
|
if (sw_csum & CSUM_SCTP) {
|
|
|
|
sctp_delayed_cksum(m);
|
|
|
|
sw_csum &= ~CSUM_SCTP;
|
|
|
|
}
|
|
|
|
#endif
|
RFC768 (UDP) requires that "if the computed checksum is zero, it
is transmitted as all ones". This got broken after introduction
of delayed checksums as follows. Some guys (including Jonathan)
think that it is allowed to transmit all ones in place of a zero
checksum for TCP the same way as for UDP. (The discussion still
takes place on -net.) Thus, the 0 -> 0xffff checksum fixup was
first moved from udp_output() (see udp_usrreq.c, 1.64 -> 1.65)
to in_cksum_skip() (see sys/i386/i386/in_cksum.c, 1.17 -> 1.18,
INVERT expression). Besides that I disagree that it is valid for
TCP, there was no real problem until in_cksum.c,v 1.20, where the
in_cksum() was made just a special version of in_cksum_skip().
The side effect was that now every incoming IP datagram failed to
pass the checksum test (in_cksum() returned 0xffff when it should
actually return zero). It was fixed next day in revision 1.21,
by removing the INVERT expression. The latter also broke the
0 -> 0xffff fixup for UDP checksums.
Before this change:
: tcpdump: listening on lo0
: 127.0.0.1.33005 > 127.0.0.1.33006: udp 0 (ttl 64, id 1)
: 4500 001c 0001 0000 4011 7cce 7f00 0001
: 7f00 0001 80ed 80ee 0008 0000
After this change:
: tcpdump: listening on lo0
: 127.0.0.1.33005 > 127.0.0.1.33006: udp 0 (ttl 64, id 1)
: 4500 001c 0001 0000 4011 7cce 7f00 0001
: 7f00 0001 80ed 80ee 0008 ffff
2001-03-13 17:07:06 +00:00
|
|
|
m->m_pkthdr.csum_flags &= ifp->if_hwassist;
|
2000-03-27 19:14:27 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2000-03-27 19:14:27 +00:00
|
|
|
* If small enough for interface, or the interface will take
|
2006-09-06 21:51:59 +00:00
|
|
|
* care of the fragmentation for us, we can just send directly.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2006-09-10 17:49:09 +00:00
|
|
|
if (ip->ip_len <= mtu ||
|
2006-09-06 21:51:59 +00:00
|
|
|
(m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
|
|
|
|
((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
|
2002-02-18 20:35:27 +00:00
|
|
|
ip->ip_len = htons(ip->ip_len);
|
|
|
|
ip->ip_off = htons(ip->ip_off);
|
1994-05-24 10:09:53 +00:00
|
|
|
ip->ip_sum = 0;
|
2002-10-20 22:52:07 +00:00
|
|
|
if (sw_csum & CSUM_DELAY_IP)
|
|
|
|
ip->ip_sum = in_cksum(m, hlen);
|
2000-10-19 23:15:54 +00:00
|
|
|
|
2006-09-06 21:51:59 +00:00
|
|
|
/*
|
|
|
|
* Record statistics for this interface address.
|
|
|
|
* With CSUM_TSO the byte/packet count will be slightly
|
|
|
|
* incorrect because we count the IP+TCP headers only
|
|
|
|
* once instead of for every generated packet.
|
|
|
|
*/
|
2001-07-23 16:50:01 +00:00
|
|
|
if (!(flags & IP_FORWARDING) && ia) {
|
2006-09-06 21:51:59 +00:00
|
|
|
if (m->m_pkthdr.csum_flags & CSUM_TSO)
|
2006-12-10 13:44:00 +00:00
|
|
|
ia->ia_ifa.if_opackets +=
|
2006-09-06 21:51:59 +00:00
|
|
|
m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
|
|
|
|
else
|
2006-12-10 13:44:00 +00:00
|
|
|
ia->ia_ifa.if_opackets++;
|
|
|
|
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
|
2000-10-19 23:15:54 +00:00
|
|
|
}
|
2003-04-12 06:11:46 +00:00
|
|
|
#ifdef MBUF_STRESS_TEST
|
2003-09-01 05:55:37 +00:00
|
|
|
if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
|
|
|
|
m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
|
2003-03-25 05:45:05 +00:00
|
|
|
#endif
|
2005-11-18 16:23:26 +00:00
|
|
|
/*
|
|
|
|
* Reset layer specific mbuf flags
|
|
|
|
* to avoid confusing lower layers.
|
|
|
|
*/
|
|
|
|
m->m_flags &= ~(M_PROTOFLAGS);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = (*ifp->if_output)(ifp, m,
|
2009-04-16 20:30:28 +00:00
|
|
|
(struct sockaddr *)dst, ro);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto done;
|
|
|
|
}
|
2003-08-07 18:16:59 +00:00
|
|
|
|
2006-09-06 21:51:59 +00:00
|
|
|
/* Balk when DF bit is set or the interface didn't support TSO. */
|
|
|
|
if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
|
1994-05-24 10:09:53 +00:00
|
|
|
error = EMSGSIZE;
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_cantfrag);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
|
|
|
}
|
2003-08-07 18:16:59 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Too large for interface; fragment if possible. If successful,
|
|
|
|
* on return, m will point to a list of packets to be sent.
|
|
|
|
*/
|
2006-09-10 17:49:09 +00:00
|
|
|
error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
|
2003-08-07 18:16:59 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2003-08-07 18:16:59 +00:00
|
|
|
for (; m; m = m0) {
|
|
|
|
m0 = m->m_nextpkt;
|
|
|
|
m->m_nextpkt = 0;
|
|
|
|
if (error == 0) {
|
|
|
|
/* Record statistics for this interface address. */
|
|
|
|
if (ia != NULL) {
|
2006-12-10 13:44:00 +00:00
|
|
|
ia->ia_ifa.if_opackets++;
|
|
|
|
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
|
2003-08-07 18:16:59 +00:00
|
|
|
}
|
2005-11-18 16:23:26 +00:00
|
|
|
/*
|
|
|
|
* Reset layer specific mbuf flags
|
|
|
|
* to avoid confusing upper layers.
|
|
|
|
*/
|
|
|
|
m->m_flags &= ~(M_PROTOFLAGS);
|
|
|
|
|
2003-08-07 18:16:59 +00:00
|
|
|
error = (*ifp->if_output)(ifp, m,
|
2009-04-16 20:30:28 +00:00
|
|
|
(struct sockaddr *)dst, ro);
|
2003-08-07 18:16:59 +00:00
|
|
|
} else
|
|
|
|
m_freem(m);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2003-08-07 18:16:59 +00:00
|
|
|
if (error == 0)
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_fragmented);
|
2003-08-07 18:16:59 +00:00
|
|
|
|
|
|
|
done:
|
2009-04-19 04:44:05 +00:00
|
|
|
if (ro == &iproute && ro->ro_rt && !nortfree) {
|
2003-08-07 18:16:59 +00:00
|
|
|
RTFREE(ro->ro_rt);
|
2004-02-25 19:55:29 +00:00
|
|
|
}
|
2009-06-23 20:19:09 +00:00
|
|
|
if (ia != NULL)
|
|
|
|
ifa_free(&ia->ia_ifa);
|
2003-08-07 18:16:59 +00:00
|
|
|
return (error);
|
|
|
|
bad:
|
|
|
|
m_freem(m);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a chain of fragments which fit the given mtu. m_frag points to the
|
|
|
|
* mbuf to be fragmented; on return it points to the chain with the fragments.
|
|
|
|
* Return 0 if no error. If error, m_frag may contain a partially built
|
|
|
|
* chain of fragments that should be freed by the caller.
|
|
|
|
*
|
|
|
|
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
|
|
|
|
* sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
|
2007-05-10 15:58:48 +00:00
|
|
|
u_long if_hwassist_flags, int sw_csum)
|
2003-08-07 18:16:59 +00:00
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
int hlen = ip->ip_hl << 2;
|
|
|
|
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
|
|
|
|
int off;
|
|
|
|
struct mbuf *m0 = *m_frag; /* the original packet */
|
|
|
|
int firstlen;
|
|
|
|
struct mbuf **mnext;
|
|
|
|
int nfrags;
|
|
|
|
|
|
|
|
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_cantfrag);
|
2003-08-07 18:16:59 +00:00
|
|
|
return EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Must be able to put at least 8 bytes per fragment.
|
|
|
|
*/
|
|
|
|
if (len < 8)
|
|
|
|
return EMSGSIZE;
|
|
|
|
|
2000-03-27 19:14:27 +00:00
|
|
|
/*
|
2003-08-07 18:16:59 +00:00
|
|
|
* If the interface will not calculate checksums on
|
2000-03-27 19:14:27 +00:00
|
|
|
* fragmented packets, then do it here.
|
|
|
|
*/
|
2003-08-07 18:16:59 +00:00
|
|
|
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
|
|
|
|
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
|
|
|
|
in_delayed_cksum(m0);
|
|
|
|
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
|
2000-03-27 19:14:27 +00:00
|
|
|
}
|
2009-02-03 11:00:43 +00:00
|
|
|
#ifdef SCTP
|
|
|
|
if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
|
|
|
|
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
|
|
|
|
sctp_delayed_cksum(m0);
|
|
|
|
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
|
|
|
|
}
|
|
|
|
#endif
|
At long last, commit the zero copy sockets code.
MAKEDEV: Add MAKEDEV glue for the ti(4) device nodes.
ti.4: Update the ti(4) man page to include information on the
TI_JUMBO_HDRSPLIT and TI_PRIVATE_JUMBOS kernel options,
and also include information about the new character
device interface and the associated ioctls.
man9/Makefile: Add jumbo.9 and zero_copy.9 man pages and associated
links.
jumbo.9: New man page describing the jumbo buffer allocator
interface and operation.
zero_copy.9: New man page describing the general characteristics of
the zero copy send and receive code, and what an
application author should do to take advantage of the
zero copy functionality.
NOTES: Add entries for ZERO_COPY_SOCKETS, TI_PRIVATE_JUMBOS,
TI_JUMBO_HDRSPLIT, MSIZE, and MCLSHIFT.
conf/files: Add uipc_jumbo.c and uipc_cow.c.
conf/options: Add the 5 options mentioned above.
kern_subr.c: Receive side zero copy implementation. This takes
"disposable" pages attached to an mbuf, gives them to
a user process, and then recycles the user's page.
This is only active when ZERO_COPY_SOCKETS is turned on
and the kern.ipc.zero_copy.receive sysctl variable is
set to 1.
uipc_cow.c: Send side zero copy functions. Takes a page written
by the user and maps it copy on write and assigns it
kernel virtual address space. Removes copy on write
mapping once the buffer has been freed by the network
stack.
uipc_jumbo.c: Jumbo disposable page allocator code. This allocates
(optionally) disposable pages for network drivers that
want to give the user the option of doing zero copy
receive.
uipc_socket.c: Add kern.ipc.zero_copy.{send,receive} sysctls that are
enabled if ZERO_COPY_SOCKETS is turned on.
Add zero copy send support to sosend() -- pages get
mapped into the kernel instead of getting copied if
they meet size and alignment restrictions.
uipc_syscalls.c:Un-staticize some of the sf* functions so that they
can be used elsewhere. (uipc_cow.c)
if_media.c: In the SIOCGIFMEDIA ioctl in ifmedia_ioctl(), avoid
calling malloc() with M_WAITOK. Return an error if
the M_NOWAIT malloc fails.
The ti(4) driver and the wi(4) driver, at least, call
this with a mutex held. This causes witness warnings
for 'ifconfig -a' with a wi(4) or ti(4) board in the
system. (I've only verified for ti(4)).
ip_output.c: Fragment large datagrams so that each segment contains
a multiple of PAGE_SIZE amount of data plus headers.
This allows the receiver to potentially do page
flipping on receives.
if_ti.c: Add zero copy receive support to the ti(4) driver. If
TI_PRIVATE_JUMBOS is not defined, it now uses the
jumbo(9) buffer allocator for jumbo receive buffers.
Add a new character device interface for the ti(4)
driver for the new debugging interface. This allows
(a patched version of) gdb to talk to the Tigon board
and debug the firmware. There are also a few additional
debugging ioctls available through this interface.
Add header splitting support to the ti(4) driver.
Tweak some of the default interrupt coalescing
parameters to more useful defaults.
Add hooks for supporting transmit flow control, but
leave it turned off with a comment describing why it
is turned off.
if_tireg.h: Change the firmware rev to 12.4.11, since we're really
at 12.4.11 plus fixes from 12.4.13.
Add defines needed for debugging.
Remove the ti_stats structure, it is now defined in
sys/tiio.h.
ti_fw.h: 12.4.11 firmware.
ti_fw2.h: 12.4.11 firmware, plus selected fixes from 12.4.13,
and my header splitting patches. Revision 12.4.13
doesn't handle 10/100 negotiation properly. (This
firmware is the same as what was in the tree previously,
with the addition of header splitting support.)
sys/jumbo.h: Jumbo buffer allocator interface.
sys/mbuf.h: Add a new external mbuf type, EXT_DISPOSABLE, to
indicate that the payload buffer can be thrown away /
flipped to a userland process.
socketvar.h: Add prototype for socow_setup.
tiio.h: ioctl interface to the character portion of the ti(4)
driver, plus associated structure/type definitions.
uio.h: Change prototype for uiomoveco() so that we'll know
whether the source page is disposable.
ufs_readwrite.c:Update for new prototype of uiomoveco().
vm_fault.c: In vm_fault(), check to see whether we need to do a page
based copy on write fault.
vm_object.c: Add a new function, vm_object_allocate_wait(). This
does the same thing that vm_object_allocate() does, except
that it gives the caller the opportunity to specify whether
it should wait on the uma_zalloc() of the object structure.
This allows vm objects to be allocated while holding a
mutex. (Without generating WITNESS warnings.)
vm_object_allocate() is implemented as a call to
vm_object_allocate_wait() with the malloc flag set to
M_WAITOK.
vm_object.h: Add prototype for vm_object_allocate_wait().
vm_page.c: Add page-based copy on write setup, clear and fault
routines.
vm_page.h: Add page based COW function prototypes and variable in
the vm_page structure.
Many thanks to Drew Gallatin, who wrote the zero copy send and receive
code, and to all the other folks who have tested and reviewed this code
over the years.
2002-06-26 03:37:47 +00:00
|
|
|
if (len > PAGE_SIZE) {
|
|
|
|
/*
|
2003-08-07 18:16:59 +00:00
|
|
|
* Fragment large datagrams such that each segment
|
At long last, commit the zero copy sockets code.
MAKEDEV: Add MAKEDEV glue for the ti(4) device nodes.
ti.4: Update the ti(4) man page to include information on the
TI_JUMBO_HDRSPLIT and TI_PRIVATE_JUMBOS kernel options,
and also include information about the new character
device interface and the associated ioctls.
man9/Makefile: Add jumbo.9 and zero_copy.9 man pages and associated
links.
jumbo.9: New man page describing the jumbo buffer allocator
interface and operation.
zero_copy.9: New man page describing the general characteristics of
the zero copy send and receive code, and what an
application author should do to take advantage of the
zero copy functionality.
NOTES: Add entries for ZERO_COPY_SOCKETS, TI_PRIVATE_JUMBOS,
TI_JUMBO_HDRSPLIT, MSIZE, and MCLSHIFT.
conf/files: Add uipc_jumbo.c and uipc_cow.c.
conf/options: Add the 5 options mentioned above.
kern_subr.c: Receive side zero copy implementation. This takes
"disposable" pages attached to an mbuf, gives them to
a user process, and then recycles the user's page.
This is only active when ZERO_COPY_SOCKETS is turned on
and the kern.ipc.zero_copy.receive sysctl variable is
set to 1.
uipc_cow.c: Send side zero copy functions. Takes a page written
by the user and maps it copy on write and assigns it
kernel virtual address space. Removes copy on write
mapping once the buffer has been freed by the network
stack.
uipc_jumbo.c: Jumbo disposable page allocator code. This allocates
(optionally) disposable pages for network drivers that
want to give the user the option of doing zero copy
receive.
uipc_socket.c: Add kern.ipc.zero_copy.{send,receive} sysctls that are
enabled if ZERO_COPY_SOCKETS is turned on.
Add zero copy send support to sosend() -- pages get
mapped into the kernel instead of getting copied if
they meet size and alignment restrictions.
uipc_syscalls.c:Un-staticize some of the sf* functions so that they
can be used elsewhere. (uipc_cow.c)
if_media.c: In the SIOCGIFMEDIA ioctl in ifmedia_ioctl(), avoid
calling malloc() with M_WAITOK. Return an error if
the M_NOWAIT malloc fails.
The ti(4) driver and the wi(4) driver, at least, call
this with a mutex held. This causes witness warnings
for 'ifconfig -a' with a wi(4) or ti(4) board in the
system. (I've only verified for ti(4)).
ip_output.c: Fragment large datagrams so that each segment contains
a multiple of PAGE_SIZE amount of data plus headers.
This allows the receiver to potentially do page
flipping on receives.
if_ti.c: Add zero copy receive support to the ti(4) driver. If
TI_PRIVATE_JUMBOS is not defined, it now uses the
jumbo(9) buffer allocator for jumbo receive buffers.
Add a new character device interface for the ti(4)
driver for the new debugging interface. This allows
(a patched version of) gdb to talk to the Tigon board
and debug the firmware. There are also a few additional
debugging ioctls available through this interface.
Add header splitting support to the ti(4) driver.
Tweak some of the default interrupt coalescing
parameters to more useful defaults.
Add hooks for supporting transmit flow control, but
leave it turned off with a comment describing why it
is turned off.
if_tireg.h: Change the firmware rev to 12.4.11, since we're really
at 12.4.11 plus fixes from 12.4.13.
Add defines needed for debugging.
Remove the ti_stats structure, it is now defined in
sys/tiio.h.
ti_fw.h: 12.4.11 firmware.
ti_fw2.h: 12.4.11 firmware, plus selected fixes from 12.4.13,
and my header splitting patches. Revision 12.4.13
doesn't handle 10/100 negotiation properly. (This
firmware is the same as what was in the tree previously,
with the addition of header splitting support.)
sys/jumbo.h: Jumbo buffer allocator interface.
sys/mbuf.h: Add a new external mbuf type, EXT_DISPOSABLE, to
indicate that the payload buffer can be thrown away /
flipped to a userland process.
socketvar.h: Add prototype for socow_setup.
tiio.h: ioctl interface to the character portion of the ti(4)
driver, plus associated structure/type definitions.
uio.h: Change prototype for uiomoveco() so that we'll know
whether the source page is disposable.
ufs_readwrite.c:Update for new prototype of uiomoveco().
vm_fault.c: In vm_fault(), check to see whether we need to do a page
based copy on write fault.
vm_object.c: Add a new function, vm_object_allocate_wait(). This
does the same thing that vm_object_allocate() does, except
that it gives the caller the opportunity to specify whether
it should wait on the uma_zalloc() of the object structure.
This allows vm objects to be allocated while holding a
mutex. (Without generating WITNESS warnings.)
vm_object_allocate() is implemented as a call to
vm_object_allocate_wait() with the malloc flag set to
M_WAITOK.
vm_object.h: Add prototype for vm_object_allocate_wait().
vm_page.c: Add page-based copy on write setup, clear and fault
routines.
vm_page.h: Add page based COW function prototypes and variable in
the vm_page structure.
Many thanks to Drew Gallatin, who wrote the zero copy send and receive
code, and to all the other folks who have tested and reviewed this code
over the years.
2002-06-26 03:37:47 +00:00
|
|
|
* contains a multiple of PAGE_SIZE amount of data,
|
|
|
|
* plus headers. This enables a receiver to perform
|
|
|
|
* page-flipping zero-copy optimizations.
|
2003-08-07 18:16:59 +00:00
|
|
|
*
|
|
|
|
* XXX When does this help given that sender and receiver
|
|
|
|
* could have different page sizes, and also mtu could
|
|
|
|
* be less than the receiver's page size ?
|
At long last, commit the zero copy sockets code.
MAKEDEV: Add MAKEDEV glue for the ti(4) device nodes.
ti.4: Update the ti(4) man page to include information on the
TI_JUMBO_HDRSPLIT and TI_PRIVATE_JUMBOS kernel options,
and also include information about the new character
device interface and the associated ioctls.
man9/Makefile: Add jumbo.9 and zero_copy.9 man pages and associated
links.
jumbo.9: New man page describing the jumbo buffer allocator
interface and operation.
zero_copy.9: New man page describing the general characteristics of
the zero copy send and receive code, and what an
application author should do to take advantage of the
zero copy functionality.
NOTES: Add entries for ZERO_COPY_SOCKETS, TI_PRIVATE_JUMBOS,
TI_JUMBO_HDRSPLIT, MSIZE, and MCLSHIFT.
conf/files: Add uipc_jumbo.c and uipc_cow.c.
conf/options: Add the 5 options mentioned above.
kern_subr.c: Receive side zero copy implementation. This takes
"disposable" pages attached to an mbuf, gives them to
a user process, and then recycles the user's page.
This is only active when ZERO_COPY_SOCKETS is turned on
and the kern.ipc.zero_copy.receive sysctl variable is
set to 1.
uipc_cow.c: Send side zero copy functions. Takes a page written
by the user and maps it copy on write and assigns it
kernel virtual address space. Removes copy on write
mapping once the buffer has been freed by the network
stack.
uipc_jumbo.c: Jumbo disposable page allocator code. This allocates
(optionally) disposable pages for network drivers that
want to give the user the option of doing zero copy
receive.
uipc_socket.c: Add kern.ipc.zero_copy.{send,receive} sysctls that are
enabled if ZERO_COPY_SOCKETS is turned on.
Add zero copy send support to sosend() -- pages get
mapped into the kernel instead of getting copied if
they meet size and alignment restrictions.
uipc_syscalls.c:Un-staticize some of the sf* functions so that they
can be used elsewhere. (uipc_cow.c)
if_media.c: In the SIOCGIFMEDIA ioctl in ifmedia_ioctl(), avoid
calling malloc() with M_WAITOK. Return an error if
the M_NOWAIT malloc fails.
The ti(4) driver and the wi(4) driver, at least, call
this with a mutex held. This causes witness warnings
for 'ifconfig -a' with a wi(4) or ti(4) board in the
system. (I've only verified for ti(4)).
ip_output.c: Fragment large datagrams so that each segment contains
a multiple of PAGE_SIZE amount of data plus headers.
This allows the receiver to potentially do page
flipping on receives.
if_ti.c: Add zero copy receive support to the ti(4) driver. If
TI_PRIVATE_JUMBOS is not defined, it now uses the
jumbo(9) buffer allocator for jumbo receive buffers.
Add a new character device interface for the ti(4)
driver for the new debugging interface. This allows
(a patched version of) gdb to talk to the Tigon board
and debug the firmware. There are also a few additional
debugging ioctls available through this interface.
Add header splitting support to the ti(4) driver.
Tweak some of the default interrupt coalescing
parameters to more useful defaults.
Add hooks for supporting transmit flow control, but
leave it turned off with a comment describing why it
is turned off.
if_tireg.h: Change the firmware rev to 12.4.11, since we're really
at 12.4.11 plus fixes from 12.4.13.
Add defines needed for debugging.
Remove the ti_stats structure, it is now defined in
sys/tiio.h.
ti_fw.h: 12.4.11 firmware.
ti_fw2.h: 12.4.11 firmware, plus selected fixes from 12.4.13,
and my header splitting patches. Revision 12.4.13
doesn't handle 10/100 negotiation properly. (This
firmware is the same as what was in the tree previously,
with the addition of header splitting support.)
sys/jumbo.h: Jumbo buffer allocator interface.
sys/mbuf.h: Add a new external mbuf type, EXT_DISPOSABLE, to
indicate that the payload buffer can be thrown away /
flipped to a userland process.
socketvar.h: Add prototype for socow_setup.
tiio.h: ioctl interface to the character portion of the ti(4)
driver, plus associated structure/type definitions.
uio.h: Change prototype for uiomoveco() so that we'll know
whether the source page is disposable.
ufs_readwrite.c:Update for new prototype of uiomoveco().
vm_fault.c: In vm_fault(), check to see whether we need to do a page
based copy on write fault.
vm_object.c: Add a new function, vm_object_allocate_wait(). This
does the same thing that vm_object_allocate() does, except
that it gives the caller the opportunity to specify whether
it should wait on the uma_zalloc() of the object structure.
This allows vm objects to be allocated while holding a
mutex. (Without generating WITNESS warnings.)
vm_object_allocate() is implemented as a call to
vm_object_allocate_wait() with the malloc flag set to
M_WAITOK.
vm_object.h: Add prototype for vm_object_allocate_wait().
vm_page.c: Add page-based copy on write setup, clear and fault
routines.
vm_page.h: Add page based COW function prototypes and variable in
the vm_page structure.
Many thanks to Drew Gallatin, who wrote the zero copy send and receive
code, and to all the other folks who have tested and reviewed this code
over the years.
2002-06-26 03:37:47 +00:00
|
|
|
*/
|
|
|
|
int newlen;
|
2003-08-07 18:16:59 +00:00
|
|
|
struct mbuf *m;
|
|
|
|
|
|
|
|
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
|
|
|
|
off += m->m_len;
|
At long last, commit the zero copy sockets code.
MAKEDEV: Add MAKEDEV glue for the ti(4) device nodes.
ti.4: Update the ti(4) man page to include information on the
TI_JUMBO_HDRSPLIT and TI_PRIVATE_JUMBOS kernel options,
and also include information about the new character
device interface and the associated ioctls.
man9/Makefile: Add jumbo.9 and zero_copy.9 man pages and associated
links.
jumbo.9: New man page describing the jumbo buffer allocator
interface and operation.
zero_copy.9: New man page describing the general characteristics of
the zero copy send and receive code, and what an
application author should do to take advantage of the
zero copy functionality.
NOTES: Add entries for ZERO_COPY_SOCKETS, TI_PRIVATE_JUMBOS,
TI_JUMBO_HDRSPLIT, MSIZE, and MCLSHIFT.
conf/files: Add uipc_jumbo.c and uipc_cow.c.
conf/options: Add the 5 options mentioned above.
kern_subr.c: Receive side zero copy implementation. This takes
"disposable" pages attached to an mbuf, gives them to
a user process, and then recycles the user's page.
This is only active when ZERO_COPY_SOCKETS is turned on
and the kern.ipc.zero_copy.receive sysctl variable is
set to 1.
uipc_cow.c: Send side zero copy functions. Takes a page written
by the user and maps it copy on write and assigns it
kernel virtual address space. Removes copy on write
mapping once the buffer has been freed by the network
stack.
uipc_jumbo.c: Jumbo disposable page allocator code. This allocates
(optionally) disposable pages for network drivers that
want to give the user the option of doing zero copy
receive.
uipc_socket.c: Add kern.ipc.zero_copy.{send,receive} sysctls that are
enabled if ZERO_COPY_SOCKETS is turned on.
Add zero copy send support to sosend() -- pages get
mapped into the kernel instead of getting copied if
they meet size and alignment restrictions.
uipc_syscalls.c:Un-staticize some of the sf* functions so that they
can be used elsewhere. (uipc_cow.c)
if_media.c: In the SIOCGIFMEDIA ioctl in ifmedia_ioctl(), avoid
calling malloc() with M_WAITOK. Return an error if
the M_NOWAIT malloc fails.
The ti(4) driver and the wi(4) driver, at least, call
this with a mutex held. This causes witness warnings
for 'ifconfig -a' with a wi(4) or ti(4) board in the
system. (I've only verified for ti(4)).
ip_output.c: Fragment large datagrams so that each segment contains
a multiple of PAGE_SIZE amount of data plus headers.
This allows the receiver to potentially do page
flipping on receives.
if_ti.c: Add zero copy receive support to the ti(4) driver. If
TI_PRIVATE_JUMBOS is not defined, it now uses the
jumbo(9) buffer allocator for jumbo receive buffers.
Add a new character device interface for the ti(4)
driver for the new debugging interface. This allows
(a patched version of) gdb to talk to the Tigon board
and debug the firmware. There are also a few additional
debugging ioctls available through this interface.
Add header splitting support to the ti(4) driver.
Tweak some of the default interrupt coalescing
parameters to more useful defaults.
Add hooks for supporting transmit flow control, but
leave it turned off with a comment describing why it
is turned off.
if_tireg.h: Change the firmware rev to 12.4.11, since we're really
at 12.4.11 plus fixes from 12.4.13.
Add defines needed for debugging.
Remove the ti_stats structure, it is now defined in
sys/tiio.h.
ti_fw.h: 12.4.11 firmware.
ti_fw2.h: 12.4.11 firmware, plus selected fixes from 12.4.13,
and my header splitting patches. Revision 12.4.13
doesn't handle 10/100 negotiation properly. (This
firmware is the same as what was in the tree previously,
with the addition of header splitting support.)
sys/jumbo.h: Jumbo buffer allocator interface.
sys/mbuf.h: Add a new external mbuf type, EXT_DISPOSABLE, to
indicate that the payload buffer can be thrown away /
flipped to a userland process.
socketvar.h: Add prototype for socow_setup.
tiio.h: ioctl interface to the character portion of the ti(4)
driver, plus associated structure/type definitions.
uio.h: Change prototype for uiomoveco() so that we'll know
whether the source page is disposable.
ufs_readwrite.c:Update for new prototype of uiomoveco().
vm_fault.c: In vm_fault(), check to see whether we need to do a page
based copy on write fault.
vm_object.c: Add a new function, vm_object_allocate_wait(). This
does the same thing that vm_object_allocate() does, except
that it gives the caller the opportunity to specify whether
it should wait on the uma_zalloc() of the object structure.
This allows vm objects to be allocated while holding a
mutex. (Without generating WITNESS warnings.)
vm_object_allocate() is implemented as a call to
vm_object_allocate_wait() with the malloc flag set to
M_WAITOK.
vm_object.h: Add prototype for vm_object_allocate_wait().
vm_page.c: Add page-based copy on write setup, clear and fault
routines.
vm_page.h: Add page based COW function prototypes and variable in
the vm_page structure.
Many thanks to Drew Gallatin, who wrote the zero copy send and receive
code, and to all the other folks who have tested and reviewed this code
over the years.
2002-06-26 03:37:47 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* firstlen (off - hlen) must be aligned on an
|
|
|
|
* 8-byte boundary
|
|
|
|
*/
|
|
|
|
if (off < hlen)
|
|
|
|
goto smart_frag_failure;
|
|
|
|
off = ((off - hlen) & ~7) + hlen;
|
2003-08-07 18:16:59 +00:00
|
|
|
newlen = (~PAGE_MASK) & mtu;
|
|
|
|
if ((newlen + sizeof (struct ip)) > mtu) {
|
At long last, commit the zero copy sockets code.
MAKEDEV: Add MAKEDEV glue for the ti(4) device nodes.
ti.4: Update the ti(4) man page to include information on the
TI_JUMBO_HDRSPLIT and TI_PRIVATE_JUMBOS kernel options,
and also include information about the new character
device interface and the associated ioctls.
man9/Makefile: Add jumbo.9 and zero_copy.9 man pages and associated
links.
jumbo.9: New man page describing the jumbo buffer allocator
interface and operation.
zero_copy.9: New man page describing the general characteristics of
the zero copy send and receive code, and what an
application author should do to take advantage of the
zero copy functionality.
NOTES: Add entries for ZERO_COPY_SOCKETS, TI_PRIVATE_JUMBOS,
TI_JUMBO_HDRSPLIT, MSIZE, and MCLSHIFT.
conf/files: Add uipc_jumbo.c and uipc_cow.c.
conf/options: Add the 5 options mentioned above.
kern_subr.c: Receive side zero copy implementation. This takes
"disposable" pages attached to an mbuf, gives them to
a user process, and then recycles the user's page.
This is only active when ZERO_COPY_SOCKETS is turned on
and the kern.ipc.zero_copy.receive sysctl variable is
set to 1.
uipc_cow.c: Send side zero copy functions. Takes a page written
by the user and maps it copy on write and assigns it
kernel virtual address space. Removes copy on write
mapping once the buffer has been freed by the network
stack.
uipc_jumbo.c: Jumbo disposable page allocator code. This allocates
(optionally) disposable pages for network drivers that
want to give the user the option of doing zero copy
receive.
uipc_socket.c: Add kern.ipc.zero_copy.{send,receive} sysctls that are
enabled if ZERO_COPY_SOCKETS is turned on.
Add zero copy send support to sosend() -- pages get
mapped into the kernel instead of getting copied if
they meet size and alignment restrictions.
uipc_syscalls.c:Un-staticize some of the sf* functions so that they
can be used elsewhere. (uipc_cow.c)
if_media.c: In the SIOCGIFMEDIA ioctl in ifmedia_ioctl(), avoid
calling malloc() with M_WAITOK. Return an error if
the M_NOWAIT malloc fails.
The ti(4) driver and the wi(4) driver, at least, call
this with a mutex held. This causes witness warnings
for 'ifconfig -a' with a wi(4) or ti(4) board in the
system. (I've only verified for ti(4)).
ip_output.c: Fragment large datagrams so that each segment contains
a multiple of PAGE_SIZE amount of data plus headers.
This allows the receiver to potentially do page
flipping on receives.
if_ti.c: Add zero copy receive support to the ti(4) driver. If
TI_PRIVATE_JUMBOS is not defined, it now uses the
jumbo(9) buffer allocator for jumbo receive buffers.
Add a new character device interface for the ti(4)
driver for the new debugging interface. This allows
(a patched version of) gdb to talk to the Tigon board
and debug the firmware. There are also a few additional
debugging ioctls available through this interface.
Add header splitting support to the ti(4) driver.
Tweak some of the default interrupt coalescing
parameters to more useful defaults.
Add hooks for supporting transmit flow control, but
leave it turned off with a comment describing why it
is turned off.
if_tireg.h: Change the firmware rev to 12.4.11, since we're really
at 12.4.11 plus fixes from 12.4.13.
Add defines needed for debugging.
Remove the ti_stats structure, it is now defined in
sys/tiio.h.
ti_fw.h: 12.4.11 firmware.
ti_fw2.h: 12.4.11 firmware, plus selected fixes from 12.4.13,
and my header splitting patches. Revision 12.4.13
doesn't handle 10/100 negotiation properly. (This
firmware is the same as what was in the tree previously,
with the addition of header splitting support.)
sys/jumbo.h: Jumbo buffer allocator interface.
sys/mbuf.h: Add a new external mbuf type, EXT_DISPOSABLE, to
indicate that the payload buffer can be thrown away /
flipped to a userland process.
socketvar.h: Add prototype for socow_setup.
tiio.h: ioctl interface to the character portion of the ti(4)
driver, plus associated structure/type definitions.
uio.h: Change prototype for uiomoveco() so that we'll know
whether the source page is disposable.
ufs_readwrite.c:Update for new prototype of uiomoveco().
vm_fault.c: In vm_fault(), check to see whether we need to do a page
based copy on write fault.
vm_object.c: Add a new function, vm_object_allocate_wait(). This
does the same thing that vm_object_allocate() does, except
that it gives the caller the opportunity to specify whether
it should wait on the uma_zalloc() of the object structure.
This allows vm objects to be allocated while holding a
mutex. (Without generating WITNESS warnings.)
vm_object_allocate() is implemented as a call to
vm_object_allocate_wait() with the malloc flag set to
M_WAITOK.
vm_object.h: Add prototype for vm_object_allocate_wait().
vm_page.c: Add page-based copy on write setup, clear and fault
routines.
vm_page.h: Add page based COW function prototypes and variable in
the vm_page structure.
Many thanks to Drew Gallatin, who wrote the zero copy send and receive
code, and to all the other folks who have tested and reviewed this code
over the years.
2002-06-26 03:37:47 +00:00
|
|
|
/* we failed, go back to the default */
|
|
|
|
smart_frag_failure:
|
|
|
|
newlen = len;
|
|
|
|
off = hlen + len;
|
|
|
|
}
|
|
|
|
len = newlen;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
off = hlen + len;
|
|
|
|
}
|
|
|
|
|
2003-08-07 18:16:59 +00:00
|
|
|
firstlen = off - hlen;
|
|
|
|
mnext = &m0->m_nextpkt; /* pointer to next packet */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop through length of segment after first fragment,
|
|
|
|
* make new header and copy data of each part and link onto chain.
|
2003-08-07 18:16:59 +00:00
|
|
|
* Here, m0 is the original packet, m is the fragment being created.
|
|
|
|
* The fragments are linked off the m_nextpkt of the original
|
|
|
|
* packet, which after processing serves as the first fragment.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2003-08-07 18:16:59 +00:00
|
|
|
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
|
|
|
|
struct ip *mhip; /* ip header on the fragment */
|
|
|
|
struct mbuf *m;
|
|
|
|
int mhlen = sizeof (struct ip);
|
|
|
|
|
2005-11-02 13:46:32 +00:00
|
|
|
MGETHDR(m, M_DONTWAIT, MT_DATA);
|
2004-08-11 10:46:15 +00:00
|
|
|
if (m == NULL) {
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENOBUFS;
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_odropped);
|
2003-08-07 18:16:59 +00:00
|
|
|
goto done;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-03-27 19:14:27 +00:00
|
|
|
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
|
2003-08-07 18:16:59 +00:00
|
|
|
/*
|
|
|
|
* In the first mbuf, leave room for the link header, then
|
|
|
|
* copy the original IP header including options. The payload
|
2009-03-04 03:45:34 +00:00
|
|
|
* goes into an additional mbuf chain returned by m_copym().
|
2003-08-07 18:16:59 +00:00
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
m->m_data += max_linkhdr;
|
|
|
|
mhip = mtod(m, struct ip *);
|
|
|
|
*mhip = *ip;
|
|
|
|
if (hlen > sizeof (struct ip)) {
|
|
|
|
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
|
2002-10-20 22:52:07 +00:00
|
|
|
mhip->ip_v = IPVERSION;
|
|
|
|
mhip->ip_hl = mhlen >> 2;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
m->m_len = mhlen;
|
2003-08-07 18:16:59 +00:00
|
|
|
/* XXX do we need to add ip->ip_off below ? */
|
2000-10-20 14:10:37 +00:00
|
|
|
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
|
2003-08-07 18:16:59 +00:00
|
|
|
if (off + len >= ip->ip_len) { /* last fragment */
|
|
|
|
len = ip->ip_len - off;
|
|
|
|
m->m_flags |= M_LASTFRAG;
|
|
|
|
} else
|
1994-05-24 10:09:53 +00:00
|
|
|
mhip->ip_off |= IP_MF;
|
|
|
|
mhip->ip_len = htons((u_short)(len + mhlen));
|
2009-03-04 03:45:34 +00:00
|
|
|
m->m_next = m_copym(m0, off, len, M_DONTWAIT);
|
2004-08-11 10:46:15 +00:00
|
|
|
if (m->m_next == NULL) { /* copy failed */
|
2003-08-07 18:16:59 +00:00
|
|
|
m_free(m);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENOBUFS; /* ??? */
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_INC(ips_odropped);
|
2003-08-07 18:16:59 +00:00
|
|
|
goto done;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
m->m_pkthdr.len = mhlen + len;
|
2005-06-10 16:49:24 +00:00
|
|
|
m->m_pkthdr.rcvif = NULL;
|
2002-07-31 17:21:01 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_netinet_fragment(m0, m);
|
2002-07-31 17:21:01 +00:00
|
|
|
#endif
|
2000-03-27 19:14:27 +00:00
|
|
|
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
|
2002-02-18 20:35:27 +00:00
|
|
|
mhip->ip_off = htons(mhip->ip_off);
|
1994-05-24 10:09:53 +00:00
|
|
|
mhip->ip_sum = 0;
|
2002-10-20 22:52:07 +00:00
|
|
|
if (sw_csum & CSUM_DELAY_IP)
|
|
|
|
mhip->ip_sum = in_cksum(m, mhlen);
|
1994-05-24 10:09:53 +00:00
|
|
|
*mnext = m;
|
|
|
|
mnext = &m->m_nextpkt;
|
|
|
|
}
|
2009-04-11 23:35:20 +00:00
|
|
|
IPSTAT_ADD(ips_ofragments, nfrags);
|
2000-03-27 19:14:27 +00:00
|
|
|
|
2003-08-07 18:16:59 +00:00
|
|
|
/* set first marker for fragment chain */
|
2000-03-27 19:14:27 +00:00
|
|
|
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
|
|
|
|
m0->m_pkthdr.csum_data = nfrags;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-11-20 18:56:25 +00:00
|
|
|
* Update first fragment by trimming what's been copied out
|
2003-08-07 18:16:59 +00:00
|
|
|
* and updating header.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2003-08-07 18:16:59 +00:00
|
|
|
m_adj(m0, hlen + firstlen - ip->ip_len);
|
|
|
|
m0->m_pkthdr.len = hlen + firstlen;
|
|
|
|
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
|
Fixed broken ICMP error generation, unified conversion of IP header
fields between host and network byte order. The details:
o icmp_error() now does not add IP header length. This fixes the problem
when icmp_error() is called from ip_forward(). In this case the ip_len
of the original IP datagram returned with ICMP error was wrong.
o icmp_error() expects all three fields, ip_len, ip_id and ip_off in host
byte order, so DTRT and convert these fields back to network byte order
before sending a message. This fixes the problem described in PR 16240
and PR 20877 (ip_id field was returned in host byte order).
o ip_ttl decrement operation in ip_forward() was moved down to make sure
that it does not corrupt the copy of original IP datagram passed later
to icmp_error().
o A copy of original IP datagram in ip_forward() was made a read-write,
independent copy. This fixes the problem I first reported to Garrett
Wollman and Bill Fenner and later put in audit trail of PR 16240:
ip_output() (not always) converts fields of original datagram to network
byte order, but because copy (mcopy) and its original (m) most likely
share the same mbuf cluster, ip_output()'s manipulations on original
also corrupted the copy.
o ip_output() now expects all three fields, ip_len, ip_off and (what is
significant) ip_id in host byte order. It was a headache for years that
ip_id was handled differently. The only compatibility issue here is the
raw IP socket interface with IP_HDRINCL socket option set and a non-zero
ip_id field, but ip.4 manual page was unclear on whether in this case
ip_id field should be in host or network byte order.
2000-09-01 12:33:03 +00:00
|
|
|
ip->ip_off |= IP_MF;
|
2002-02-18 20:35:27 +00:00
|
|
|
ip->ip_off = htons(ip->ip_off);
|
1994-05-24 10:09:53 +00:00
|
|
|
ip->ip_sum = 0;
|
2002-10-20 22:52:07 +00:00
|
|
|
if (sw_csum & CSUM_DELAY_IP)
|
2003-08-07 18:16:59 +00:00
|
|
|
ip->ip_sum = in_cksum(m0, hlen);
|
2002-11-20 18:56:25 +00:00
|
|
|
|
|
|
|
done:
|
2003-08-07 18:16:59 +00:00
|
|
|
*m_frag = m0;
|
|
|
|
return error;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2000-05-21 21:26:06 +00:00
|
|
|
/*
 * in_delayed_cksum: compute a checksum that was deferred earlier and store
 * it into the packet held in mbuf chain m.
 *
 * The IP header is read from the start of m. in_cksum_skip() is called with
 * ip_len as the length and the IP header length as the skip offset
 * (presumably it sums the bytes that follow the IP header -- confirm
 * against in_cksum_skip()). The result is written as a u_short at
 * (IP header length + m_pkthdr.csum_data) bytes from m->m_data.
 *
 * A computed value of 0 is replaced by 0xffff when CSUM_UDP is set in
 * m_pkthdr.csum_flags.
 *
 * If the checksum field would extend past the first mbuf (m->m_len), a
 * diagnostic is printed and the function returns without storing anything.
 *
 * NOTE(review): ip_len is passed straight through as a length, so it is
 * assumed to be in host byte order here -- confirm with callers.
 */
void
|
2000-03-27 19:14:27 +00:00
|
|
|
in_delayed_cksum(struct mbuf *m)
|
|
|
|
{
|
|
|
|
struct ip *ip;
|
|
|
|
u_short csum, offset;
|
|
|
|

|
|
|
ip = mtod(m, struct ip *);
|
2002-10-20 22:52:07 +00:00
|
|
|
/* ip_hl is kept in 32-bit words; shifting by 2 yields the header length in bytes. */
offset = ip->ip_hl << 2 ;
|
2000-03-27 19:14:27 +00:00
|
|
|
csum = in_cksum_skip(m, ip->ip_len, offset);
|
RFC768 (UDP) requires that "if the computed checksum is zero, it
is transmitted as all ones". This got broken after introduction
of delayed checksums as follows. Some guys (including Jonathan)
think that it is allowed to transmit all ones in place of a zero
checksum for TCP the same way as for UDP. (The discussion still
takes place on -net.) Thus, the 0 -> 0xffff checksum fixup was
first moved from udp_output() (see udp_usrreq.c, 1.64 -> 1.65)
to in_cksum_skip() (see sys/i386/i386/in_cksum.c, 1.17 -> 1.18,
INVERT expression). Besides that I disagree that it is valid for
TCP, there was no real problem until in_cksum.c,v 1.20, where the
in_cksum() was made just a special version of in_cksum_skip().
The side effect was that now every incoming IP datagram failed to
pass the checksum test (in_cksum() returned 0xffff when it should
actually return zero). It was fixed next day in revision 1.21,
by removing the INVERT expression. The latter also broke the
0 -> 0xffff fixup for UDP checksums.
Before this change:
: tcpdump: listening on lo0
: 127.0.0.1.33005 > 127.0.0.1.33006: udp 0 (ttl 64, id 1)
: 4500 001c 0001 0000 4011 7cce 7f00 0001
: 7f00 0001 80ed 80ee 0008 0000
After this change:
: tcpdump: listening on lo0
: 127.0.0.1.33005 > 127.0.0.1.33006: udp 0 (ttl 64, id 1)
: 4500 001c 0001 0000 4011 7cce 7f00 0001
: 7f00 0001 80ed 80ee 0008 ffff
2001-03-13 17:07:06 +00:00
|
|
|
/* UDP only: a computed checksum of zero is sent as all ones (RFC 768). */
if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
|
|
|
|
csum = 0xffff;
|
2000-03-27 19:14:27 +00:00
|
|
|
offset += m->m_pkthdr.csum_data; /* checksum offset */
|
|
|
|

|
|
|
/* The two-byte checksum field must lie entirely within the first mbuf. */
if (offset + sizeof(u_short) > m->m_len) {
|
|
|
|
printf("delayed m_pullup, m->len: %d off: %d p: %d\n",
|
|
|
|
m->m_len, offset, ip->ip_p);
|
|
|
|
/*
|
|
|
|
* XXX
|
|
|
|
* this shouldn't happen, but if it does, the
|
|
|
|
* correct behavior may be to insert the checksum
|
2006-01-18 18:49:16 +00:00
|
|
|
* in the appropriate next mbuf in the chain.
|
2000-03-27 19:14:27 +00:00
|
|
|
*/
|
2006-01-18 18:49:16 +00:00
|
|
|
return;
|
2000-03-27 19:14:27 +00:00
|
|
|
}
|
|
|
|
/* Store the checksum in place, offset bytes from the start of the mbuf data. */
*(u_short *)(m->m_data + offset) = csum;
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* IP socket option processing.
|
|
|
|
*/
|
|
|
|
int
|
2007-05-10 15:58:48 +00:00
|
|
|
ip_ctloutput(struct socket *so, struct sockopt *sopt)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-08-23 03:07:17 +00:00
|
|
|
struct inpcb *inp = sotoinpcb(so);
|
|
|
|
int error, optval;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
error = optval = 0;
|
|
|
|
if (sopt->sopt_level != IPPROTO_IP) {
|
2008-11-19 19:19:30 +00:00
|
|
|
if ((sopt->sopt_level == SOL_SOCKET) &&
|
|
|
|
(sopt->sopt_name == SO_SETFIB)) {
|
|
|
|
inp->inp_inc.inc_fibnum = so->so_fibnum;
|
|
|
|
return (0);
|
|
|
|
}
|
1998-08-23 03:07:17 +00:00
|
|
|
return (EINVAL);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_dir) {
|
|
|
|
case SOPT_SET:
|
|
|
|
switch (sopt->sopt_name) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_OPTIONS:
|
|
|
|
#ifdef notyet
|
|
|
|
case IP_RETOPTS:
|
|
|
|
#endif
|
1998-08-23 03:07:17 +00:00
|
|
|
{
|
|
|
|
struct mbuf *m;
|
|
|
|
if (sopt->sopt_valsize > MLEN) {
|
|
|
|
error = EMSGSIZE;
|
|
|
|
break;
|
|
|
|
}
|
2008-03-25 09:39:02 +00:00
|
|
|
MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
|
2004-08-11 10:46:15 +00:00
|
|
|
if (m == NULL) {
|
1998-08-23 03:07:17 +00:00
|
|
|
error = ENOBUFS;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
m->m_len = sopt->sopt_valsize;
|
|
|
|
error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
|
|
|
|
m->m_len);
|
2006-05-21 17:52:08 +00:00
|
|
|
if (error) {
|
|
|
|
m_free(m);
|
|
|
|
break;
|
|
|
|
}
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK(inp);
|
2004-12-05 19:11:09 +00:00
|
|
|
error = ip_pcbopts(inp, sopt->sopt_name, m);
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
2004-12-05 19:11:09 +00:00
|
|
|
return (error);
|
1998-08-23 03:07:17 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-06-01 10:30:00 +00:00
|
|
|
case IP_BINDANY:
|
|
|
|
if (sopt->sopt_td != NULL) {
|
|
|
|
error = priv_check(sopt->sopt_td,
|
|
|
|
PRIV_NETINET_BINDANY);
|
|
|
|
if (error)
|
|
|
|
break;
|
2009-01-09 17:21:22 +00:00
|
|
|
}
|
|
|
|
/* FALLTHROUGH */
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_TOS:
|
|
|
|
case IP_TTL:
|
2005-08-22 16:13:08 +00:00
|
|
|
case IP_MINTTL:
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_RECVOPTS:
|
|
|
|
case IP_RECVRETOPTS:
|
|
|
|
case IP_RECVDSTADDR:
|
2003-04-29 21:36:18 +00:00
|
|
|
case IP_RECVTTL:
|
1996-11-11 04:56:32 +00:00
|
|
|
case IP_RECVIF:
|
1999-12-22 19:13:38 +00:00
|
|
|
case IP_FAITH:
|
2003-08-20 14:46:40 +00:00
|
|
|
case IP_ONESBCAST:
|
2005-09-26 20:25:16 +00:00
|
|
|
case IP_DONTFRAG:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
|
|
|
sizeof optval);
|
|
|
|
if (error)
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
|
|
|
case IP_TOS:
|
|
|
|
inp->inp_ip_tos = optval;
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_TTL:
|
|
|
|
inp->inp_ip_ttl = optval;
|
|
|
|
break;
|
2005-08-22 16:13:08 +00:00
|
|
|
|
|
|
|
case IP_MINTTL:
|
2009-01-03 11:35:31 +00:00
|
|
|
if (optval >= 0 && optval <= MAXTTL)
|
2005-08-22 16:13:08 +00:00
|
|
|
inp->inp_ip_minttl = optval;
|
|
|
|
else
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
|
2004-06-24 02:05:47 +00:00
|
|
|
#define OPTSET(bit) do { \
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK(inp); \
|
2004-06-24 02:05:47 +00:00
|
|
|
if (optval) \
|
|
|
|
inp->inp_flags |= bit; \
|
|
|
|
else \
|
|
|
|
inp->inp_flags &= ~bit; \
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp); \
|
2004-06-24 02:05:47 +00:00
|
|
|
} while (0)
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_RECVOPTS:
|
|
|
|
OPTSET(INP_RECVOPTS);
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_RECVRETOPTS:
|
|
|
|
OPTSET(INP_RECVRETOPTS);
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_RECVDSTADDR:
|
|
|
|
OPTSET(INP_RECVDSTADDR);
|
|
|
|
break;
|
1996-11-11 04:56:32 +00:00
|
|
|
|
2003-04-29 21:36:18 +00:00
|
|
|
case IP_RECVTTL:
|
|
|
|
OPTSET(INP_RECVTTL);
|
|
|
|
break;
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_RECVIF:
|
|
|
|
OPTSET(INP_RECVIF);
|
|
|
|
break;
|
1999-12-22 19:13:38 +00:00
|
|
|
|
|
|
|
case IP_FAITH:
|
|
|
|
OPTSET(INP_FAITH);
|
|
|
|
break;
|
2003-08-20 14:46:40 +00:00
|
|
|
|
|
|
|
case IP_ONESBCAST:
|
|
|
|
OPTSET(INP_ONESBCAST);
|
|
|
|
break;
|
2005-09-26 20:25:16 +00:00
|
|
|
case IP_DONTFRAG:
|
|
|
|
OPTSET(INP_DONTFRAG);
|
|
|
|
break;
|
2009-06-01 10:30:00 +00:00
|
|
|
case IP_BINDANY:
|
|
|
|
OPTSET(INP_BINDANY);
|
2009-01-09 16:02:19 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
#undef OPTSET
|
|
|
|
|
Import rewrite of IPv4 socket multicast layer to support source-specific
and protocol-independent host mode multicast. The code is written to
accommodate IPv6, IGMPv3 and MLDv2 with only a little additional work.
This change only pertains to FreeBSD's use as a multicast end-station and
does not concern multicast routing; for an IGMPv3/MLDv2 router
implementation, consider the XORP project.
The work is based on Wilbert de Graaf's IGMPv3 code drop for FreeBSD 4.6,
which is available at: http://www.kloosterhof.com/wilbert/igmpv3.html
Summary
* IPv4 multicast socket processing is now moved out of ip_output.c
into a new module, in_mcast.c.
* The in_mcast.c module implements the IPv4 legacy any-source API in
terms of the protocol-independent source-specific API.
* Source filters are lazy allocated as the common case does not use them.
They are part of per inpcb state and are covered by the inpcb lock.
* struct ip_mreqn is now supported to allow applications to specify
multicast joins by interface index in the legacy IPv4 any-source API.
* In UDP, an incoming multicast datagram only requires that the source
port matches the 4-tuple if the socket was already bound by source port.
An unbound socket SHOULD be able to receive multicasts sent from an
ephemeral source port.
* The UDP socket multicast filter mode defaults to exclusive, that is,
sources present in the per-socket list will be blocked from delivery.
* The RFC 3678 userland functions have been added to libc: setsourcefilter,
getsourcefilter, setipv4sourcefilter, getipv4sourcefilter.
* Definitions for IGMPv3 are merged but not yet used.
* struct sockaddr_storage is now referenced from <netinet/in.h>. It
is therefore defined there if not already declared in the same way
as for the C99 types.
* The RFC 1724 hack (specify 0.0.0.0/8 addresses to IP_MULTICAST_IF
which are then interpreted as interface indexes) is now deprecated.
* A patch for the Rhyolite.com routed in the FreeBSD base system
is available in the -net archives. This only affects individuals
running RIPv1 or RIPv2 via point-to-point and/or unnumbered interfaces.
* Make IPv6 detach path similar to IPv4's in code flow; functionally same.
* Bump __FreeBSD_version to 700048; see UPDATING.
This work was financially supported by another FreeBSD committer.
Obtained from: p4://bms_netdev
Submitted by: Wilbert de Graaf (original work)
Reviewed by: rwatson (locking), silence from fenner,
net@ (but with encouragement)
2007-06-12 16:24:56 +00:00
|
|
|
/*
|
|
|
|
* Multicast socket options are processed by the in_mcast
|
|
|
|
* module.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_MULTICAST_IF:
|
1994-09-06 22:42:31 +00:00
|
|
|
case IP_MULTICAST_VIF:
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_MULTICAST_TTL:
|
|
|
|
case IP_MULTICAST_LOOP:
|
|
|
|
case IP_ADD_MEMBERSHIP:
|
|
|
|
case IP_DROP_MEMBERSHIP:
|
Import rewrite of IPv4 socket multicast layer to support source-specific
and protocol-independent host mode multicast. The code is written to
accommodate IPv6, IGMPv3 and MLDv2 with only a little additional work.
This change only pertains to FreeBSD's use as a multicast end-station and
does not concern multicast routing; for an IGMPv3/MLDv2 router
implementation, consider the XORP project.
The work is based on Wilbert de Graaf's IGMPv3 code drop for FreeBSD 4.6,
which is available at: http://www.kloosterhof.com/wilbert/igmpv3.html
Summary
* IPv4 multicast socket processing is now moved out of ip_output.c
into a new module, in_mcast.c.
* The in_mcast.c module implements the IPv4 legacy any-source API in
terms of the protocol-independent source-specific API.
* Source filters are lazy allocated as the common case does not use them.
They are part of per inpcb state and are covered by the inpcb lock.
* struct ip_mreqn is now supported to allow applications to specify
multicast joins by interface index in the legacy IPv4 any-source API.
* In UDP, an incoming multicast datagram only requires that the source
port matches the 4-tuple if the socket was already bound by source port.
An unbound socket SHOULD be able to receive multicasts sent from an
ephemeral source port.
* The UDP socket multicast filter mode defaults to exclusive, that is,
sources present in the per-socket list will be blocked from delivery.
* The RFC 3678 userland functions have been added to libc: setsourcefilter,
getsourcefilter, setipv4sourcefilter, getipv4sourcefilter.
* Definitions for IGMPv3 are merged but not yet used.
* struct sockaddr_storage is now referenced from <netinet/in.h>. It
is therefore defined there if not already declared in the same way
as for the C99 types.
* The RFC 1724 hack (specify 0.0.0.0/8 addresses to IP_MULTICAST_IF
which are then interpreted as interface indexes) is now deprecated.
* A patch for the Rhyolite.com routed in the FreeBSD base system
is available in the -net archives. This only affects individuals
running RIPv1 or RIPv2 via point-to-point and/or unnumbered interfaces.
* Make IPv6 detach path similar to IPv4's in code flow; functionally same.
* Bump __FreeBSD_version to 700048; see UPDATING.
This work was financially supported by another FreeBSD committer.
Obtained from: p4://bms_netdev
Submitted by: Wilbert de Graaf (original work)
Reviewed by: rwatson (locking), silence from fenner,
net@ (but with encouragement)
2007-06-12 16:24:56 +00:00
|
|
|
case IP_ADD_SOURCE_MEMBERSHIP:
|
|
|
|
case IP_DROP_SOURCE_MEMBERSHIP:
|
|
|
|
case IP_BLOCK_SOURCE:
|
|
|
|
case IP_UNBLOCK_SOURCE:
|
|
|
|
case IP_MSFILTER:
|
|
|
|
case MCAST_JOIN_GROUP:
|
|
|
|
case MCAST_LEAVE_GROUP:
|
|
|
|
case MCAST_JOIN_SOURCE_GROUP:
|
|
|
|
case MCAST_LEAVE_SOURCE_GROUP:
|
|
|
|
case MCAST_BLOCK_SOURCE:
|
|
|
|
case MCAST_UNBLOCK_SOURCE:
|
|
|
|
error = inp_setmoptions(inp, sopt);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
1996-02-22 21:32:23 +00:00
|
|
|
case IP_PORTRANGE:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
|
|
|
sizeof optval);
|
|
|
|
if (error)
|
|
|
|
break;
|
1996-02-22 21:32:23 +00:00
|
|
|
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK(inp);
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (optval) {
|
|
|
|
case IP_PORTRANGE_DEFAULT:
|
|
|
|
inp->inp_flags &= ~(INP_LOWPORT);
|
|
|
|
inp->inp_flags &= ~(INP_HIGHPORT);
|
|
|
|
break;
|
1996-02-22 21:32:23 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_PORTRANGE_HIGH:
|
|
|
|
inp->inp_flags &= ~(INP_LOWPORT);
|
|
|
|
inp->inp_flags |= INP_HIGHPORT;
|
|
|
|
break;
|
1996-02-22 21:32:23 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_PORTRANGE_LOW:
|
|
|
|
inp->inp_flags &= ~(INP_HIGHPORT);
|
|
|
|
inp->inp_flags |= INP_LOWPORT;
|
|
|
|
break;
|
1996-02-22 21:32:23 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
default:
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
1996-02-22 21:32:23 +00:00
|
|
|
}
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
1996-05-21 20:47:31 +00:00
|
|
|
break;
|
1996-02-22 21:32:23 +00:00
|
|
|
|
2007-07-03 12:13:45 +00:00
|
|
|
#ifdef IPSEC
|
1999-12-22 19:13:38 +00:00
|
|
|
case IP_IPSEC_POLICY:
|
|
|
|
{
|
|
|
|
caddr_t req;
|
|
|
|
struct mbuf *m;
|
|
|
|
|
|
|
|
if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
|
|
|
|
break;
|
|
|
|
if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
|
|
|
|
break;
|
|
|
|
req = mtod(m, caddr_t);
|
2009-02-08 09:27:07 +00:00
|
|
|
error = ipsec_set_policy(inp, sopt->sopt_name, req,
|
2008-02-02 14:11:31 +00:00
|
|
|
m->m_len, (sopt->sopt_td != NULL) ?
|
|
|
|
sopt->sopt_td->td_ucred : NULL);
|
1999-12-22 19:13:38 +00:00
|
|
|
m_freem(m);
|
|
|
|
break;
|
|
|
|
}
|
2007-07-03 12:13:45 +00:00
|
|
|
#endif /* IPSEC */
|
1999-12-22 19:13:38 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
|
|
|
error = ENOPROTOOPT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
case SOPT_GET:
|
|
|
|
switch (sopt->sopt_name) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_OPTIONS:
|
|
|
|
case IP_RETOPTS:
|
1998-08-23 03:07:17 +00:00
|
|
|
if (inp->inp_options)
|
|
|
|
error = sooptcopyout(sopt,
|
|
|
|
mtod(inp->inp_options,
|
|
|
|
char *),
|
|
|
|
inp->inp_options->m_len);
|
|
|
|
else
|
|
|
|
sopt->sopt_valsize = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case IP_TOS:
|
|
|
|
case IP_TTL:
|
2005-08-22 16:13:08 +00:00
|
|
|
case IP_MINTTL:
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_RECVOPTS:
|
|
|
|
case IP_RECVRETOPTS:
|
|
|
|
case IP_RECVDSTADDR:
|
2003-04-29 21:36:18 +00:00
|
|
|
case IP_RECVTTL:
|
1996-11-11 04:56:32 +00:00
|
|
|
case IP_RECVIF:
|
1998-08-23 03:07:17 +00:00
|
|
|
case IP_PORTRANGE:
|
1999-12-22 19:13:38 +00:00
|
|
|
case IP_FAITH:
|
2003-08-20 14:46:40 +00:00
|
|
|
case IP_ONESBCAST:
|
2005-09-26 20:25:16 +00:00
|
|
|
case IP_DONTFRAG:
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case IP_TOS:
|
1997-04-03 05:14:45 +00:00
|
|
|
optval = inp->inp_ip_tos;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case IP_TTL:
|
1997-04-03 05:14:45 +00:00
|
|
|
optval = inp->inp_ip_ttl;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
2005-08-22 16:13:08 +00:00
|
|
|
case IP_MINTTL:
|
|
|
|
optval = inp->inp_ip_minttl;
|
|
|
|
break;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
|
|
|
|
|
|
|
|
case IP_RECVOPTS:
|
|
|
|
optval = OPTBIT(INP_RECVOPTS);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IP_RECVRETOPTS:
|
|
|
|
optval = OPTBIT(INP_RECVRETOPTS);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IP_RECVDSTADDR:
|
|
|
|
optval = OPTBIT(INP_RECVDSTADDR);
|
|
|
|
break;
|
1996-11-11 04:56:32 +00:00
|
|
|
|
2003-04-29 21:36:18 +00:00
|
|
|
case IP_RECVTTL:
|
|
|
|
optval = OPTBIT(INP_RECVTTL);
|
|
|
|
break;
|
|
|
|
|
1996-11-11 04:56:32 +00:00
|
|
|
case IP_RECVIF:
|
|
|
|
optval = OPTBIT(INP_RECVIF);
|
|
|
|
break;
|
1998-08-23 03:07:17 +00:00
|
|
|
|
|
|
|
case IP_PORTRANGE:
|
|
|
|
if (inp->inp_flags & INP_HIGHPORT)
|
|
|
|
optval = IP_PORTRANGE_HIGH;
|
|
|
|
else if (inp->inp_flags & INP_LOWPORT)
|
|
|
|
optval = IP_PORTRANGE_LOW;
|
|
|
|
else
|
|
|
|
optval = 0;
|
|
|
|
break;
|
1999-12-22 19:13:38 +00:00
|
|
|
|
|
|
|
case IP_FAITH:
|
|
|
|
optval = OPTBIT(INP_FAITH);
|
|
|
|
break;
|
2003-08-20 14:46:40 +00:00
|
|
|
|
|
|
|
case IP_ONESBCAST:
|
|
|
|
optval = OPTBIT(INP_ONESBCAST);
|
|
|
|
break;
|
2005-09-26 20:25:16 +00:00
|
|
|
case IP_DONTFRAG:
|
|
|
|
optval = OPTBIT(INP_DONTFRAG);
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyout(sopt, &optval, sizeof optval);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
Import rewrite of IPv4 socket multicast layer to support source-specific
and protocol-independent host mode multicast. The code is written to
accommodate IPv6, IGMPv3 and MLDv2 with only a little additional work.
This change only pertains to FreeBSD's use as a multicast end-station and
does not concern multicast routing; for an IGMPv3/MLDv2 router
implementation, consider the XORP project.
The work is based on Wilbert de Graaf's IGMPv3 code drop for FreeBSD 4.6,
which is available at: http://www.kloosterhof.com/wilbert/igmpv3.html
Summary
* IPv4 multicast socket processing is now moved out of ip_output.c
into a new module, in_mcast.c.
* The in_mcast.c module implements the IPv4 legacy any-source API in
terms of the protocol-independent source-specific API.
* Source filters are lazy allocated as the common case does not use them.
They are part of per inpcb state and are covered by the inpcb lock.
* struct ip_mreqn is now supported to allow applications to specify
multicast joins by interface index in the legacy IPv4 any-source API.
* In UDP, an incoming multicast datagram only requires that the source
port matches the 4-tuple if the socket was already bound by source port.
An unbound socket SHOULD be able to receive multicasts sent from an
ephemeral source port.
* The UDP socket multicast filter mode defaults to exclusive, that is,
sources present in the per-socket list will be blocked from delivery.
* The RFC 3678 userland functions have been added to libc: setsourcefilter,
getsourcefilter, setipv4sourcefilter, getipv4sourcefilter.
* Definitions for IGMPv3 are merged but not yet used.
* struct sockaddr_storage is now referenced from <netinet/in.h>. It
is therefore defined there if not already declared in the same way
as for the C99 types.
* The RFC 1724 hack (specify 0.0.0.0/8 addresses to IP_MULTICAST_IF
which are then interpreted as interface indexes) is now deprecated.
* A patch for the Rhyolite.com routed in the FreeBSD base system
is available in the -net archives. This only affects individuals
running RIPv1 or RIPv2 via point-to-point and/or unnumbered interfaces.
* Make IPv6 detach path similar to IPv4's in code flow; functionally same.
* Bump __FreeBSD_version to 700048; see UPDATING.
This work was financially supported by another FreeBSD committer.
Obtained from: p4://bms_netdev
Submitted by: Wilbert de Graaf (original work)
Reviewed by: rwatson (locking), silence from fenner,
net@ (but with encouragement)
2007-06-12 16:24:56 +00:00
|
|
|
/*
|
|
|
|
* Multicast socket options are processed by the in_mcast
|
|
|
|
* module.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_MULTICAST_IF:
|
1994-09-06 22:42:31 +00:00
|
|
|
case IP_MULTICAST_VIF:
|
1994-05-24 10:09:53 +00:00
|
|
|
case IP_MULTICAST_TTL:
|
|
|
|
case IP_MULTICAST_LOOP:
|
Import rewrite of IPv4 socket multicast layer to support source-specific
and protocol-independent host mode multicast. The code is written to
accommodate IPv6, IGMPv3 and MLDv2 with only a little additional work.
This change only pertains to FreeBSD's use as a multicast end-station and
does not concern multicast routing; for an IGMPv3/MLDv2 router
implementation, consider the XORP project.
The work is based on Wilbert de Graaf's IGMPv3 code drop for FreeBSD 4.6,
which is available at: http://www.kloosterhof.com/wilbert/igmpv3.html
Summary
* IPv4 multicast socket processing is now moved out of ip_output.c
into a new module, in_mcast.c.
* The in_mcast.c module implements the IPv4 legacy any-source API in
terms of the protocol-independent source-specific API.
* Source filters are lazy allocated as the common case does not use them.
They are part of per inpcb state and are covered by the inpcb lock.
* struct ip_mreqn is now supported to allow applications to specify
multicast joins by interface index in the legacy IPv4 any-source API.
* In UDP, an incoming multicast datagram only requires that the source
port matches the 4-tuple if the socket was already bound by source port.
An unbound socket SHOULD be able to receive multicasts sent from an
ephemeral source port.
* The UDP socket multicast filter mode defaults to exclusive, that is,
sources present in the per-socket list will be blocked from delivery.
* The RFC 3678 userland functions have been added to libc: setsourcefilter,
getsourcefilter, setipv4sourcefilter, getipv4sourcefilter.
* Definitions for IGMPv3 are merged but not yet used.
* struct sockaddr_storage is now referenced from <netinet/in.h>. It
is therefore defined there if not already declared in the same way
as for the C99 types.
* The RFC 1724 hack (specify 0.0.0.0/8 addresses to IP_MULTICAST_IF
which are then interpreted as interface indexes) is now deprecated.
* A patch for the Rhyolite.com routed in the FreeBSD base system
is available in the -net archives. This only affects individuals
running RIPv1 or RIPv2 via point-to-point and/or unnumbered interfaces.
* Make IPv6 detach path similar to IPv4's in code flow; functionally same.
* Bump __FreeBSD_version to 700048; see UPDATING.
This work was financially supported by another FreeBSD committer.
Obtained from: p4://bms_netdev
Submitted by: Wilbert de Graaf (original work)
Reviewed by: rwatson (locking), silence from fenner,
net@ (but with encouragement)
2007-06-12 16:24:56 +00:00
|
|
|
case IP_MSFILTER:
|
|
|
|
error = inp_getmoptions(inp, sopt);
|
1996-02-22 21:32:23 +00:00
|
|
|
break;
|
|
|
|
|
2007-07-03 12:13:45 +00:00
|
|
|
#ifdef IPSEC
|
1999-12-22 19:13:38 +00:00
|
|
|
case IP_IPSEC_POLICY:
|
|
|
|
{
|
2000-03-09 14:57:16 +00:00
|
|
|
struct mbuf *m = NULL;
|
1999-12-22 19:13:38 +00:00
|
|
|
caddr_t req = NULL;
|
2000-07-04 16:35:15 +00:00
|
|
|
size_t len = 0;
|
1999-12-22 19:13:38 +00:00
|
|
|
|
2000-07-04 16:35:15 +00:00
|
|
|
if (m != 0) {
|
1999-12-22 19:13:38 +00:00
|
|
|
req = mtod(m, caddr_t);
|
2000-07-04 16:35:15 +00:00
|
|
|
len = m->m_len;
|
|
|
|
}
|
2009-02-08 09:27:07 +00:00
|
|
|
error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
|
1999-12-22 19:13:38 +00:00
|
|
|
if (error == 0)
|
|
|
|
error = soopt_mcopyout(sopt, m); /* XXX */
|
2000-03-09 14:57:16 +00:00
|
|
|
if (error == 0)
|
|
|
|
m_freem(m);
|
1999-12-22 19:13:38 +00:00
|
|
|
break;
|
|
|
|
}
|
2007-07-03 12:13:45 +00:00
|
|
|
#endif /* IPSEC */
|
1999-12-22 19:13:38 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
|
|
|
error = ENOPROTOOPT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routine called from ip_output() to loop back a copy of an IP multicast
|
|
|
|
* packet to the input queue of a specified interface. Note that this
|
|
|
|
* calls the output routine of the loopback "driver", but with an interface
|
1995-04-26 18:10:58 +00:00
|
|
|
* pointer that might NOT be a loopback interface -- evil, but easier than
|
|
|
|
* replicating that code here.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-05-10 15:58:48 +00:00
|
|
|
ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
|
|
|
|
int hlen)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-11-20 18:56:25 +00:00
|
|
|
register struct ip *ip;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct mbuf *copym;
|
|
|
|
|
2008-08-29 20:42:58 +00:00
|
|
|
/*
|
|
|
|
* Make a deep copy of the packet because we're going to
|
|
|
|
* modify the pack in order to generate checksums.
|
|
|
|
*/
|
|
|
|
copym = m_dup(m, M_DONTWAIT);
|
1997-05-06 21:22:04 +00:00
|
|
|
if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
|
|
|
|
copym = m_pullup(copym, hlen);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (copym != NULL) {
|
2004-04-07 10:01:39 +00:00
|
|
|
/* If needed, compute the checksum and mark it as valid. */
|
|
|
|
if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
|
|
|
|
in_delayed_cksum(copym);
|
|
|
|
copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
|
|
|
|
copym->m_pkthdr.csum_flags |=
|
|
|
|
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
|
|
|
|
copym->m_pkthdr.csum_data = 0xffff;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* We don't bother to fragment if the IP length is greater
|
|
|
|
* than the interface's MTU. Can this possibly matter?
|
|
|
|
*/
|
|
|
|
ip = mtod(copym, struct ip *);
|
2002-02-18 20:35:27 +00:00
|
|
|
ip->ip_len = htons(ip->ip_len);
|
|
|
|
ip->ip_off = htons(ip->ip_off);
|
1994-05-24 10:09:53 +00:00
|
|
|
ip->ip_sum = 0;
|
2002-10-20 22:52:07 +00:00
|
|
|
ip->ip_sum = in_cksum(copym, hlen);
|
1998-06-14 20:58:17 +00:00
|
|
|
#if 1 /* XXX */
|
|
|
|
if (dst->sin_family != AF_INET) {
|
1998-06-15 00:35:47 +00:00
|
|
|
printf("ip_mloopback: bad address family %d\n",
|
1998-06-14 20:58:17 +00:00
|
|
|
dst->sin_family);
|
|
|
|
dst->sin_family = AF_INET;
|
|
|
|
}
|
|
|
|
#endif
|
2000-05-24 21:16:56 +00:00
|
|
|
if_simloop(ifp, copym, dst->sin_family, 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|