2005-01-07 01:45:51 +00:00
|
|
|
/*-
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1990, 1993
|
2008-12-09 10:21:38 +00:00
|
|
|
* The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)in_pcb.h 8.1 (Berkeley) 6/10/93
|
1999-08-28 01:08:13 +00:00
|
|
|
* $FreeBSD$
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
1994-08-21 05:27:42 +00:00
|
|
|
#ifndef _NETINET_IN_PCB_H_
|
|
|
|
#define _NETINET_IN_PCB_H_
|
|
|
|
|
1995-12-05 21:26:34 +00:00
|
|
|
#include <sys/queue.h>
|
2002-09-05 19:48:52 +00:00
|
|
|
#include <sys/_lock.h>
|
|
|
|
#include <sys/_mutex.h>
|
2008-04-17 21:38:18 +00:00
|
|
|
#include <sys/_rwlock.h>
|
1995-12-05 21:26:34 +00:00
|
|
|
|
2001-11-27 17:36:39 +00:00
|
|
|
#include <net/route.h>
|
1999-11-22 02:45:11 +00:00
|
|
|
|
2008-04-17 21:38:18 +00:00
|
|
|
#ifdef _KERNEL
|
|
|
|
#include <sys/rwlock.h>
|
|
|
|
#endif
|
|
|
|
|
1999-11-22 02:45:11 +00:00
|
|
|
#define in6pcb inpcb /* for KAME src sync over BSD*'s */
|
|
|
|
#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */
|
2002-10-16 02:25:05 +00:00
|
|
|
struct inpcbpolicy;
|
1999-11-22 02:45:11 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2008-09-29 13:48:48 +00:00
|
|
|
* struct inpcb is the common protocol control block structure used in most
|
|
|
|
* IP transport protocols.
|
2007-04-30 23:12:05 +00:00
|
|
|
*
|
|
|
|
* Pointers to local and foreign host table entries, local and foreign socket
|
|
|
|
* numbers, and pointers up (to a socket structure) and down (to a
|
|
|
|
* protocol-specific control block) are stored here.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2000-05-26 02:09:24 +00:00
|
|
|
LIST_HEAD(inpcbhead, inpcb);
|
|
|
|
LIST_HEAD(inpcbporthead, inpcbport);
|
1998-05-15 20:11:40 +00:00
|
|
|
typedef u_quad_t inp_gen_t;
|
1995-04-09 01:29:31 +00:00
|
|
|
|
1999-11-22 02:45:11 +00:00
|
|
|
/*
|
|
|
|
* PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
|
2007-04-30 23:12:05 +00:00
|
|
|
* So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
|
|
|
|
* the following structure.
|
1999-11-22 02:45:11 +00:00
|
|
|
*/
|
|
|
|
/* IPv4 address preceded by three 32-bit pad words (see ie46_* users). */
struct in_addr_4in6 {
|
|
|
|
u_int32_t ia46_pad32[3]; /* 3 x 32 bits of padding ahead of the IPv4 address */
|
|
|
|
struct in_addr ia46_addr4; /* the IPv4 address itself */
|
|
|
|
};
|
|
|
|
|
2001-11-22 04:50:44 +00:00
|
|
|
/*
|
2007-04-30 23:12:05 +00:00
|
|
|
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
|
|
|
|
* some extra padding to accomplish this.
|
2001-11-22 04:50:44 +00:00
|
|
|
*/
|
|
|
|
/*
 * Connection endpoints: foreign and local ports, plus foreign and local
 * addresses.  Each address is a union, so the padded-IPv4 form and the
 * IPv6 form share the same storage.
 */
struct in_endpoints {
|
|
|
|
u_int16_t ie_fport; /* foreign port */
|
|
|
|
u_int16_t ie_lport; /* local port */
|
|
|
|
/* protocol dependent part, local and foreign addr */
|
|
|
|
union {
|
|
|
|
/* foreign host table entry */
|
|
|
|
struct in_addr_4in6 ie46_foreign; /* padded IPv4 view */
|
|
|
|
struct in6_addr ie6_foreign; /* IPv6 view of the same storage */
|
|
|
|
} ie_dependfaddr;
|
|
|
|
union {
|
|
|
|
/* local host table entry */
|
|
|
|
struct in_addr_4in6 ie46_local; /* padded IPv4 view */
|
|
|
|
struct in6_addr ie6_local; /* IPv6 view of the same storage */
|
|
|
|
} ie_dependladdr;
|
2008-12-09 10:21:38 +00:00
|
|
|
};
|
2001-11-22 04:50:44 +00:00
|
|
|
#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
|
|
|
|
#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
|
|
|
|
#define ie6_faddr ie_dependfaddr.ie6_foreign
|
|
|
|
#define ie6_laddr ie_dependladdr.ie6_local
|
|
|
|
|
|
|
|
/*
|
2007-04-30 23:12:05 +00:00
|
|
|
* XXX The defines for inc_* are hacks and should be changed to direct
|
|
|
|
* references.
|
2001-11-22 04:50:44 +00:00
|
|
|
*/
|
|
|
|
struct in_conninfo {
|
|
|
|
u_int8_t inc_flags;
|
|
|
|
u_int8_t inc_len;
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */
|
2003-11-20 20:07:39 +00:00
|
|
|
/* protocol dependent part */
|
2001-11-22 04:50:44 +00:00
|
|
|
struct in_endpoints inc_ie;
|
|
|
|
};
|
2008-12-17 12:52:34 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Flags for inc_flags.
|
|
|
|
*/
|
|
|
|
#define INC_ISIPV6 0x01
|
|
|
|
|
2001-11-22 04:50:44 +00:00
|
|
|
#define inc_isipv6 inc_flags /* temp compatibility */
|
|
|
|
#define inc_fport inc_ie.ie_fport
|
|
|
|
#define inc_lport inc_ie.ie_lport
|
|
|
|
#define inc_faddr inc_ie.ie_faddr
|
|
|
|
#define inc_laddr inc_ie.ie_laddr
|
|
|
|
#define inc6_faddr inc_ie.ie6_faddr
|
|
|
|
#define inc6_laddr inc_ie.ie6_laddr
|
|
|
|
|
1999-11-22 02:45:11 +00:00
|
|
|
struct icmp6_filter;
|
|
|
|
|
2008-07-08 17:22:59 +00:00
|
|
|
/*-
|
|
|
|
* struct inpcb captures the network layer state for TCP, UDP, and raw IPv4
|
|
|
|
* and IPv6 sockets. In the case of TCP, further per-connection state is
|
|
|
|
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
|
|
|
|
* are static after creation or protected by a per-inpcb rwlock, inp_lock. A
|
|
|
|
* few fields also require the global pcbinfo lock for the inpcb to be held,
|
|
|
|
* when modified, such as the global connection lists and hashes, as well as
|
|
|
|
* binding information (which affects which hash a connection is on). This
|
|
|
|
* model means that connections can be looked up without holding the
|
|
|
|
* per-connection lock, which is important for performance when attempting to
|
|
|
|
* find the connection for a packet given its IP and port tuple. Writing to
|
|
|
|
* these fields requires that write locks be held on both the inpcb and global locks.
|
|
|
|
*
|
|
|
|
* Key:
|
|
|
|
* (c) - Constant after initialization
|
|
|
|
* (i) - Protected by the inpcb lock
|
|
|
|
* (p) - Protected by the pcbinfo lock for the inpcb
|
|
|
|
* (s) - Protected by another subsystem's locks
|
|
|
|
* (x) - Undefined locking
|
|
|
|
*
|
|
|
|
* A few other notes:
|
|
|
|
*
|
|
|
|
* When a read lock is held, stability of the field is guaranteed; to write
|
|
|
|
* to a field, a write lock must generally be held.
|
|
|
|
*
|
|
|
|
* netinet/netinet6-layer code should not assume that the inp_socket pointer
|
|
|
|
* is safe to dereference without inp_lock being held, even for protocols
|
|
|
|
* other than TCP (where the inpcb persists during TIMEWAIT even after the
|
|
|
|
* socket has been freed), or there may be close(2)-related races.
|
|
|
|
*
|
|
|
|
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inpcb {
|
2008-07-08 17:22:59 +00:00
|
|
|
LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */
|
|
|
|
LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */
|
|
|
|
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
|
|
|
|
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
|
2008-12-09 10:21:38 +00:00
|
|
|
struct socket *inp_socket; /* (i) back pointer to socket */
|
2008-10-04 15:06:34 +00:00
|
|
|
struct ucred *inp_cred; /* (c) cache of socket cred */
|
2008-12-09 10:21:38 +00:00
|
|
|
u_int32_t inp_flow; /* (i) IPv6 flow information */
|
2008-07-08 17:22:59 +00:00
|
|
|
int inp_flags; /* (i) generic IP/datagram flags */
|
2009-04-15 22:09:42 +00:00
|
|
|
int inp_flags2; /* (i) generic IP/datagram flags #2*/
|
2008-07-08 17:22:59 +00:00
|
|
|
u_char inp_vflag; /* (i) IP version flag (v4/v6) */
|
|
|
|
u_char inp_ip_ttl; /* (i) time to live proto */
|
|
|
|
u_char inp_ip_p; /* (c) protocol proto */
|
|
|
|
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
|
2009-04-10 06:16:14 +00:00
|
|
|
uint32_t inp_flowid; /* (x) flow id / queue id */
|
Add a reference count to struct inpcb, which may be explicitly
incremented using in_pcbref(), and decremented using in_pcbfree()
or in_pcbrele(). Protocols using only current in_pcballoc() and
in_pcbfree() calls will see the same semantics, but it is now
possible for TCP to call in_pcbref() and in_pcbrele() to prevent
an inpcb from being freed when both tcbinfo and per-inpcb locks
are released. This makes it possible to safely transition from
holding only the inpcb lock to both tcbinfo and inpcb lock
without re-looking up a connection in the input path, timer
path, etc.
Notice that in_pcbrele() does not unlock the connection after
decrementing the refcount, if the connection remains, so that
the caller can continue to use it; in_pcbrele() returns a flag
indicating whether or not the inpcb pointer is still valid, and
in_pcbfree() is now a simple wrapper around in_pcbrele().
MFC after: 1 month
Discussed with: bz, kmacy
Reviewed by: bz, gnn, kmacy
Tested by: kmacy
2008-12-08 20:18:50 +00:00
|
|
|
u_int inp_refcount; /* (i) refcount */
|
2009-04-16 23:02:56 +00:00
|
|
|
void *inp_pspare[2]; /* (x) rtentry / general use */
|
2007-12-07 01:46:13 +00:00
|
|
|
|
|
|
|
/* Local and foreign ports, local and foreign addr. */
|
2008-12-09 10:21:38 +00:00
|
|
|
struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */
|
2007-12-07 01:46:13 +00:00
|
|
|
|
2008-12-09 10:21:38 +00:00
|
|
|
/* MAC and IPSEC policy information. */
|
2008-07-08 17:22:59 +00:00
|
|
|
struct label *inp_label; /* (i) MAC label */
|
|
|
|
struct inpcbpolicy *inp_sp; /* (s) for IPSEC */
|
1999-11-22 02:45:11 +00:00
|
|
|
|
2007-04-30 23:12:05 +00:00
|
|
|
/* Protocol-dependent part; options. */
|
1999-11-22 02:45:11 +00:00
|
|
|
struct {
|
2008-07-08 17:22:59 +00:00
|
|
|
u_char inp4_ip_tos; /* (i) type of service proto */
|
|
|
|
struct mbuf *inp4_options; /* (i) IP options */
|
2008-12-09 10:21:38 +00:00
|
|
|
struct ip_moptions *inp4_moptions; /* (i) IP mcast options */
|
1999-11-22 02:45:11 +00:00
|
|
|
} inp_depend4;
|
|
|
|
struct {
|
2008-07-08 17:22:59 +00:00
|
|
|
/* (i) IP options */
|
1999-11-22 02:45:11 +00:00
|
|
|
struct mbuf *inp6_options;
|
2008-07-08 17:22:59 +00:00
|
|
|
/* (i) IP6 options for outgoing packets */
|
1999-11-22 02:45:11 +00:00
|
|
|
struct ip6_pktopts *inp6_outputopts;
|
2008-07-08 17:22:59 +00:00
|
|
|
/* (i) IP multicast options */
|
1999-11-22 02:45:11 +00:00
|
|
|
struct ip6_moptions *inp6_moptions;
|
2008-07-08 17:22:59 +00:00
|
|
|
/* (i) ICMPv6 code type filter */
|
1999-11-22 02:45:11 +00:00
|
|
|
struct icmp6_filter *inp6_icmp6filt;
|
2008-07-08 17:22:59 +00:00
|
|
|
/* (i) IPV6_CHECKSUM setsockopt */
|
1999-11-22 02:45:11 +00:00
|
|
|
int inp6_cksum;
|
|
|
|
short inp6_hops;
|
|
|
|
} inp_depend6;
|
2008-07-08 17:22:59 +00:00
|
|
|
LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */
|
|
|
|
struct inpcbport *inp_phd; /* (i/p) head of this list */
|
2006-07-18 22:34:27 +00:00
|
|
|
#define inp_zero_size offsetof(struct inpcb, inp_gencnt)
|
2008-12-09 10:21:38 +00:00
|
|
|
inp_gen_t inp_gencnt; /* (c) generation count */
|
2009-04-16 22:47:43 +00:00
|
|
|
struct llentry *inp_lle; /* cached L2 information */
|
|
|
|
struct rtentry *inp_rt; /* cached L3 information */
|
2008-04-17 21:38:18 +00:00
|
|
|
struct rwlock inp_lock;
|
2008-12-09 10:21:38 +00:00
|
|
|
};
|
|
|
|
#define inp_fport inp_inc.inc_fport
|
|
|
|
#define inp_lport inp_inc.inc_lport
|
|
|
|
#define inp_faddr inp_inc.inc_faddr
|
|
|
|
#define inp_laddr inp_inc.inc_laddr
|
|
|
|
#define inp_ip_tos inp_depend4.inp4_ip_tos
|
|
|
|
#define inp_options inp_depend4.inp4_options
|
|
|
|
#define inp_moptions inp_depend4.inp4_moptions
|
2002-06-10 20:05:46 +00:00
|
|
|
|
2001-11-22 04:50:44 +00:00
|
|
|
#define in6p_faddr inp_inc.inc6_faddr
|
|
|
|
#define in6p_laddr inp_inc.inc6_laddr
|
1999-11-22 02:45:11 +00:00
|
|
|
#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
|
|
|
|
#define in6p_flowinfo inp_flow
|
|
|
|
#define in6p_options inp_depend6.inp6_options
|
|
|
|
#define in6p_outputopts inp_depend6.inp6_outputopts
|
|
|
|
#define in6p_moptions inp_depend6.inp6_moptions
|
|
|
|
#define in6p_icmp6filt inp_depend6.inp6_icmp6filt
|
|
|
|
#define in6p_cksum inp_depend6.inp6_cksum
|
2008-12-09 10:21:38 +00:00
|
|
|
|
Permit building kernels with options VIMAGE, restricted to only a single
active network stack instance. Turning on options VIMAGE at compile
time yields the following changes relative to default kernel build:
1) V_ accessor macros for virtualized variables resolve to structure
fields via base pointers, instead of being resolved as fields in global
structs or plain global variables. As an example, V_ifnet becomes:
options VIMAGE: ((struct vnet_net *) vnet_net)->_ifnet
default build: vnet_net_0._ifnet
options VIMAGE_GLOBALS: ifnet
2) INIT_VNET_* macros will declare and set up base pointers to be used
by V_ accessor macros, instead of resolving to whitespace:
INIT_VNET_NET(ifp->if_vnet); becomes
struct vnet_net *vnet_net = (ifp->if_vnet)->mod_data[VNET_MOD_NET];
3) Memory for vnet modules registered via vnet_mod_register() is now
allocated at run time in sys/kern/kern_vimage.c, instead of per vnet
module structs being declared as globals. If required, vnet modules
can now request the framework to provide them with allocated bzeroed
memory by filling in the vmi_size field in their vmi_modinfo structures.
4) structs socket, ifnet, inpcbinfo, tcpcb and syncache_head are
extended to hold a pointer to the parent vnet. options VIMAGE builds
will fill in those fields as required.
5) curvnet is introduced as a new global variable in options VIMAGE
builds, always pointing to the default and only struct vnet.
6) struct sysctl_oid has been extended with additional two fields to
store major and minor virtualization module identifiers, oid_v_subs and
oid_v_mod. SYSCTL_V_* family of macros will fill in those fields
accordingly, and store the offset in the appropriate vnet container
struct in oid_arg1.
In sysctl handlers dealing with virtualized sysctls, the
SYSCTL_RESOLVE_V_ARG1() macro will compute the address of the target
variable and make it available in arg1 variable for further processing.
Unused fields in structs vnet_inet, vnet_inet6 and vnet_ipfw have
been deleted.
Reviewed by: bz, rwatson
Approved by: julian (mentor)
2009-04-30 13:36:26 +00:00
|
|
|
#define inp_vnet inp_pcbinfo->ipi_vnet
|
|
|
|
|
1998-03-24 18:06:34 +00:00
|
|
|
/*
|
2007-04-30 23:12:05 +00:00
|
|
|
* The range of the generation count, as used in this implementation, is 9e19.
|
|
|
|
* We would have to create 300 billion connections per second for this number
|
|
|
|
* to roll over in a year. This seems sufficiently unlikely that we simply
|
|
|
|
* don't concern ourselves with that possibility.
|
1998-03-24 18:06:34 +00:00
|
|
|
*/
|
Improved connection establishment performance by doing local port lookups via
a hashed port list. In the new scheme, in_pcblookup() goes away and is
replaced by a new routine, in_pcblookup_local() for doing the local port
check. Note that this implementation is space inefficient in that the PCB
struct is now too large to fit into 128 bytes. I might deal with this in the
future by using the new zone allocator, but I wanted these changes to be
extensively tested in their current form first.
Also:
1) Fixed off-by-one errors in the port lookup loops in in_pcbbind().
2) Got rid of some unneeded rehashing. Adding a new routine, in_pcbinshash()
to do the initial hash insertion.
3) Renamed in_pcblookuphash() to in_pcblookup_hash() for easier readability.
4) Added a new routine, in_pcbremlists() to remove the PCB from the various
hash lists.
5) Added/deleted comments where appropriate.
6) Removed unnecessary splnet() locking. In general, the PCB functions should
be called at splnet()...there are unfortunately a few exceptions, however.
7) Reorganized a few structs for better cache line behavior.
8) Killed my TCP_ACK_HACK kludge. It may come back in a different form in
the future, however.
These changes have been tested on wcarchive for more than a month. In tests
done here, connection establishment overhead is reduced by more than 50
times, thus getting rid of one of the major networking scalability problems.
Still to do: make tcp_fastimo/tcp_slowtimo scale well for systems with a
large number of connections. tcp_fastimo is easy; tcp_slowtimo is difficult.
WARNING: Anything that knows about inpcb and tcpcb structs will have to be
recompiled; at the very least, this includes netstat(1).
1998-01-27 09:15:13 +00:00
|
|
|
|
1998-05-15 20:11:40 +00:00
|
|
|
/*
|
2007-04-30 23:12:05 +00:00
|
|
|
* Interface exported to userland by various protocols which use inpcbs. Hack
|
|
|
|
* alert -- only define if struct xsocket is in scope.
|
1998-05-15 20:11:40 +00:00
|
|
|
*/
|
|
|
|
#ifdef _SYS_SOCKETVAR_H_
|
|
|
|
/* Per-PCB record in the interface exported to userland. */
struct xinpcb {
|
|
|
|
size_t xi_len; /* length of this structure */
|
|
|
|
struct inpcb xi_inp; /* inpcb, embedded by value */
|
|
|
|
struct xsocket xi_socket; /* xsocket, embedded by value */
|
|
|
|
u_quad_t xi_alignment_hack; /* NOTE(review): presumably pads/aligns the record to 64 bits -- confirm */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Summary record: PCB count and generation counts at one point in time. */
struct xinpgen {
|
|
|
|
size_t xig_len; /* length of this structure */
|
|
|
|
u_int xig_count; /* number of PCBs at this time */
|
|
|
|
inp_gen_t xig_gen; /* generation count at this time */
|
|
|
|
so_gen_t xig_sogen; /* socket generation count at this time */
|
|
|
|
};
|
|
|
|
#endif /* _SYS_SOCKETVAR_H_ */
|
|
|
|
|
Improved connection establishment performance by doing local port lookups via
a hashed port list. In the new scheme, in_pcblookup() goes away and is
replaced by a new routine, in_pcblookup_local() for doing the local port
check. Note that this implementation is space inefficient in that the PCB
struct is now too large to fit into 128 bytes. I might deal with this in the
future by using the new zone allocator, but I wanted these changes to be
extensively tested in their current form first.
Also:
1) Fixed off-by-one errors in the port lookup loops in in_pcbbind().
2) Got rid of some unneeded rehashing. Adding a new routine, in_pcbinshash()
to do the initial hash insertion.
3) Renamed in_pcblookuphash() to in_pcblookup_hash() for easier readability.
4) Added a new routine, in_pcbremlists() to remove the PCB from the various
hash lists.
5) Added/deleted comments where appropriate.
6) Removed unnecessary splnet() locking. In general, the PCB functions should
be called at splnet()...there are unfortunately a few exceptions, however.
7) Reorganized a few structs for better cache line behavior.
8) Killed my TCP_ACK_HACK kludge. It may come back in a different form in
the future, however.
These changes have been tested on wcarchive for more than a month. In tests
done here, connection establishment overhead is reduced by more than 50
times, thus getting rid of one of the major networking scalability problems.
Still to do: make tcp_fastimo/tcp_slowtimo scale well for systems with a
large number of connections. tcp_fastimo is easy; tcp_slowtimo is difficult.
WARNING: Anything that knows about inpcb and tcpcb structs will have to be
recompiled; at the very least, this includes netstat(1).
1998-01-27 09:15:13 +00:00
|
|
|
struct inpcbport {
|
2000-05-26 02:09:24 +00:00
|
|
|
LIST_ENTRY(inpcbport) phd_hash;
|
Improved connection establishment performance by doing local port lookups via
a hashed port list. In the new scheme, in_pcblookup() goes away and is
replaced by a new routine, in_pcblookup_local() for doing the local port
check. Note that this implementation is space inefficient in that the PCB
struct is now too large to fit into 128 bytes. I might deal with this in the
future by using the new zone allocator, but I wanted these changes to be
extensively tested in their current form first.
Also:
1) Fixed off-by-one errors in the port lookup loops in in_pcbbind().
2) Got rid of some unneeded rehashing. Adding a new routine, in_pcbinshash()
to do the initial hash insertion.
3) Renamed in_pcblookuphash() to in_pcblookup_hash() for easier readability.
4) Added a new routine, in_pcbremlists() to remove the PCB from the various
hash lists.
5) Added/deleted comments where appropriate.
6) Removed unnecessary splnet() locking. In general, the PCB functions should
be called at splnet()...there are unfortunately a few exceptions, however.
7) Reorganized a few structs for better cache line behavior.
8) Killed my TCP_ACK_HACK kludge. It may come back in a different form in
the future, however.
These changes have been tested on wcarchive for more than a month. In tests
done here, connection establishment overhead is reduced by more than 50
times, thus getting rid of one of the major networking scalability problems.
Still to do: make tcp_fastimo/tcp_slowtimo scale well for systems with a
large number of connections. tcp_fastimo is easy; tcp_slowtimo is difficult.
WARNING: Anything that knows about inpcb and tcpcb structs will have to be
recompiled; at the very least, this includes netstat(1).
1998-01-27 09:15:13 +00:00
|
|
|
struct inpcbhead phd_pcblist;
|
|
|
|
u_short phd_port;
|
1994-05-24 10:09:53 +00:00
|
|
|
};
|
|
|
|
|
2007-04-30 23:12:05 +00:00
|
|
|
/*
|
|
|
|
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
|
|
|
|
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
|
|
|
|
*/
|
|
|
|
struct inpcbinfo {
|
|
|
|
/*
|
|
|
|
* Global list of inpcbs on the protocol.
|
|
|
|
*/
|
|
|
|
struct inpcbhead *ipi_listhead;
|
|
|
|
u_int ipi_count;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Global hash of inpcbs, hashed by local and foreign addresses and
|
|
|
|
* port numbers.
|
|
|
|
*/
|
|
|
|
struct inpcbhead *ipi_hashbase;
|
|
|
|
u_long ipi_hashmask;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Global hash of inpcbs, hashed by only local port number.
|
|
|
|
*/
|
|
|
|
struct inpcbporthead *ipi_porthashbase;
|
|
|
|
u_long ipi_porthashmask;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fields associated with port lookup and allocation.
|
|
|
|
*/
|
|
|
|
u_short ipi_lastport;
|
|
|
|
u_short ipi_lastlow;
|
|
|
|
u_short ipi_lasthi;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* UMA zone from which inpcbs are allocated for this protocol.
|
|
|
|
*/
|
|
|
|
struct uma_zone *ipi_zone;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generation count--incremented each time a connection is allocated
|
|
|
|
* or freed.
|
|
|
|
*/
|
|
|
|
u_quad_t ipi_gencnt;
|
2008-04-17 21:38:18 +00:00
|
|
|
struct rwlock ipi_lock;
|
2007-12-07 01:46:13 +00:00
|
|
|
|
|
|
|
/*
|
Permit building kernels with options VIMAGE, restricted to only a single
active network stack instance. Turning on options VIMAGE at compile
time yields the following changes relative to default kernel build:
1) V_ accessor macros for virtualized variables resolve to structure
fields via base pointers, instead of being resolved as fields in global
structs or plain global variables. As an example, V_ifnet becomes:
options VIMAGE: ((struct vnet_net *) vnet_net)->_ifnet
default build: vnet_net_0._ifnet
options VIMAGE_GLOBALS: ifnet
2) INIT_VNET_* macros will declare and set up base pointers to be used
by V_ accessor macros, instead of resolving to whitespace:
INIT_VNET_NET(ifp->if_vnet); becomes
struct vnet_net *vnet_net = (ifp->if_vnet)->mod_data[VNET_MOD_NET];
3) Memory for vnet modules registered via vnet_mod_register() is now
allocated at run time in sys/kern/kern_vimage.c, instead of per vnet
module structs being declared as globals. If required, vnet modules
can now request the framework to provide them with allocated bzeroed
memory by filling in the vmi_size field in their vmi_modinfo structures.
4) structs socket, ifnet, inpcbinfo, tcpcb and syncache_head are
extended to hold a pointer to the parent vnet. options VIMAGE builds
will fill in those fields as required.
5) curvnet is introduced as a new global variable in options VIMAGE
builds, always pointing to the default and only struct vnet.
6) struct sysctl_oid has been extended with additional two fields to
store major and minor virtualization module identifiers, oid_v_subs and
oid_v_mod. SYSCTL_V_* family of macros will fill in those fields
accordingly, and store the offset in the appropriate vnet container
struct in oid_arg1.
In sysctl handlers dealing with virtualized sysctls, the
SYSCTL_RESOLVE_V_ARG1() macro will compute the address of the target
variable and make it available in arg1 variable for further processing.
Unused fields in structs vnet_inet, vnet_inet6 and vnet_ipfw have
been deleted.
Reviewed by: bz, rwatson
Approved by: julian (mentor)
2009-04-30 13:36:26 +00:00
|
|
|
* Pointer to network stack instance
|
|
|
|
*/
|
|
|
|
struct vnet *ipi_vnet;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Two spare pointers for general use.
|
2007-12-07 01:46:13 +00:00
|
|
|
*/
|
2008-08-07 09:06:04 +00:00
|
|
|
void *ipi_pspare[2];
|
1995-04-09 01:29:31 +00:00
|
|
|
};
|
|
|
|
|
2003-11-26 01:40:44 +00:00
|
|
|
/*
 * Initialize the per-inpcb rwlock, inp_lock, via rw_init_flags() with
 * RW_RECURSE | RW_DUPOK.  Only (t) is forwarded, as the second argument
 * of rw_init_flags(); the (d) parameter is accepted but does not appear
 * in the expansion.
 */
#define INP_LOCK_INIT(inp, d, t) \
|
2008-04-17 21:38:18 +00:00
|
|
|
rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
|
|
|
|
/*
 * Per-inpcb lock operations: each macro applies the correspondingly
 * named rwlock(9) call (rw_destroy, rw_rlock, rw_wlock) to
 * (inp)->inp_lock.
 */
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
|
|
|
|
#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock)
|
|
|
|
#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock)
|
Merge last of a series of rwlock conversion changes to UDP, which
completes the move to a fully parallel UDP transmit path by using
global read, rather than write, locking of inpcbinfo in further
semi-connected cases:
- Add macros to allow try-locking of inpcb and inpcbinfo.
- Always acquire an inpcb read lock in udp_output(), which stabilizes the
local inpcb address and port bindings in order to determine what further
locking is required:
- If the inpcb is currently not bound (at all) and we are implicitly
connecting, we require inpcbinfo and inpcb write locks, so drop the
read lock and re-acquire.
- If the inpcb is bound for at least one of the port or address, but an
explicit source or destination is requested, trylock the inpcbinfo
lock, and if that fails, drop the inpcb lock, lock the global lock,
and relock the inpcb lock.
- Otherwise, no further locking is required (common case).
- Update comments.
In practice, this means that the vast majority of consumers of UDP sockets
will not acquire any exclusive locks at the socket or UDP levels of the
network stack. This leads to a marked performance improvement in several
important workloads, including BIND, nsd, and memcached over UDP, as well
as significant improvements in pps microbenchmarks.
The plan is to MFC all of the rwlock changes to RELENG_7 once they have
settled for a few weeks in the tree.
Tested by: ps, kris (older revision), bde
MFC after: 3 weeks
2008-07-15 15:38:47 +00:00
|
|
|
/*
 * Try-lock variants: expand to rw_try_rlock() / rw_try_wlock() on
 * (inp)->inp_lock and evaluate to that call's result.
 */
#define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock)
|
|
|
|
#define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock)
|
2008-04-17 21:38:18 +00:00
|
|
|
/*
 * Unlock counterparts: rw_runlock() / rw_wunlock() applied to
 * (inp)->inp_lock.
 */
#define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock)
|
|
|
|
#define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock)
|
2009-04-15 21:39:56 +00:00
|
|
|
/*
 * INP_TRY_UPGRADE and INP_DOWNGRADE map to rw_try_upgrade() and
 * rw_downgrade() on (inp)->inp_lock.  INP_WLOCKED evaluates to the
 * result of rw_wowned().  INP_LOCK_ASSERT checks the RA_LOCKED
 * condition through rw_assert().
 */
#define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock)
|
|
|
|
#define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock)
|
|
|
|
#define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock)
|
|
|
|
#define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED)
|
2008-04-17 21:38:18 +00:00
|
|
|
/*
 * Assertions on the state of (inp)->inp_lock, expressed with
 * rw_assert() and the RA_RLOCKED, RA_WLOCKED and RA_UNLOCKED
 * conditions respectively.
 */
#define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED)
|
|
|
|
#define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED)
|
|
|
|
#define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
|
2002-06-10 20:05:46 +00:00
|
|
|
|
2008-03-24 20:24:04 +00:00
|
|
|
#ifdef _KERNEL
|
2008-03-23 22:34:16 +00:00
|
|
|
/*
|
2008-08-07 09:06:04 +00:00
|
|
|
* These locking functions are for inpcb consumers outside of sys/netinet,
|
2008-03-23 22:34:16 +00:00
|
|
|
* more specifically, they were added for the benefit of TOE drivers. The
|
|
|
|
* macros are reserved for use by the stack.
|
|
|
|
*/
|
|
|
|
void inp_wlock(struct inpcb *);
|
|
|
|
void inp_wunlock(struct inpcb *);
|
|
|
|
void inp_rlock(struct inpcb *);
|
|
|
|
void inp_runlock(struct inpcb *);
|
|
|
|
|
|
|
|
#ifdef INVARIANTS
|
2008-03-24 20:24:04 +00:00
|
|
|
void inp_lock_assert(struct inpcb *);
|
|
|
|
void inp_unlock_assert(struct inpcb *);
|
2008-03-23 22:34:16 +00:00
|
|
|
#else
|
|
|
|
/*
 * INVARIANTS is not defined: inp_lock_assert() is an empty inline stub
 * and its argument is unused.
 */
static __inline void
|
2008-03-24 20:24:04 +00:00
|
|
|
inp_lock_assert(struct inpcb *inp __unused)
|
2008-03-23 22:34:16 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * INVARIANTS is not defined: inp_unlock_assert() is an empty inline
 * stub and its argument is unused.
 */
static __inline void
|
2008-03-24 20:24:04 +00:00
|
|
|
inp_unlock_assert(struct inpcb *inp __unused)
|
2008-03-23 22:34:16 +00:00
|
|
|
{
|
|
|
|
}
|
2008-03-24 20:24:04 +00:00
|
|
|
|
2008-03-23 22:34:16 +00:00
|
|
|
#endif
|
2008-07-21 00:08:34 +00:00
|
|
|
|
2008-07-21 22:11:39 +00:00
|
|
|
void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
|
|
|
|
int inp_ip_tos_get(const struct inpcb *inp);
|
|
|
|
void inp_ip_tos_set(struct inpcb *inp, int val);
|
|
|
|
struct socket *
|
|
|
|
inp_inpcbtosocket(struct inpcb *inp);
|
|
|
|
struct tcpcb *
|
|
|
|
inp_inpcbtotcpcb(struct inpcb *inp);
|
2008-08-07 09:06:04 +00:00
|
|
|
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
|
2008-07-21 22:11:39 +00:00
|
|
|
uint32_t *faddr, uint16_t *fp);
|
2008-07-21 00:08:34 +00:00
|
|
|
|
2008-03-24 20:24:04 +00:00
|
|
|
#endif /* _KERNEL */
|
|
|
|
|
2002-06-10 20:05:46 +00:00
|
|
|
/*
 * Initialize the inpcbinfo rwlock, ipi_lock, via rw_init_flags() with
 * RW_RECURSE.  (d) is forwarded as the second argument of
 * rw_init_flags().
 */
#define INP_INFO_LOCK_INIT(ipi, d) \
|
2008-04-17 21:38:18 +00:00
|
|
|
rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
|
|
|
|
/*
 * inpcbinfo lock operations: each macro applies the correspondingly
 * named rwlock(9) call (rw_destroy, rw_rlock, rw_wlock) to
 * (ipi)->ipi_lock.
 */
#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock)
|
|
|
|
#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock)
|
|
|
|
#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock)
|
Merge last of a series of rwlock conversion changes to UDP, which
completes the move to a fully parallel UDP transmit path by using
global read, rather than write, locking of inpcbinfo in further
semi-connected cases:
- Add macros to allow try-locking of inpcb and inpcbinfo.
- Always acquire an inpcb read lock in udp_output(), which stabilizes the
local inpcb address and port bindings in order to determine what further
locking is required:
- If the inpcb is currently not bound (at all) and we are implicitly
connecting, we require inpcbinfo and inpcb write locks, so drop the
read lock and re-acquire.
- If the inpcb is bound for at least one of the port or address, but an
explicit source or destination is requested, trylock the inpcbinfo
lock, and if that fails, drop the inpcb lock, lock the global lock,
and relock the inpcb lock.
- Otherwise, no further locking is required (common case).
- Update comments.
In practice, this means that the vast majority of consumers of UDP sockets
will not acquire any exclusive locks at the socket or UDP levels of the
network stack. This leads to a marked performance improvement in several
important workloads, including BIND, nsd, and memcached over UDP, as well
as significant improvements in pps microbenchmarks.
The plan is to MFC all of the rwlock changes to RELENG_7 once they have
settled for a few weeks in the tree.
Tested by: ps, kris (older revision), bde
MFC after: 3 weeks
2008-07-15 15:38:47 +00:00
|
|
|
/*
 * Try-lock variants: expand to rw_try_rlock() / rw_try_wlock() on
 * (ipi)->ipi_lock and evaluate to that call's result.
 */
#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
|
|
|
|
#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
|
2008-04-17 21:38:18 +00:00
|
|
|
/*
 * Unlock and assertion macros for (ipi)->ipi_lock: rw_runlock() and
 * rw_wunlock() release the lock; the *_ASSERT forms call rw_assert()
 * with RA_LOCKED, RA_RLOCKED, RA_WLOCKED and RA_UNLOCKED respectively.
 */
#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
|
|
|
|
#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
|
|
|
|
#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
|
|
|
|
#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
|
|
|
|
#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
|
|
|
|
#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
|
2002-06-10 20:05:46 +00:00
|
|
|
|
1997-03-03 09:23:37 +00:00
|
|
|
/*
 * Hash of a foreign address and a port pair: XORs faddr with
 * (faddr >> 16) and with ntohs(lport ^ fport), then ANDs the result
 * with mask.  faddr is evaluated twice by the expansion, so avoid
 * arguments with side effects.
 */
#define INP_PCBHASH(faddr, lport, fport, mask) \
|
Improved connection establishment performance by doing local port lookups via
a hashed port list. In the new scheme, in_pcblookup() goes away and is
replaced by a new routine, in_pcblookup_local() for doing the local port
check. Note that this implementation is space inefficient in that the PCB
struct is now too large to fit into 128 bytes. I might deal with this in the
future by using the new zone allocator, but I wanted these changes to be
extensively tested in their current form first.
Also:
1) Fixed off-by-one errors in the port lookup loops in in_pcbbind().
2) Got rid of some unneeded rehashing. Adding a new routine, in_pcbinshash()
to do the initial hash insertion.
3) Renamed in_pcblookuphash() to in_pcblookup_hash() for easier readability.
4) Added a new routine, in_pcbremlists() to remove the PCB from the various
hash lists.
5) Added/deleted comments where appropriate.
6) Removed unnecessary splnet() locking. In general, the PCB functions should
be called at splnet()...there are unfortunately a few exceptions, however.
7) Reorganized a few structs for better cache line behavior.
8) Killed my TCP_ACK_HACK kludge. It may come back in a different form in
the future, however.
These changes have been tested on wcarchive for more than a month. In tests
done here, connection establishment overhead is reduced by more than 50
times, thus getting rid of one of the major networking scalability problems.
Still to do: make tcp_fastimo/tcp_slowtimo scale well for systems with a
large number of connections. tcp_fastimo is easy; tcp_slowtimo is difficult.
WARNING: Anything that knows about inpcb and tcpcb structs will have to be
recompiled; at the very least, this includes netstat(1).
1998-01-27 09:15:13 +00:00
|
|
|
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
|
|
|
|
/* Port-only hash: ntohs(lport) ANDed with mask. */
#define INP_PCBPORTHASH(lport, mask) \
|
|
|
|
(ntohs((lport)) & (mask))
|
1997-03-03 09:23:37 +00:00
|
|
|
|
2008-12-09 10:21:38 +00:00
|
|
|
/*
|
2009-04-15 22:09:42 +00:00
|
|
|
* Flags for inp_vflags -- historically version flags only
|
2008-12-09 10:21:38 +00:00
|
|
|
*/
|
|
|
|
#define INP_IPV4 0x1
|
|
|
|
#define INP_IPV6 0x2
|
|
|
|
#define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */
|
|
|
|
|
|
|
|
/*
|
2009-04-15 22:09:42 +00:00
|
|
|
* Flags for inp_flags.
|
2008-12-09 10:21:38 +00:00
|
|
|
*/
|
2009-03-15 09:58:31 +00:00
|
|
|
#define INP_RECVOPTS 0x00000001 /* receive incoming IP options */
|
|
|
|
#define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */
|
|
|
|
#define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */
|
|
|
|
#define INP_HDRINCL 0x00000008 /* user supplies entire IP header */
|
|
|
|
#define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */
|
|
|
|
#define INP_LOWPORT 0x00000020 /* user wants "low" port binding */
|
|
|
|
#define INP_ANONPORT 0x00000040 /* port chosen for user */
|
|
|
|
#define INP_RECVIF 0x00000080 /* receive incoming interface */
|
|
|
|
#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
|
|
|
|
#define INP_FAITH 0x00000200 /* accept FAITH'ed connections */
|
|
|
|
#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
|
|
|
|
#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
|
|
|
|
#define INP_NONLOCALOK 0x00001000 /* Allow bind to spoof any address */
|
2009-01-09 16:02:19 +00:00
|
|
|
/* - requires options IP_NONLOCALBIND */
|
2009-03-15 09:58:31 +00:00
|
|
|
#define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */
|
|
|
|
/* NOTE: bit 0x00004000 is not assigned to any flag in this list. */
#define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */
|
|
|
|
#define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */
|
|
|
|
#define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */
|
|
|
|
#define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */
|
|
|
|
#define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */
|
|
|
|
#define IN6P_RTHDR 0x00100000 /* receive routing header */
|
|
|
|
#define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */
|
|
|
|
#define IN6P_TCLASS 0x00400000 /* receive traffic class value */
|
|
|
|
#define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */
|
|
|
|
#define INP_TIMEWAIT 0x01000000 /* in TIMEWAIT, ppcb is tcptw */
|
|
|
|
#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */
|
|
|
|
#define INP_DROPPED 0x04000000 /* protocol drop flag */
|
|
|
|
#define INP_SOCKREF 0x08000000 /* strong socket reference */
|
2009-04-10 06:16:14 +00:00
|
|
|
#define INP_SW_FLOWID 0x10000000 /* software generated flow id */
|
|
|
|
#define INP_HW_FLOWID 0x20000000 /* hardware generated flow id */
|
2003-10-24 19:51:49 +00:00
|
|
|
#define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */
|
|
|
|
#define IN6P_MTU 0x80000000 /* receive path MTU */
|
2001-06-11 12:39:29 +00:00
|
|
|
|
1996-11-11 04:56:32 +00:00
|
|
|
/*
 * Bitwise OR of a subset of the inp_flags bits defined above; the
 * members are spelled out on this line and its continuation lines.
 */
#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
|
2003-04-29 21:36:18 +00:00
|
|
|
INP_RECVIF|INP_RECVTTL|\
|
2001-06-11 12:39:29 +00:00
|
|
|
IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
|
|
|
|
IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
|
2003-10-24 18:26:30 +00:00
|
|
|
IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
|
|
|
|
IN6P_MTU)
|
1999-11-22 02:45:11 +00:00
|
|
|
|
2009-04-15 22:09:42 +00:00
|
|
|
/*
|
|
|
|
* Flags for inp_flags2.
|
|
|
|
*/
|
2009-04-15 22:22:00 +00:00
|
|
|
#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
|
|
|
|
#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */
|
2009-04-15 22:09:42 +00:00
|
|
|
|
1999-11-22 02:45:11 +00:00
|
|
|
#define INPLOOKUP_WILDCARD 1
|
1994-05-24 10:09:53 +00:00
|
|
|
/* Fetch a socket's so_pcb pointer, cast to struct inpcb *. */
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
|
1999-11-22 02:45:11 +00:00
|
|
|
#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */
|
|
|
|
|
|
|
|
/*
 * Address family of the protocol domain behind a socket: follows
 * so->so_proto->pr_domain->dom_family.  Both the argument and the whole
 * expansion are parenthesized so the macro stays correct when given a
 * non-trivial pointer expression (e.g. "&socks[i]" or "p + 1") and when
 * used inside a larger expression.
 */
#define INP_SOCKAF(so) ((so)->so_proto->pr_domain->dom_family)
|
|
|
|
|
2004-08-16 18:32:07 +00:00
|
|
|
/*
 * Non-zero when the socket's address family, as obtained through
 * INP_SOCKAF(so), equals af.  af is parenthesized so that an expression
 * argument such as "cond ? AF_A : AF_B" is compared as a single value
 * instead of being split by the precedence of "==".
 */
#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == (af))
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1999-12-29 04:46:21 +00:00
|
|
|
#ifdef _KERNEL
|
2008-12-11 16:26:38 +00:00
|
|
|
#ifdef VIMAGE_GLOBALS
|
2006-03-19 11:48:48 +00:00
|
|
|
extern int ipport_reservedhigh;
|
|
|
|
extern int ipport_reservedlow;
|
1999-11-22 02:45:11 +00:00
|
|
|
extern int ipport_lowfirstauto;
|
|
|
|
extern int ipport_lowlastauto;
|
|
|
|
extern int ipport_firstauto;
|
|
|
|
extern int ipport_lastauto;
|
|
|
|
extern int ipport_hifirstauto;
|
|
|
|
extern int ipport_hilastauto;
|
2008-10-20 18:43:59 +00:00
|
|
|
extern int ipport_randomized;
|
2008-11-19 09:39:34 +00:00
|
|
|
extern int ipport_randomcps;
|
|
|
|
extern int ipport_randomtime;
|
2008-10-20 18:43:59 +00:00
|
|
|
extern int ipport_stoprandom;
|
|
|
|
extern int ipport_tcpallocs;
|
2008-12-11 16:26:38 +00:00
|
|
|
#endif
|
2005-01-02 01:50:57 +00:00
|
|
|
extern struct callout ipport_tick_callout;
|
1999-11-22 02:45:11 +00:00
|
|
|
|
2002-06-10 20:05:46 +00:00
|
|
|
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
|
2006-07-18 22:34:27 +00:00
|
|
|
int in_pcballoc(struct socket *, struct inpcbinfo *);
|
2004-03-27 21:05:46 +00:00
|
|
|
int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
|
2002-10-20 21:44:31 +00:00
|
|
|
int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
|
2004-03-27 21:05:46 +00:00
|
|
|
u_short *, struct ucred *);
|
|
|
|
int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
|
2002-10-21 13:55:50 +00:00
|
|
|
int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
|
|
|
|
u_short *, in_addr_t *, u_short *, struct inpcb **,
|
2004-03-27 21:05:46 +00:00
|
|
|
struct ucred *);
|
2002-03-19 21:25:46 +00:00
|
|
|
void in_pcbdetach(struct inpcb *);
|
|
|
|
void in_pcbdisconnect(struct inpcb *);
|
2006-04-25 11:17:35 +00:00
|
|
|
void in_pcbdrop(struct inpcb *);
|
2006-04-01 16:04:42 +00:00
|
|
|
void in_pcbfree(struct inpcb *);
|
2002-03-19 21:25:46 +00:00
|
|
|
int in_pcbinshash(struct inpcb *);
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inpcb *
|
2002-03-19 21:25:46 +00:00
|
|
|
in_pcblookup_local(struct inpcbinfo *,
|
2008-07-10 13:31:11 +00:00
|
|
|
struct in_addr, u_short, int, struct ucred *);
|
1995-04-09 01:29:31 +00:00
|
|
|
struct inpcb *
|
2002-03-24 10:19:10 +00:00
|
|
|
in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
|
|
|
|
struct in_addr, u_int, int, struct ifnet *);
|
2002-06-10 20:05:46 +00:00
|
|
|
void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
|
2002-06-14 08:35:21 +00:00
|
|
|
int, struct inpcb *(*)(struct inpcb *, int));
|
Add a reference count to struct inpcb, which may be explicitly
incremented using in_pcbref(), and decremented using in_pcbfree()
or in_pcbrele(). Protocols using only current in_pcballoc() and
in_pcbfree() calls will see the same semantics, but it is now
possible for TCP to call in_pcbref() and in_pcbrele() to prevent
an inpcb from being freed when both tcbinfo and per-inpcb locks
are released. This makes it possible to safely transition from
holding only the inpcb lock to both tcbinfo and inpcb lock
without re-looking up a connection in the input path, timer
path, etc.
Notice that in_pcbrele() does not unlock the connection after
decrementing the refcount, if the connection remains, so that
the caller can continue to use it; in_pcbrele() returns a flag
indicating whether or not the inpcb pointer is still valid, and
in_pcbfree() is now a simple wrapper around in_pcbrele().
MFC after: 1 month
Discussed with: bz, kmacy
Reviewed by: bz, gnn, kmacy
Tested by: kmacy
2008-12-08 20:18:50 +00:00
|
|
|
void in_pcbref(struct inpcb *);
|
2002-03-19 21:25:46 +00:00
|
|
|
void in_pcbrehash(struct inpcb *);
|
Add a reference count to struct inpcb, which may be explicitly
incremented using in_pcbref(), and decremented using in_pcbfree()
or in_pcbrele(). Protocols using only current in_pcballoc() and
in_pcbfree() calls will see the same semantics, but it is now
possible for TCP to call in_pcbref() and in_pcbrele() to prevent
an inpcb from being freed when both tcbinfo and per-inpcb locks
are released. This makes it possible to safely transition from
holding only the inpcb lock to both tcbinfo and inpcb lock
without re-looking up a connection in the input path, timer
path, etc.
Notice that in_pcbrele() does not unlock the connection after
decrementing the refcount, if the connection remains, so that
the caller can continue to use it; in_pcbrele() returns a flag
indicating whether or not the inpcb pointer is still valid, and
in_pcbfree() is now a simple wrapper around in_pcbrele().
MFC after: 1 month
Discussed with: bz, kmacy
Reviewed by: bz, gnn, kmacy
Tested by: kmacy
2008-12-08 20:18:50 +00:00
|
|
|
int in_pcbrele(struct inpcb *);
|
Introduce a MAC label reference in 'struct inpcb', which caches
the MAC label referenced from 'struct socket' in the IPv4 and
IPv6-based protocols. This permits MAC labels to be checked during
network delivery operations without dereferencing inp->inp_socket
to get to so->so_label, which will eventually avoid our having to
grab the socket lock during delivery at the network layer.
This change introduces 'struct inpcb' as a labeled object to the
MAC Framework, along with the normal circus of entry points:
initialization, creation from socket, destruction, as well as a
delivery access control check.
For most policies, the inpcb label will simply be a cache of the
socket label, so a new protocol switch method is introduced,
pr_sosetlabel() to notify protocols that the socket layer label
has been updated so that the cache can be updated while holding
appropriate locks. Most protocols implement this using
pru_sosetlabel_null(), but IPv4/IPv6 protocols using inpcbs use
the worker function in_pcbsosetlabel(), which calls into the
MAC Framework to perform a cache update.
Biba, LOMAC, and MLS implement these entry points, as do the stub
policy, and test policy.
Reviewed by: sam, bms
Obtained from: TrustedBSD Project
Sponsored by: DARPA, Network Associates Laboratories
2003-11-18 00:39:07 +00:00
|
|
|
/*
 * NOTE(review): spelled "setsolabel", unlike in_pcbsosetlabel() which is
 * also declared in this header -- confirm that a definition with this
 * exact name exists, or whether this prototype is stale.
 */
void in_pcbsetsolabel(struct socket *so);
|
2007-05-11 10:20:51 +00:00
|
|
|
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
|
|
|
|
int in_getsockaddr(struct socket *so, struct sockaddr **nam);
|
2002-08-21 11:57:12 +00:00
|
|
|
struct sockaddr *
|
|
|
|
in_sockaddr(in_port_t port, struct in_addr *addr);
|
Introduce a MAC label reference in 'struct inpcb', which caches
the MAC label referenced from 'struct socket' in the IPv4 and
IPv6-based protocols. This permits MAC labels to be checked during
network delivery operations without dereferencing inp->inp_socket
to get to so->so_label, which will eventually avoid our having to
grab the socket lock during delivery at the network layer.
This change introduces 'struct inpcb' as a labeled object to the
MAC Framework, along with the normal circus of entry points:
initialization, creation from socket, destruction, as well as a
delivery access control check.
For most policies, the inpcb label will simply be a cache of the
socket label, so a new protocol switch method is introduced,
pr_sosetlabel() to notify protocols that the socket layer label
has been updated so that the cache can be updated while holding
appropriate locks. Most protocols implement this using
pru_sosetlabel_null(), but IPv4/IPv6 protocols using inpcbs use
the worker function in_pcbsosetlabel(), which calls into the
MAC Framework to perform a cache update.
Biba, LOMAC, and MLS implement these entry points, as do the stub
policy, and test policy.
Reviewed by: sam, bms
Obtained from: TrustedBSD Project
Sponsored by: DARPA, Network Associates Laboratories
2003-11-18 00:39:07 +00:00
|
|
|
void in_pcbsosetlabel(struct socket *so);
|
2005-01-02 01:50:57 +00:00
|
|
|
void ipport_tick(void *xtp);
|
1999-12-29 04:46:21 +00:00
|
|
|
#endif /* _KERNEL */
|
1998-03-24 18:06:34 +00:00
|
|
|
|
1998-03-28 10:18:26 +00:00
|
|
|
#endif /* !_NETINET_IN_PCB_H_ */
|